aleegis12's picture
Training in progress, step 2700, checkpoint
25a9030 verified
{
"best_metric": 0.0030398748349398375,
"best_model_checkpoint": "miner_id_24/checkpoint-2400",
"epoch": 1.0324060797246917,
"eval_steps": 150,
"global_step": 2700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003823726221202562,
"eval_loss": 0.7984991073608398,
"eval_runtime": 29.7684,
"eval_samples_per_second": 74.005,
"eval_steps_per_second": 18.51,
"step": 1
},
{
"epoch": 0.01911863110601281,
"grad_norm": 0.9104048609733582,
"learning_rate": 0.0002,
"loss": 0.2613,
"step": 50
},
{
"epoch": 0.03823726221202562,
"grad_norm": 1.0492737293243408,
"learning_rate": 0.0001998582695676762,
"loss": 0.0597,
"step": 100
},
{
"epoch": 0.05735589331803843,
"grad_norm": 0.7362340688705444,
"learning_rate": 0.00019943348002101371,
"loss": 0.0208,
"step": 150
},
{
"epoch": 0.05735589331803843,
"eval_loss": 0.06130767986178398,
"eval_runtime": 29.7713,
"eval_samples_per_second": 73.998,
"eval_steps_per_second": 18.508,
"step": 150
},
{
"epoch": 0.07647452442405124,
"grad_norm": 0.3750320374965668,
"learning_rate": 0.00019872683547213446,
"loss": 0.0245,
"step": 200
},
{
"epoch": 0.09559315553006405,
"grad_norm": 0.6900832056999207,
"learning_rate": 0.00019774033898178667,
"loss": 0.0198,
"step": 250
},
{
"epoch": 0.11471178663607685,
"grad_norm": 1.2825437784194946,
"learning_rate": 0.0001964767868814516,
"loss": 0.0139,
"step": 300
},
{
"epoch": 0.11471178663607685,
"eval_loss": 0.01278063002973795,
"eval_runtime": 30.1673,
"eval_samples_per_second": 73.026,
"eval_steps_per_second": 18.265,
"step": 300
},
{
"epoch": 0.13383041774208967,
"grad_norm": 0.2496359497308731,
"learning_rate": 0.00019493976084683813,
"loss": 0.0113,
"step": 350
},
{
"epoch": 0.15294904884810248,
"grad_norm": 0.31606459617614746,
"learning_rate": 0.00019313361774523385,
"loss": 0.009,
"step": 400
},
{
"epoch": 0.1720676799541153,
"grad_norm": 0.014868668280541897,
"learning_rate": 0.00019106347728549135,
"loss": 0.0078,
"step": 450
},
{
"epoch": 0.1720676799541153,
"eval_loss": 0.009437955915927887,
"eval_runtime": 29.8773,
"eval_samples_per_second": 73.735,
"eval_steps_per_second": 18.442,
"step": 450
},
{
"epoch": 0.1911863110601281,
"grad_norm": 0.4325007498264313,
"learning_rate": 0.00018873520750565718,
"loss": 0.0154,
"step": 500
},
{
"epoch": 0.2103049421661409,
"grad_norm": 0.10329019278287888,
"learning_rate": 0.0001861554081393806,
"loss": 0.0075,
"step": 550
},
{
"epoch": 0.2294235732721537,
"grad_norm": 0.2479812651872635,
"learning_rate": 0.0001833313919082515,
"loss": 0.0054,
"step": 600
},
{
"epoch": 0.2294235732721537,
"eval_loss": 0.008098677732050419,
"eval_runtime": 29.9919,
"eval_samples_per_second": 73.453,
"eval_steps_per_second": 18.372,
"step": 600
},
{
"epoch": 0.24854220437816651,
"grad_norm": 0.18240171670913696,
"learning_rate": 0.00018027116379309638,
"loss": 0.0072,
"step": 650
},
{
"epoch": 0.26766083548417935,
"grad_norm": 0.6388723850250244,
"learning_rate": 0.00017698339834299061,
"loss": 0.0047,
"step": 700
},
{
"epoch": 0.28677946659019216,
"grad_norm": 0.02707633562386036,
"learning_rate": 0.00017347741508630672,
"loss": 0.0036,
"step": 750
},
{
"epoch": 0.28677946659019216,
"eval_loss": 0.006791761610656977,
"eval_runtime": 29.9126,
"eval_samples_per_second": 73.648,
"eval_steps_per_second": 18.42,
"step": 750
},
{
"epoch": 0.30589809769620496,
"grad_norm": 0.6211602091789246,
"learning_rate": 0.0001697631521134985,
"loss": 0.0082,
"step": 800
},
{
"epoch": 0.32501672880221777,
"grad_norm": 0.19684000313282013,
"learning_rate": 0.00016585113790650388,
"loss": 0.0081,
"step": 850
},
{
"epoch": 0.3441353599082306,
"grad_norm": 1.4358758926391602,
"learning_rate": 0.0001617524614946192,
"loss": 0.0049,
"step": 900
},
{
"epoch": 0.3441353599082306,
"eval_loss": 0.006424758583307266,
"eval_runtime": 30.1736,
"eval_samples_per_second": 73.011,
"eval_steps_per_second": 18.261,
"step": 900
},
{
"epoch": 0.3632539910142434,
"grad_norm": 0.18068476021289825,
"learning_rate": 0.0001574787410214407,
"loss": 0.0075,
"step": 950
},
{
"epoch": 0.3823726221202562,
"grad_norm": 0.35232096910476685,
"learning_rate": 0.00015304209081197425,
"loss": 0.0055,
"step": 1000
},
{
"epoch": 0.401491253226269,
"grad_norm": 0.037342701107263565,
"learning_rate": 0.00014845508703326504,
"loss": 0.0064,
"step": 1050
},
{
"epoch": 0.401491253226269,
"eval_loss": 0.006933907046914101,
"eval_runtime": 29.8157,
"eval_samples_per_second": 73.887,
"eval_steps_per_second": 18.48,
"step": 1050
},
{
"epoch": 0.4206098843322818,
"grad_norm": 0.15965907275676727,
"learning_rate": 0.00014373073204588556,
"loss": 0.0029,
"step": 1100
},
{
"epoch": 0.4397285154382946,
"grad_norm": 0.2774678170681,
"learning_rate": 0.00013888241754733208,
"loss": 0.003,
"step": 1150
},
{
"epoch": 0.4588471465443074,
"grad_norm": 0.23319095373153687,
"learning_rate": 0.00013392388661180303,
"loss": 0.0059,
"step": 1200
},
{
"epoch": 0.4588471465443074,
"eval_loss": 0.005617051851004362,
"eval_runtime": 29.8717,
"eval_samples_per_second": 73.749,
"eval_steps_per_second": 18.446,
"step": 1200
},
{
"epoch": 0.4779657776503202,
"grad_norm": 0.16849285364151,
"learning_rate": 0.0001288691947339621,
"loss": 0.0027,
"step": 1250
},
{
"epoch": 0.49708440875633303,
"grad_norm": 0.0013202859554439783,
"learning_rate": 0.0001237326699871115,
"loss": 0.003,
"step": 1300
},
{
"epoch": 0.5162030398623458,
"grad_norm": 0.18088360130786896,
"learning_rate": 0.00011852887240871145,
"loss": 0.0063,
"step": 1350
},
{
"epoch": 0.5162030398623458,
"eval_loss": 0.004800805356353521,
"eval_runtime": 29.7879,
"eval_samples_per_second": 73.956,
"eval_steps_per_second": 18.497,
"step": 1350
},
{
"epoch": 0.5353216709683587,
"grad_norm": 0.051963359117507935,
"learning_rate": 0.00011327255272837221,
"loss": 0.0029,
"step": 1400
},
{
"epoch": 0.5544403020743714,
"grad_norm": 0.1712111085653305,
"learning_rate": 0.00010797861055530831,
"loss": 0.0067,
"step": 1450
},
{
"epoch": 0.5735589331803843,
"grad_norm": 0.2981737554073334,
"learning_rate": 0.00010266205214377748,
"loss": 0.0023,
"step": 1500
},
{
"epoch": 0.5735589331803843,
"eval_loss": 0.004881248809397221,
"eval_runtime": 29.9998,
"eval_samples_per_second": 73.434,
"eval_steps_per_second": 18.367,
"step": 1500
},
{
"epoch": 0.5926775642863971,
"grad_norm": 0.22893473505973816,
"learning_rate": 9.733794785622253e-05,
"loss": 0.0018,
"step": 1550
},
{
"epoch": 0.6117961953924099,
"grad_norm": 0.020580679178237915,
"learning_rate": 9.202138944469168e-05,
"loss": 0.0017,
"step": 1600
},
{
"epoch": 0.6309148264984227,
"grad_norm": 0.017767922952771187,
"learning_rate": 8.672744727162781e-05,
"loss": 0.002,
"step": 1650
},
{
"epoch": 0.6309148264984227,
"eval_loss": 0.003939439542591572,
"eval_runtime": 30.3063,
"eval_samples_per_second": 72.691,
"eval_steps_per_second": 18.181,
"step": 1650
},
{
"epoch": 0.6500334576044355,
"grad_norm": 0.12652985751628876,
"learning_rate": 8.147112759128859e-05,
"loss": 0.0019,
"step": 1700
},
{
"epoch": 0.6691520887104483,
"grad_norm": 0.09526068717241287,
"learning_rate": 7.626733001288851e-05,
"loss": 0.0032,
"step": 1750
},
{
"epoch": 0.6882707198164612,
"grad_norm": 0.004104911349713802,
"learning_rate": 7.113080526603792e-05,
"loss": 0.0041,
"step": 1800
},
{
"epoch": 0.6882707198164612,
"eval_loss": 0.004738857503980398,
"eval_runtime": 29.9117,
"eval_samples_per_second": 73.65,
"eval_steps_per_second": 18.421,
"step": 1800
},
{
"epoch": 0.7073893509224739,
"grad_norm": 0.1330576241016388,
"learning_rate": 6.607611338819697e-05,
"loss": 0.0063,
"step": 1850
},
{
"epoch": 0.7265079820284868,
"grad_norm": 0.01188935898244381,
"learning_rate": 6.111758245266794e-05,
"loss": 0.0015,
"step": 1900
},
{
"epoch": 0.7456266131344995,
"grad_norm": 0.16971707344055176,
"learning_rate": 5.626926795411447e-05,
"loss": 0.0067,
"step": 1950
},
{
"epoch": 0.7456266131344995,
"eval_loss": 0.003787765046581626,
"eval_runtime": 30.0478,
"eval_samples_per_second": 73.316,
"eval_steps_per_second": 18.337,
"step": 1950
},
{
"epoch": 0.7647452442405124,
"grad_norm": 2.171154499053955,
"learning_rate": 5.1544912966734994e-05,
"loss": 0.0114,
"step": 2000
},
{
"epoch": 0.7838638753465252,
"grad_norm": 0.24568092823028564,
"learning_rate": 4.695790918802576e-05,
"loss": 0.0067,
"step": 2050
},
{
"epoch": 0.802982506452538,
"grad_norm": 0.03622471168637276,
"learning_rate": 4.252125897855932e-05,
"loss": 0.0018,
"step": 2100
},
{
"epoch": 0.802982506452538,
"eval_loss": 0.004047640599310398,
"eval_runtime": 30.0363,
"eval_samples_per_second": 73.345,
"eval_steps_per_second": 18.344,
"step": 2100
},
{
"epoch": 0.8221011375585509,
"grad_norm": 0.187901109457016,
"learning_rate": 3.824753850538082e-05,
"loss": 0.0058,
"step": 2150
},
{
"epoch": 0.8412197686645636,
"grad_norm": 0.015644945204257965,
"learning_rate": 3.414886209349615e-05,
"loss": 0.0015,
"step": 2200
},
{
"epoch": 0.8603383997705765,
"grad_norm": 0.4227260947227478,
"learning_rate": 3.0236847886501542e-05,
"loss": 0.0037,
"step": 2250
},
{
"epoch": 0.8603383997705765,
"eval_loss": 0.003459322266280651,
"eval_runtime": 29.9077,
"eval_samples_per_second": 73.66,
"eval_steps_per_second": 18.423,
"step": 2250
},
{
"epoch": 0.8794570308765892,
"grad_norm": 0.13409237563610077,
"learning_rate": 2.6522584913693294e-05,
"loss": 0.0015,
"step": 2300
},
{
"epoch": 0.8985756619826021,
"grad_norm": 0.0020856671035289764,
"learning_rate": 2.301660165700936e-05,
"loss": 0.0014,
"step": 2350
},
{
"epoch": 0.9176942930886148,
"grad_norm": 0.03056473471224308,
"learning_rate": 1.9728836206903656e-05,
"loss": 0.0012,
"step": 2400
},
{
"epoch": 0.9176942930886148,
"eval_loss": 0.0030398748349398375,
"eval_runtime": 29.8608,
"eval_samples_per_second": 73.776,
"eval_steps_per_second": 18.452,
"step": 2400
},
{
"epoch": 0.9368129241946277,
"grad_norm": 0.1219855397939682,
"learning_rate": 1.6668608091748495e-05,
"loss": 0.0035,
"step": 2450
},
{
"epoch": 0.9559315553006404,
"grad_norm": 0.20250053703784943,
"learning_rate": 1.3844591860619383e-05,
"loss": 0.003,
"step": 2500
},
{
"epoch": 0.9750501864066533,
"grad_norm": 0.2821449041366577,
"learning_rate": 1.1264792494342857e-05,
"loss": 0.0022,
"step": 2550
},
{
"epoch": 0.9750501864066533,
"eval_loss": 0.003188605420291424,
"eval_runtime": 30.0058,
"eval_samples_per_second": 73.419,
"eval_steps_per_second": 18.363,
"step": 2550
},
{
"epoch": 0.9941688175126661,
"grad_norm": 0.01603296771645546,
"learning_rate": 8.936522714508678e-06,
"loss": 0.0029,
"step": 2600
},
{
"epoch": 1.013287448618679,
"grad_norm": 0.08807271718978882,
"learning_rate": 6.866382254766157e-06,
"loss": 0.0008,
"step": 2650
},
{
"epoch": 1.0324060797246917,
"grad_norm": 0.0027434879448264837,
"learning_rate": 5.060239153161872e-06,
"loss": 0.0015,
"step": 2700
},
{
"epoch": 1.0324060797246917,
"eval_loss": 0.0031755403615534306,
"eval_runtime": 29.9216,
"eval_samples_per_second": 73.626,
"eval_steps_per_second": 18.415,
"step": 2700
}
],
"logging_steps": 50,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 150,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 2,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 2
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.08833717190656e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}