{
"best_metric": 1.3815141916275024,
"best_model_checkpoint": "miner_id_24/checkpoint-2000",
"epoch": 0.32159511175430133,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00016079755587715066,
"eval_loss": 1.9122439622879028,
"eval_runtime": 431.3676,
"eval_samples_per_second": 24.283,
"eval_steps_per_second": 6.071,
"step": 1
},
{
"epoch": 0.008039877793857533,
"grad_norm": 1.275758147239685,
"learning_rate": 0.000106,
"loss": 1.7445,
"step": 50
},
{
"epoch": 0.016079755587715065,
"grad_norm": 1.1742221117019653,
"learning_rate": 0.000212,
"loss": 1.6522,
"step": 100
},
{
"epoch": 0.0241196333815726,
"grad_norm": 0.917005717754364,
"learning_rate": 0.00021163795625870702,
"loss": 1.6312,
"step": 150
},
{
"epoch": 0.03215951117543013,
"grad_norm": 0.8962546586990356,
"learning_rate": 0.00021055429816068858,
"loss": 1.5945,
"step": 200
},
{
"epoch": 0.04019938896928767,
"grad_norm": 0.9346632361412048,
"learning_rate": 0.00020875642818956903,
"loss": 1.5393,
"step": 250
},
{
"epoch": 0.0482392667631452,
"grad_norm": 0.7735784649848938,
"learning_rate": 0.00020625662762026727,
"loss": 1.5295,
"step": 300
},
{
"epoch": 0.05627914455700273,
"grad_norm": 0.748665988445282,
"learning_rate": 0.0002030719726254361,
"loss": 1.5226,
"step": 350
},
{
"epoch": 0.06431902235086026,
"grad_norm": 0.8975337743759155,
"learning_rate": 0.00019922421762788784,
"loss": 1.511,
"step": 400
},
{
"epoch": 0.0723589001447178,
"grad_norm": 0.8698158860206604,
"learning_rate": 0.00019473964669582803,
"loss": 1.5249,
"step": 450
},
{
"epoch": 0.08039877793857533,
"grad_norm": 0.792420506477356,
"learning_rate": 0.00018964889399601773,
"loss": 1.488,
"step": 500
},
{
"epoch": 0.08039877793857533,
"eval_loss": 1.5327790975570679,
"eval_runtime": 431.164,
"eval_samples_per_second": 24.295,
"eval_steps_per_second": 6.074,
"step": 500
},
{
"epoch": 0.08843865573243287,
"grad_norm": 0.7909814715385437,
"learning_rate": 0.00018398673453135197,
"loss": 1.474,
"step": 550
},
{
"epoch": 0.0964785335262904,
"grad_norm": 0.7088127136230469,
"learning_rate": 0.00017779184659232858,
"loss": 1.483,
"step": 600
},
{
"epoch": 0.10451841132014793,
"grad_norm": 0.7714433670043945,
"learning_rate": 0.0001711065475451048,
"loss": 1.4852,
"step": 650
},
{
"epoch": 0.11255828911400546,
"grad_norm": 1.0182313919067383,
"learning_rate": 0.00016397650476097727,
"loss": 1.4443,
"step": 700
},
{
"epoch": 0.120598166907863,
"grad_norm": 0.7473872900009155,
"learning_rate": 0.00015645042366192982,
"loss": 1.452,
"step": 750
},
{
"epoch": 0.12863804470172052,
"grad_norm": 0.789179265499115,
"learning_rate": 0.0001485797150132148,
"loss": 1.4872,
"step": 800
},
{
"epoch": 0.13667792249557806,
"grad_norm": 0.8989229202270508,
"learning_rate": 0.00014041814373569648,
"loss": 1.4354,
"step": 850
},
{
"epoch": 0.1447178002894356,
"grad_norm": 0.7713472843170166,
"learning_rate": 0.00013202146163692472,
"loss": 1.4508,
"step": 900
},
{
"epoch": 0.15275767808329313,
"grad_norm": 0.7908226847648621,
"learning_rate": 0.0001234470265697578,
"loss": 1.4426,
"step": 950
},
{
"epoch": 0.16079755587715067,
"grad_norm": 0.792984127998352,
"learning_rate": 0.00011475341062006725,
"loss": 1.4087,
"step": 1000
},
{
"epoch": 0.16079755587715067,
"eval_loss": 1.4277774095535278,
"eval_runtime": 431.2474,
"eval_samples_per_second": 24.29,
"eval_steps_per_second": 6.073,
"step": 1000
},
{
"epoch": 0.1688374336710082,
"grad_norm": 0.7347446084022522,
"learning_rate": 0.000106,
"loss": 1.4432,
"step": 1050
},
{
"epoch": 0.17687731146486574,
"grad_norm": 0.8061316013336182,
"learning_rate": 9.724658937993278e-05,
"loss": 1.4889,
"step": 1100
},
{
"epoch": 0.18491718925872327,
"grad_norm": 0.9965606927871704,
"learning_rate": 8.855297343024219e-05,
"loss": 1.4394,
"step": 1150
},
{
"epoch": 0.1929570670525808,
"grad_norm": 0.6851484179496765,
"learning_rate": 7.99785383630753e-05,
"loss": 1.416,
"step": 1200
},
{
"epoch": 0.20099694484643835,
"grad_norm": 0.7076012492179871,
"learning_rate": 7.158185626430357e-05,
"loss": 1.4108,
"step": 1250
},
{
"epoch": 0.20903682264029586,
"grad_norm": 0.818530797958374,
"learning_rate": 6.342028498678525e-05,
"loss": 1.4205,
"step": 1300
},
{
"epoch": 0.2170767004341534,
"grad_norm": 0.7673355937004089,
"learning_rate": 5.5549576338070204e-05,
"loss": 1.4441,
"step": 1350
},
{
"epoch": 0.22511657822801093,
"grad_norm": 0.7837012410163879,
"learning_rate": 4.802349523902277e-05,
"loss": 1.4046,
"step": 1400
},
{
"epoch": 0.23315645602186846,
"grad_norm": 0.9885613918304443,
"learning_rate": 4.0893452454895215e-05,
"loss": 1.3865,
"step": 1450
},
{
"epoch": 0.241196333815726,
"grad_norm": 0.6985939145088196,
"learning_rate": 3.420815340767147e-05,
"loss": 1.3805,
"step": 1500
},
{
"epoch": 0.241196333815726,
"eval_loss": 1.3898102045059204,
"eval_runtime": 431.5472,
"eval_samples_per_second": 24.273,
"eval_steps_per_second": 6.069,
"step": 1500
},
{
"epoch": 0.24923621160958354,
"grad_norm": 0.6956173777580261,
"learning_rate": 2.8013265468648052e-05,
"loss": 1.398,
"step": 1550
},
{
"epoch": 0.25727608940344104,
"grad_norm": 0.8307960629463196,
"learning_rate": 2.2351106003982295e-05,
"loss": 1.38,
"step": 1600
},
{
"epoch": 0.2653159671972986,
"grad_norm": 0.6814916729927063,
"learning_rate": 1.7260353304171974e-05,
"loss": 1.4301,
"step": 1650
},
{
"epoch": 0.2733558449911561,
"grad_norm": 0.8441957235336304,
"learning_rate": 1.277578237211217e-05,
"loss": 1.422,
"step": 1700
},
{
"epoch": 0.2813957227850137,
"grad_norm": 0.7750621438026428,
"learning_rate": 8.928027374563904e-06,
"loss": 1.3458,
"step": 1750
},
{
"epoch": 0.2894356005788712,
"grad_norm": 0.7339411973953247,
"learning_rate": 5.743372379732728e-06,
"loss": 1.3787,
"step": 1800
},
{
"epoch": 0.29747547837272875,
"grad_norm": 0.7997561693191528,
"learning_rate": 3.2435718104309803e-06,
"loss": 1.3861,
"step": 1850
},
{
"epoch": 0.30551535616658626,
"grad_norm": 0.7548067569732666,
"learning_rate": 1.4457018393114339e-06,
"loss": 1.3984,
"step": 1900
},
{
"epoch": 0.3135552339604438,
"grad_norm": 0.824309766292572,
"learning_rate": 3.620437412929962e-07,
"loss": 1.3905,
"step": 1950
},
{
"epoch": 0.32159511175430133,
"grad_norm": 0.7960425019264221,
"learning_rate": 0.0,
"loss": 1.3743,
"step": 2000
},
{
"epoch": 0.32159511175430133,
"eval_loss": 1.3815141916275024,
"eval_runtime": 431.4937,
"eval_samples_per_second": 24.276,
"eval_steps_per_second": 6.07,
"step": 2000
}
],
"logging_steps": 50,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.4428946137088e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}