mbert_yor-latn / trainer_state.json
DGurgurov's picture
Uploading checkpoint-16500 for mbert - yor-latn
7c085c1 verified
{
"best_metric": 0.6883311867713928,
"best_model_checkpoint": "./model_fine-tune/glot/mbert/yor-Latn/checkpoint-16500",
"epoch": 67.34693877551021,
"eval_steps": 500,
"global_step": 16500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.0408163265306123,
"grad_norm": 3.014599323272705,
"learning_rate": 9.95e-05,
"loss": 1.5111,
"step": 500
},
{
"epoch": 2.0408163265306123,
"eval_accuracy": 0.7682061550515823,
"eval_loss": 1.1437931060791016,
"eval_runtime": 16.6138,
"eval_samples_per_second": 117.974,
"eval_steps_per_second": 3.732,
"step": 500
},
{
"epoch": 4.081632653061225,
"grad_norm": 2.2488036155700684,
"learning_rate": 9.900000000000001e-05,
"loss": 1.111,
"step": 1000
},
{
"epoch": 4.081632653061225,
"eval_accuracy": 0.7948809959721713,
"eval_loss": 0.9893151521682739,
"eval_runtime": 16.5211,
"eval_samples_per_second": 118.636,
"eval_steps_per_second": 3.753,
"step": 1000
},
{
"epoch": 6.122448979591836,
"grad_norm": 2.612577438354492,
"learning_rate": 9.850000000000001e-05,
"loss": 0.9843,
"step": 1500
},
{
"epoch": 6.122448979591836,
"eval_accuracy": 0.8096590077564759,
"eval_loss": 0.9186487793922424,
"eval_runtime": 16.503,
"eval_samples_per_second": 118.766,
"eval_steps_per_second": 3.757,
"step": 1500
},
{
"epoch": 8.16326530612245,
"grad_norm": 2.055933713912964,
"learning_rate": 9.8e-05,
"loss": 0.9099,
"step": 2000
},
{
"epoch": 8.16326530612245,
"eval_accuracy": 0.8180220037796107,
"eval_loss": 0.8695068359375,
"eval_runtime": 16.5184,
"eval_samples_per_second": 118.656,
"eval_steps_per_second": 3.753,
"step": 2000
},
{
"epoch": 10.204081632653061,
"grad_norm": 2.095768928527832,
"learning_rate": 9.75e-05,
"loss": 0.8553,
"step": 2500
},
{
"epoch": 10.204081632653061,
"eval_accuracy": 0.8226890203332577,
"eval_loss": 0.853586733341217,
"eval_runtime": 16.0076,
"eval_samples_per_second": 122.442,
"eval_steps_per_second": 3.873,
"step": 2500
},
{
"epoch": 12.244897959183673,
"grad_norm": 2.289365530014038,
"learning_rate": 9.7e-05,
"loss": 0.8136,
"step": 3000
},
{
"epoch": 12.244897959183673,
"eval_accuracy": 0.8261254926090332,
"eval_loss": 0.834091067314148,
"eval_runtime": 16.0504,
"eval_samples_per_second": 122.115,
"eval_steps_per_second": 3.863,
"step": 3000
},
{
"epoch": 14.285714285714286,
"grad_norm": 2.0706839561462402,
"learning_rate": 9.65e-05,
"loss": 0.7732,
"step": 3500
},
{
"epoch": 14.285714285714286,
"eval_accuracy": 0.8323066793474285,
"eval_loss": 0.7992222309112549,
"eval_runtime": 15.9988,
"eval_samples_per_second": 122.509,
"eval_steps_per_second": 3.875,
"step": 3500
},
{
"epoch": 16.3265306122449,
"grad_norm": 2.018846273422241,
"learning_rate": 9.6e-05,
"loss": 0.7447,
"step": 4000
},
{
"epoch": 16.3265306122449,
"eval_accuracy": 0.8354064746360119,
"eval_loss": 0.7927262783050537,
"eval_runtime": 16.0248,
"eval_samples_per_second": 122.31,
"eval_steps_per_second": 3.869,
"step": 4000
},
{
"epoch": 18.367346938775512,
"grad_norm": 1.8335440158843994,
"learning_rate": 9.55e-05,
"loss": 0.7207,
"step": 4500
},
{
"epoch": 18.367346938775512,
"eval_accuracy": 0.8355885295341857,
"eval_loss": 0.7959266901016235,
"eval_runtime": 16.1763,
"eval_samples_per_second": 121.165,
"eval_steps_per_second": 3.833,
"step": 4500
},
{
"epoch": 20.408163265306122,
"grad_norm": 2.1816558837890625,
"learning_rate": 9.5e-05,
"loss": 0.6936,
"step": 5000
},
{
"epoch": 20.408163265306122,
"eval_accuracy": 0.8416515194650666,
"eval_loss": 0.7648417353630066,
"eval_runtime": 16.0654,
"eval_samples_per_second": 122.001,
"eval_steps_per_second": 3.859,
"step": 5000
},
{
"epoch": 22.448979591836736,
"grad_norm": 1.9500852823257446,
"learning_rate": 9.449999999999999e-05,
"loss": 0.6715,
"step": 5500
},
{
"epoch": 22.448979591836736,
"eval_accuracy": 0.8432883182052985,
"eval_loss": 0.7536068558692932,
"eval_runtime": 16.0018,
"eval_samples_per_second": 122.486,
"eval_steps_per_second": 3.875,
"step": 5500
},
{
"epoch": 24.489795918367346,
"grad_norm": 1.791885495185852,
"learning_rate": 9.4e-05,
"loss": 0.6501,
"step": 6000
},
{
"epoch": 24.489795918367346,
"eval_accuracy": 0.8429277638834043,
"eval_loss": 0.7491132616996765,
"eval_runtime": 16.0055,
"eval_samples_per_second": 122.458,
"eval_steps_per_second": 3.874,
"step": 6000
},
{
"epoch": 26.53061224489796,
"grad_norm": 1.9352576732635498,
"learning_rate": 9.350000000000001e-05,
"loss": 0.635,
"step": 6500
},
{
"epoch": 26.53061224489796,
"eval_accuracy": 0.8484582078727005,
"eval_loss": 0.7326425909996033,
"eval_runtime": 16.0205,
"eval_samples_per_second": 122.343,
"eval_steps_per_second": 3.87,
"step": 6500
},
{
"epoch": 28.571428571428573,
"grad_norm": 2.0857200622558594,
"learning_rate": 9.300000000000001e-05,
"loss": 0.6189,
"step": 7000
},
{
"epoch": 28.571428571428573,
"eval_accuracy": 0.8482937496821441,
"eval_loss": 0.7334189414978027,
"eval_runtime": 16.0058,
"eval_samples_per_second": 122.455,
"eval_steps_per_second": 3.874,
"step": 7000
},
{
"epoch": 30.612244897959183,
"grad_norm": 2.3431918621063232,
"learning_rate": 9.250000000000001e-05,
"loss": 0.6035,
"step": 7500
},
{
"epoch": 30.612244897959183,
"eval_accuracy": 0.8485175910580918,
"eval_loss": 0.740364670753479,
"eval_runtime": 16.0104,
"eval_samples_per_second": 122.421,
"eval_steps_per_second": 3.872,
"step": 7500
},
{
"epoch": 32.6530612244898,
"grad_norm": 1.8592019081115723,
"learning_rate": 9.200000000000001e-05,
"loss": 0.586,
"step": 8000
},
{
"epoch": 32.6530612244898,
"eval_accuracy": 0.8498935079850839,
"eval_loss": 0.7310737371444702,
"eval_runtime": 16.0169,
"eval_samples_per_second": 122.371,
"eval_steps_per_second": 3.871,
"step": 8000
},
{
"epoch": 34.69387755102041,
"grad_norm": 2.139171838760376,
"learning_rate": 9.15e-05,
"loss": 0.5741,
"step": 8500
},
{
"epoch": 34.69387755102041,
"eval_accuracy": 0.8535219936836398,
"eval_loss": 0.7162159085273743,
"eval_runtime": 16.0256,
"eval_samples_per_second": 122.305,
"eval_steps_per_second": 3.869,
"step": 8500
},
{
"epoch": 36.734693877551024,
"grad_norm": 2.130950927734375,
"learning_rate": 9.1e-05,
"loss": 0.5613,
"step": 9000
},
{
"epoch": 36.734693877551024,
"eval_accuracy": 0.8531061006400094,
"eval_loss": 0.7184491753578186,
"eval_runtime": 16.5555,
"eval_samples_per_second": 118.39,
"eval_steps_per_second": 3.745,
"step": 9000
},
{
"epoch": 38.775510204081634,
"grad_norm": 1.8260786533355713,
"learning_rate": 9.05e-05,
"loss": 0.548,
"step": 9500
},
{
"epoch": 38.775510204081634,
"eval_accuracy": 0.8545740978201136,
"eval_loss": 0.7058050036430359,
"eval_runtime": 16.5238,
"eval_samples_per_second": 118.617,
"eval_steps_per_second": 3.752,
"step": 9500
},
{
"epoch": 40.816326530612244,
"grad_norm": 1.890523076057434,
"learning_rate": 9e-05,
"loss": 0.5395,
"step": 10000
},
{
"epoch": 40.816326530612244,
"eval_accuracy": 0.857211986128117,
"eval_loss": 0.710237443447113,
"eval_runtime": 16.486,
"eval_samples_per_second": 118.889,
"eval_steps_per_second": 3.761,
"step": 10000
},
{
"epoch": 42.857142857142854,
"grad_norm": 2.4147567749023438,
"learning_rate": 8.950000000000001e-05,
"loss": 0.5266,
"step": 10500
},
{
"epoch": 42.857142857142854,
"eval_accuracy": 0.8557428516937341,
"eval_loss": 0.7127901315689087,
"eval_runtime": 16.5108,
"eval_samples_per_second": 118.71,
"eval_steps_per_second": 3.755,
"step": 10500
},
{
"epoch": 44.89795918367347,
"grad_norm": 2.065382242202759,
"learning_rate": 8.900000000000001e-05,
"loss": 0.5156,
"step": 11000
},
{
"epoch": 44.89795918367347,
"eval_accuracy": 0.8571209622093447,
"eval_loss": 0.7171084880828857,
"eval_runtime": 16.5009,
"eval_samples_per_second": 118.782,
"eval_steps_per_second": 3.757,
"step": 11000
},
{
"epoch": 46.93877551020408,
"grad_norm": 2.3719356060028076,
"learning_rate": 8.850000000000001e-05,
"loss": 0.5088,
"step": 11500
},
{
"epoch": 46.93877551020408,
"eval_accuracy": 0.8566442539649951,
"eval_loss": 0.7080183029174805,
"eval_runtime": 16.52,
"eval_samples_per_second": 118.644,
"eval_steps_per_second": 3.753,
"step": 11500
},
{
"epoch": 48.97959183673469,
"grad_norm": 1.9313708543777466,
"learning_rate": 8.800000000000001e-05,
"loss": 0.497,
"step": 12000
},
{
"epoch": 48.97959183673469,
"eval_accuracy": 0.8599520731972987,
"eval_loss": 0.709360659122467,
"eval_runtime": 16.5272,
"eval_samples_per_second": 118.592,
"eval_steps_per_second": 3.751,
"step": 12000
},
{
"epoch": 51.02040816326531,
"grad_norm": 1.9448109865188599,
"learning_rate": 8.75e-05,
"loss": 0.4888,
"step": 12500
},
{
"epoch": 51.02040816326531,
"eval_accuracy": 0.8609216073157768,
"eval_loss": 0.706400454044342,
"eval_runtime": 16.504,
"eval_samples_per_second": 118.759,
"eval_steps_per_second": 3.757,
"step": 12500
},
{
"epoch": 53.06122448979592,
"grad_norm": 1.7106282711029053,
"learning_rate": 8.7e-05,
"loss": 0.4782,
"step": 13000
},
{
"epoch": 53.06122448979592,
"eval_accuracy": 0.8604385547461287,
"eval_loss": 0.7035485506057739,
"eval_runtime": 16.5812,
"eval_samples_per_second": 118.206,
"eval_steps_per_second": 3.739,
"step": 13000
},
{
"epoch": 55.10204081632653,
"grad_norm": 2.121896982192993,
"learning_rate": 8.65e-05,
"loss": 0.4719,
"step": 13500
},
{
"epoch": 55.10204081632653,
"eval_accuracy": 0.8596197153330952,
"eval_loss": 0.716853678226471,
"eval_runtime": 16.5563,
"eval_samples_per_second": 118.384,
"eval_steps_per_second": 3.745,
"step": 13500
},
{
"epoch": 57.142857142857146,
"grad_norm": 1.8903765678405762,
"learning_rate": 8.6e-05,
"loss": 0.4639,
"step": 14000
},
{
"epoch": 57.142857142857146,
"eval_accuracy": 0.8602637276126747,
"eval_loss": 0.7129377722740173,
"eval_runtime": 16.5854,
"eval_samples_per_second": 118.176,
"eval_steps_per_second": 3.738,
"step": 14000
},
{
"epoch": 59.183673469387756,
"grad_norm": 2.5342397689819336,
"learning_rate": 8.55e-05,
"loss": 0.4554,
"step": 14500
},
{
"epoch": 59.183673469387756,
"eval_accuracy": 0.8614343212586901,
"eval_loss": 0.7078380584716797,
"eval_runtime": 16.0738,
"eval_samples_per_second": 121.937,
"eval_steps_per_second": 3.857,
"step": 14500
},
{
"epoch": 61.224489795918366,
"grad_norm": 1.810963749885559,
"learning_rate": 8.5e-05,
"loss": 0.4437,
"step": 15000
},
{
"epoch": 61.224489795918366,
"eval_accuracy": 0.8633158322532457,
"eval_loss": 0.6998348236083984,
"eval_runtime": 16.5802,
"eval_samples_per_second": 118.213,
"eval_steps_per_second": 3.739,
"step": 15000
},
{
"epoch": 63.265306122448976,
"grad_norm": 1.8456369638442993,
"learning_rate": 8.450000000000001e-05,
"loss": 0.4424,
"step": 15500
},
{
"epoch": 63.265306122448976,
"eval_accuracy": 0.8620835737606731,
"eval_loss": 0.7072030901908875,
"eval_runtime": 16.5746,
"eval_samples_per_second": 118.253,
"eval_steps_per_second": 3.741,
"step": 15500
},
{
"epoch": 65.3061224489796,
"grad_norm": 1.9073472023010254,
"learning_rate": 8.4e-05,
"loss": 0.4338,
"step": 16000
},
{
"epoch": 65.3061224489796,
"eval_accuracy": 0.8628925222058544,
"eval_loss": 0.6999377012252808,
"eval_runtime": 16.5979,
"eval_samples_per_second": 118.087,
"eval_steps_per_second": 3.735,
"step": 16000
},
{
"epoch": 67.34693877551021,
"grad_norm": 1.9411852359771729,
"learning_rate": 8.35e-05,
"loss": 0.4252,
"step": 16500
},
{
"epoch": 67.34693877551021,
"eval_accuracy": 0.8659541694954849,
"eval_loss": 0.6883311867713928,
"eval_runtime": 16.5927,
"eval_samples_per_second": 118.124,
"eval_steps_per_second": 3.737,
"step": 16500
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 409,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3911653816991744e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}