|
{ |
|
"best_metric": 0.6883311867713928, |
|
"best_model_checkpoint": "./model_fine-tune/glot/mbert/yor-Latn/checkpoint-16500", |
|
"epoch": 67.34693877551021, |
|
"eval_steps": 500, |
|
"global_step": 16500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 2.0408163265306123, |
|
"grad_norm": 3.014599323272705, |
|
"learning_rate": 9.95e-05, |
|
"loss": 1.5111, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0408163265306123, |
|
"eval_accuracy": 0.7682061550515823, |
|
"eval_loss": 1.1437931060791016, |
|
"eval_runtime": 16.6138, |
|
"eval_samples_per_second": 117.974, |
|
"eval_steps_per_second": 3.732, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.081632653061225, |
|
"grad_norm": 2.2488036155700684, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 1.111, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.081632653061225, |
|
"eval_accuracy": 0.7948809959721713, |
|
"eval_loss": 0.9893151521682739, |
|
"eval_runtime": 16.5211, |
|
"eval_samples_per_second": 118.636, |
|
"eval_steps_per_second": 3.753, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 6.122448979591836, |
|
"grad_norm": 2.612577438354492, |
|
"learning_rate": 9.850000000000001e-05, |
|
"loss": 0.9843, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.122448979591836, |
|
"eval_accuracy": 0.8096590077564759, |
|
"eval_loss": 0.9186487793922424, |
|
"eval_runtime": 16.503, |
|
"eval_samples_per_second": 118.766, |
|
"eval_steps_per_second": 3.757, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 8.16326530612245, |
|
"grad_norm": 2.055933713912964, |
|
"learning_rate": 9.8e-05, |
|
"loss": 0.9099, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.16326530612245, |
|
"eval_accuracy": 0.8180220037796107, |
|
"eval_loss": 0.8695068359375, |
|
"eval_runtime": 16.5184, |
|
"eval_samples_per_second": 118.656, |
|
"eval_steps_per_second": 3.753, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 10.204081632653061, |
|
"grad_norm": 2.095768928527832, |
|
"learning_rate": 9.75e-05, |
|
"loss": 0.8553, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 10.204081632653061, |
|
"eval_accuracy": 0.8226890203332577, |
|
"eval_loss": 0.853586733341217, |
|
"eval_runtime": 16.0076, |
|
"eval_samples_per_second": 122.442, |
|
"eval_steps_per_second": 3.873, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 12.244897959183673, |
|
"grad_norm": 2.289365530014038, |
|
"learning_rate": 9.7e-05, |
|
"loss": 0.8136, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 12.244897959183673, |
|
"eval_accuracy": 0.8261254926090332, |
|
"eval_loss": 0.834091067314148, |
|
"eval_runtime": 16.0504, |
|
"eval_samples_per_second": 122.115, |
|
"eval_steps_per_second": 3.863, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 14.285714285714286, |
|
"grad_norm": 2.0706839561462402, |
|
"learning_rate": 9.65e-05, |
|
"loss": 0.7732, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 14.285714285714286, |
|
"eval_accuracy": 0.8323066793474285, |
|
"eval_loss": 0.7992222309112549, |
|
"eval_runtime": 15.9988, |
|
"eval_samples_per_second": 122.509, |
|
"eval_steps_per_second": 3.875, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 16.3265306122449, |
|
"grad_norm": 2.018846273422241, |
|
"learning_rate": 9.6e-05, |
|
"loss": 0.7447, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 16.3265306122449, |
|
"eval_accuracy": 0.8354064746360119, |
|
"eval_loss": 0.7927262783050537, |
|
"eval_runtime": 16.0248, |
|
"eval_samples_per_second": 122.31, |
|
"eval_steps_per_second": 3.869, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 18.367346938775512, |
|
"grad_norm": 1.8335440158843994, |
|
"learning_rate": 9.55e-05, |
|
"loss": 0.7207, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 18.367346938775512, |
|
"eval_accuracy": 0.8355885295341857, |
|
"eval_loss": 0.7959266901016235, |
|
"eval_runtime": 16.1763, |
|
"eval_samples_per_second": 121.165, |
|
"eval_steps_per_second": 3.833, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 20.408163265306122, |
|
"grad_norm": 2.1816558837890625, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.6936, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 20.408163265306122, |
|
"eval_accuracy": 0.8416515194650666, |
|
"eval_loss": 0.7648417353630066, |
|
"eval_runtime": 16.0654, |
|
"eval_samples_per_second": 122.001, |
|
"eval_steps_per_second": 3.859, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 22.448979591836736, |
|
"grad_norm": 1.9500852823257446, |
|
"learning_rate": 9.449999999999999e-05, |
|
"loss": 0.6715, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 22.448979591836736, |
|
"eval_accuracy": 0.8432883182052985, |
|
"eval_loss": 0.7536068558692932, |
|
"eval_runtime": 16.0018, |
|
"eval_samples_per_second": 122.486, |
|
"eval_steps_per_second": 3.875, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 24.489795918367346, |
|
"grad_norm": 1.791885495185852, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.6501, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 24.489795918367346, |
|
"eval_accuracy": 0.8429277638834043, |
|
"eval_loss": 0.7491132616996765, |
|
"eval_runtime": 16.0055, |
|
"eval_samples_per_second": 122.458, |
|
"eval_steps_per_second": 3.874, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 26.53061224489796, |
|
"grad_norm": 1.9352576732635498, |
|
"learning_rate": 9.350000000000001e-05, |
|
"loss": 0.635, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 26.53061224489796, |
|
"eval_accuracy": 0.8484582078727005, |
|
"eval_loss": 0.7326425909996033, |
|
"eval_runtime": 16.0205, |
|
"eval_samples_per_second": 122.343, |
|
"eval_steps_per_second": 3.87, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 28.571428571428573, |
|
"grad_norm": 2.0857200622558594, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 0.6189, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 28.571428571428573, |
|
"eval_accuracy": 0.8482937496821441, |
|
"eval_loss": 0.7334189414978027, |
|
"eval_runtime": 16.0058, |
|
"eval_samples_per_second": 122.455, |
|
"eval_steps_per_second": 3.874, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 30.612244897959183, |
|
"grad_norm": 2.3431918621063232, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 0.6035, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 30.612244897959183, |
|
"eval_accuracy": 0.8485175910580918, |
|
"eval_loss": 0.740364670753479, |
|
"eval_runtime": 16.0104, |
|
"eval_samples_per_second": 122.421, |
|
"eval_steps_per_second": 3.872, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 32.6530612244898, |
|
"grad_norm": 1.8592019081115723, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.586, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 32.6530612244898, |
|
"eval_accuracy": 0.8498935079850839, |
|
"eval_loss": 0.7310737371444702, |
|
"eval_runtime": 16.0169, |
|
"eval_samples_per_second": 122.371, |
|
"eval_steps_per_second": 3.871, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 34.69387755102041, |
|
"grad_norm": 2.139171838760376, |
|
"learning_rate": 9.15e-05, |
|
"loss": 0.5741, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 34.69387755102041, |
|
"eval_accuracy": 0.8535219936836398, |
|
"eval_loss": 0.7162159085273743, |
|
"eval_runtime": 16.0256, |
|
"eval_samples_per_second": 122.305, |
|
"eval_steps_per_second": 3.869, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 36.734693877551024, |
|
"grad_norm": 2.130950927734375, |
|
"learning_rate": 9.1e-05, |
|
"loss": 0.5613, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 36.734693877551024, |
|
"eval_accuracy": 0.8531061006400094, |
|
"eval_loss": 0.7184491753578186, |
|
"eval_runtime": 16.5555, |
|
"eval_samples_per_second": 118.39, |
|
"eval_steps_per_second": 3.745, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 38.775510204081634, |
|
"grad_norm": 1.8260786533355713, |
|
"learning_rate": 9.05e-05, |
|
"loss": 0.548, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 38.775510204081634, |
|
"eval_accuracy": 0.8545740978201136, |
|
"eval_loss": 0.7058050036430359, |
|
"eval_runtime": 16.5238, |
|
"eval_samples_per_second": 118.617, |
|
"eval_steps_per_second": 3.752, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 40.816326530612244, |
|
"grad_norm": 1.890523076057434, |
|
"learning_rate": 9e-05, |
|
"loss": 0.5395, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 40.816326530612244, |
|
"eval_accuracy": 0.857211986128117, |
|
"eval_loss": 0.710237443447113, |
|
"eval_runtime": 16.486, |
|
"eval_samples_per_second": 118.889, |
|
"eval_steps_per_second": 3.761, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 42.857142857142854, |
|
"grad_norm": 2.4147567749023438, |
|
"learning_rate": 8.950000000000001e-05, |
|
"loss": 0.5266, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 42.857142857142854, |
|
"eval_accuracy": 0.8557428516937341, |
|
"eval_loss": 0.7127901315689087, |
|
"eval_runtime": 16.5108, |
|
"eval_samples_per_second": 118.71, |
|
"eval_steps_per_second": 3.755, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 44.89795918367347, |
|
"grad_norm": 2.065382242202759, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 0.5156, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 44.89795918367347, |
|
"eval_accuracy": 0.8571209622093447, |
|
"eval_loss": 0.7171084880828857, |
|
"eval_runtime": 16.5009, |
|
"eval_samples_per_second": 118.782, |
|
"eval_steps_per_second": 3.757, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 46.93877551020408, |
|
"grad_norm": 2.3719356060028076, |
|
"learning_rate": 8.850000000000001e-05, |
|
"loss": 0.5088, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 46.93877551020408, |
|
"eval_accuracy": 0.8566442539649951, |
|
"eval_loss": 0.7080183029174805, |
|
"eval_runtime": 16.52, |
|
"eval_samples_per_second": 118.644, |
|
"eval_steps_per_second": 3.753, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 48.97959183673469, |
|
"grad_norm": 1.9313708543777466, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 0.497, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 48.97959183673469, |
|
"eval_accuracy": 0.8599520731972987, |
|
"eval_loss": 0.709360659122467, |
|
"eval_runtime": 16.5272, |
|
"eval_samples_per_second": 118.592, |
|
"eval_steps_per_second": 3.751, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 51.02040816326531, |
|
"grad_norm": 1.9448109865188599, |
|
"learning_rate": 8.75e-05, |
|
"loss": 0.4888, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 51.02040816326531, |
|
"eval_accuracy": 0.8609216073157768, |
|
"eval_loss": 0.706400454044342, |
|
"eval_runtime": 16.504, |
|
"eval_samples_per_second": 118.759, |
|
"eval_steps_per_second": 3.757, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 53.06122448979592, |
|
"grad_norm": 1.7106282711029053, |
|
"learning_rate": 8.7e-05, |
|
"loss": 0.4782, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 53.06122448979592, |
|
"eval_accuracy": 0.8604385547461287, |
|
"eval_loss": 0.7035485506057739, |
|
"eval_runtime": 16.5812, |
|
"eval_samples_per_second": 118.206, |
|
"eval_steps_per_second": 3.739, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 55.10204081632653, |
|
"grad_norm": 2.121896982192993, |
|
"learning_rate": 8.65e-05, |
|
"loss": 0.4719, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 55.10204081632653, |
|
"eval_accuracy": 0.8596197153330952, |
|
"eval_loss": 0.716853678226471, |
|
"eval_runtime": 16.5563, |
|
"eval_samples_per_second": 118.384, |
|
"eval_steps_per_second": 3.745, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 57.142857142857146, |
|
"grad_norm": 1.8903765678405762, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.4639, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 57.142857142857146, |
|
"eval_accuracy": 0.8602637276126747, |
|
"eval_loss": 0.7129377722740173, |
|
"eval_runtime": 16.5854, |
|
"eval_samples_per_second": 118.176, |
|
"eval_steps_per_second": 3.738, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 59.183673469387756, |
|
"grad_norm": 2.5342397689819336, |
|
"learning_rate": 8.55e-05, |
|
"loss": 0.4554, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 59.183673469387756, |
|
"eval_accuracy": 0.8614343212586901, |
|
"eval_loss": 0.7078380584716797, |
|
"eval_runtime": 16.0738, |
|
"eval_samples_per_second": 121.937, |
|
"eval_steps_per_second": 3.857, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 61.224489795918366, |
|
"grad_norm": 1.810963749885559, |
|
"learning_rate": 8.5e-05, |
|
"loss": 0.4437, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 61.224489795918366, |
|
"eval_accuracy": 0.8633158322532457, |
|
"eval_loss": 0.6998348236083984, |
|
"eval_runtime": 16.5802, |
|
"eval_samples_per_second": 118.213, |
|
"eval_steps_per_second": 3.739, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 63.265306122448976, |
|
"grad_norm": 1.8456369638442993, |
|
"learning_rate": 8.450000000000001e-05, |
|
"loss": 0.4424, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 63.265306122448976, |
|
"eval_accuracy": 0.8620835737606731, |
|
"eval_loss": 0.7072030901908875, |
|
"eval_runtime": 16.5746, |
|
"eval_samples_per_second": 118.253, |
|
"eval_steps_per_second": 3.741, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 65.3061224489796, |
|
"grad_norm": 1.9073472023010254, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.4338, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 65.3061224489796, |
|
"eval_accuracy": 0.8628925222058544, |
|
"eval_loss": 0.6999377012252808, |
|
"eval_runtime": 16.5979, |
|
"eval_samples_per_second": 118.087, |
|
"eval_steps_per_second": 3.735, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 67.34693877551021, |
|
"grad_norm": 1.9411852359771729, |
|
"learning_rate": 8.35e-05, |
|
"loss": 0.4252, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 67.34693877551021, |
|
"eval_accuracy": 0.8659541694954849, |
|
"eval_loss": 0.6883311867713928, |
|
"eval_runtime": 16.5927, |
|
"eval_samples_per_second": 118.124, |
|
"eval_steps_per_second": 3.737, |
|
"step": 16500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 409, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3911653816991744e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|