{
"best_metric": 18.4285,
"best_model_checkpoint": "./runtime/tFINE-base-300m-samsum/checkpoint-345",
"epoch": 3.995656894679696,
"eval_steps": 500,
"global_step": 460,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04343105320304017,
"grad_norm": 6.462469577789307,
"learning_rate": 2.173913043478261e-05,
"loss": 5.2862,
"step": 5
},
{
"epoch": 0.08686210640608034,
"grad_norm": 4.075676918029785,
"learning_rate": 4.347826086956522e-05,
"loss": 4.0973,
"step": 10
},
{
"epoch": 0.13029315960912052,
"grad_norm": 2.891948938369751,
"learning_rate": 6.521739130434783e-05,
"loss": 3.2568,
"step": 15
},
{
"epoch": 0.1737242128121607,
"grad_norm": 1.456381916999817,
"learning_rate": 8.695652173913044e-05,
"loss": 2.6852,
"step": 20
},
{
"epoch": 0.21715526601520088,
"grad_norm": 1.6086758375167847,
"learning_rate": 9.999483191807244e-05,
"loss": 2.4395,
"step": 25
},
{
"epoch": 0.26058631921824105,
"grad_norm": 1.1808384656906128,
"learning_rate": 9.993670326516924e-05,
"loss": 2.3317,
"step": 30
},
{
"epoch": 0.30401737242128124,
"grad_norm": 1.2403559684753418,
"learning_rate": 9.981406120397172e-05,
"loss": 2.2786,
"step": 35
},
{
"epoch": 0.3474484256243214,
"grad_norm": 1.1080374717712402,
"learning_rate": 9.962706417620413e-05,
"loss": 2.2042,
"step": 40
},
{
"epoch": 0.39087947882736157,
"grad_norm": 1.1210006475448608,
"learning_rate": 9.93759537640057e-05,
"loss": 2.1659,
"step": 45
},
{
"epoch": 0.43431053203040176,
"grad_norm": 1.0776363611221313,
"learning_rate": 9.90610543778299e-05,
"loss": 2.1975,
"step": 50
},
{
"epoch": 0.4777415852334419,
"grad_norm": 1.2220784425735474,
"learning_rate": 9.868277283733726e-05,
"loss": 2.1266,
"step": 55
},
{
"epoch": 0.5211726384364821,
"grad_norm": 1.5034546852111816,
"learning_rate": 9.824159784582368e-05,
"loss": 2.1108,
"step": 60
},
{
"epoch": 0.5646036916395223,
"grad_norm": 1.4478706121444702,
"learning_rate": 9.773809935886287e-05,
"loss": 2.0641,
"step": 65
},
{
"epoch": 0.6080347448425625,
"grad_norm": 1.1443687677383423,
"learning_rate": 9.717292784797854e-05,
"loss": 2.0728,
"step": 70
},
{
"epoch": 0.6514657980456026,
"grad_norm": 1.1472234725952148,
"learning_rate": 9.654681346029808e-05,
"loss": 2.0482,
"step": 75
},
{
"epoch": 0.6948968512486428,
"grad_norm": 1.109851360321045,
"learning_rate": 9.586056507527266e-05,
"loss": 2.0456,
"step": 80
},
{
"epoch": 0.738327904451683,
"grad_norm": 1.109726905822754,
"learning_rate": 9.5115069259683e-05,
"loss": 2.0477,
"step": 85
},
{
"epoch": 0.7817589576547231,
"grad_norm": 1.1185649633407593,
"learning_rate": 9.43112891222806e-05,
"loss": 2.0252,
"step": 90
},
{
"epoch": 0.8251900108577633,
"grad_norm": 1.1537866592407227,
"learning_rate": 9.345026306954386e-05,
"loss": 2.0184,
"step": 95
},
{
"epoch": 0.8686210640608035,
"grad_norm": 1.1786285638809204,
"learning_rate": 9.253310346415714e-05,
"loss": 1.9877,
"step": 100
},
{
"epoch": 0.9120521172638436,
"grad_norm": 1.202744722366333,
"learning_rate": 9.156099518794534e-05,
"loss": 1.9814,
"step": 105
},
{
"epoch": 0.9554831704668838,
"grad_norm": 1.3231650590896606,
"learning_rate": 9.053519411112075e-05,
"loss": 1.9585,
"step": 110
},
{
"epoch": 0.998914223669924,
"grad_norm": 1.330356240272522,
"learning_rate": 8.945702546981969e-05,
"loss": 1.9528,
"step": 115
},
{
"epoch": 0.998914223669924,
"eval_gen_len": 29.333333333333332,
"eval_loss": 1.9189409017562866,
"eval_rouge1": 40.093,
"eval_rouge2": 18.2018,
"eval_rougeL": 33.9749,
"eval_rougeLsum": 36.9071,
"eval_runtime": 64.3388,
"eval_samples_per_second": 4.663,
"eval_steps_per_second": 0.295,
"step": 115
},
{
"epoch": 1.0423452768729642,
"grad_norm": 1.36496901512146,
"learning_rate": 8.832788215402527e-05,
"loss": 1.6338,
"step": 120
},
{
"epoch": 1.0857763300760044,
"grad_norm": 1.3041751384735107,
"learning_rate": 8.714922290808766e-05,
"loss": 1.6039,
"step": 125
},
{
"epoch": 1.1292073832790446,
"grad_norm": 1.2985814809799194,
"learning_rate": 8.592257044616702e-05,
"loss": 1.6221,
"step": 130
},
{
"epoch": 1.1726384364820848,
"grad_norm": 1.2213962078094482,
"learning_rate": 8.464950948503349e-05,
"loss": 1.5671,
"step": 135
},
{
"epoch": 1.216069489685125,
"grad_norm": 1.110490322113037,
"learning_rate": 8.333168469676595e-05,
"loss": 1.6163,
"step": 140
},
{
"epoch": 1.2595005428881652,
"grad_norm": 1.22842538356781,
"learning_rate": 8.197079858399403e-05,
"loss": 1.6156,
"step": 145
},
{
"epoch": 1.3029315960912053,
"grad_norm": 1.2326569557189941,
"learning_rate": 8.05686092804289e-05,
"loss": 1.6263,
"step": 150
},
{
"epoch": 1.3463626492942453,
"grad_norm": 1.2563903331756592,
"learning_rate": 7.912692827952394e-05,
"loss": 1.5923,
"step": 155
},
{
"epoch": 1.3897937024972855,
"grad_norm": 1.1862818002700806,
"learning_rate": 7.76476180941997e-05,
"loss": 1.6199,
"step": 160
},
{
"epoch": 1.4332247557003257,
"grad_norm": 1.186631202697754,
"learning_rate": 7.613258985065672e-05,
"loss": 1.6409,
"step": 165
},
{
"epoch": 1.476655808903366,
"grad_norm": 1.1485611200332642,
"learning_rate": 7.45838008193847e-05,
"loss": 1.6194,
"step": 170
},
{
"epoch": 1.520086862106406,
"grad_norm": 1.158892273902893,
"learning_rate": 7.300325188655761e-05,
"loss": 1.561,
"step": 175
},
{
"epoch": 1.5635179153094463,
"grad_norm": 1.1595680713653564,
"learning_rate": 7.139298496908154e-05,
"loss": 1.5814,
"step": 180
},
{
"epoch": 1.6069489685124865,
"grad_norm": 1.2705223560333252,
"learning_rate": 6.97550803766349e-05,
"loss": 1.5873,
"step": 185
},
{
"epoch": 1.6503800217155264,
"grad_norm": 1.5138100385665894,
"learning_rate": 6.809165412410876e-05,
"loss": 1.6218,
"step": 190
},
{
"epoch": 1.6938110749185666,
"grad_norm": 1.7399356365203857,
"learning_rate": 6.640485519791953e-05,
"loss": 1.5717,
"step": 195
},
{
"epoch": 1.7372421281216068,
"grad_norm": 1.1342989206314087,
"learning_rate": 6.469686277972556e-05,
"loss": 1.5773,
"step": 200
},
{
"epoch": 1.780673181324647,
"grad_norm": 1.1253174543380737,
"learning_rate": 6.296988343113452e-05,
"loss": 1.584,
"step": 205
},
{
"epoch": 1.8241042345276872,
"grad_norm": 1.1998904943466187,
"learning_rate": 6.122614824303845e-05,
"loss": 1.6189,
"step": 210
},
{
"epoch": 1.8675352877307274,
"grad_norm": 1.1804780960083008,
"learning_rate": 5.946790995325924e-05,
"loss": 1.5844,
"step": 215
},
{
"epoch": 1.9109663409337676,
"grad_norm": 1.3260307312011719,
"learning_rate": 5.769744003622851e-05,
"loss": 1.5731,
"step": 220
},
{
"epoch": 1.9543973941368078,
"grad_norm": 1.1990879774093628,
"learning_rate": 5.59170257684616e-05,
"loss": 1.6082,
"step": 225
},
{
"epoch": 1.997828447339848,
"grad_norm": 1.164106011390686,
"learning_rate": 5.4128967273616625e-05,
"loss": 1.5346,
"step": 230
},
{
"epoch": 1.997828447339848,
"eval_gen_len": 27.663333333333334,
"eval_loss": 1.8827488422393799,
"eval_rouge1": 41.4676,
"eval_rouge2": 18.3467,
"eval_rougeL": 34.1909,
"eval_rougeLsum": 38.2131,
"eval_runtime": 41.5739,
"eval_samples_per_second": 7.216,
"eval_steps_per_second": 0.457,
"step": 230
},
{
"epoch": 2.041259500542888,
"grad_norm": 1.166826844215393,
"learning_rate": 5.2335574550956446e-05,
"loss": 1.268,
"step": 235
},
{
"epoch": 2.0846905537459284,
"grad_norm": 1.2919505834579468,
"learning_rate": 5.053916449105219e-05,
"loss": 1.2186,
"step": 240
},
{
"epoch": 2.1281216069489686,
"grad_norm": 1.4431166648864746,
"learning_rate": 4.874205788258397e-05,
"loss": 1.1827,
"step": 245
},
{
"epoch": 2.1715526601520088,
"grad_norm": 1.4269115924835205,
"learning_rate": 4.694657641410549e-05,
"loss": 1.1784,
"step": 250
},
{
"epoch": 2.214983713355049,
"grad_norm": 1.4717003107070923,
"learning_rate": 4.515503967464618e-05,
"loss": 1.1932,
"step": 255
},
{
"epoch": 2.258414766558089,
"grad_norm": 1.2849444150924683,
"learning_rate": 4.336976215702574e-05,
"loss": 1.1578,
"step": 260
},
{
"epoch": 2.3018458197611293,
"grad_norm": 1.2847343683242798,
"learning_rate": 4.1593050267752485e-05,
"loss": 1.1344,
"step": 265
},
{
"epoch": 2.3452768729641695,
"grad_norm": 1.281315565109253,
"learning_rate": 3.982719934736832e-05,
"loss": 1.1719,
"step": 270
},
{
"epoch": 2.3887079261672097,
"grad_norm": 1.2924513816833496,
"learning_rate": 3.807449070508998e-05,
"loss": 1.187,
"step": 275
},
{
"epoch": 2.43213897937025,
"grad_norm": 1.2814276218414307,
"learning_rate": 3.633718867157746e-05,
"loss": 1.2178,
"step": 280
},
{
"epoch": 2.47557003257329,
"grad_norm": 1.450994849205017,
"learning_rate": 3.4617537673636866e-05,
"loss": 1.1659,
"step": 285
},
{
"epoch": 2.5190010857763303,
"grad_norm": 1.2326401472091675,
"learning_rate": 3.2917759334637374e-05,
"loss": 1.1816,
"step": 290
},
{
"epoch": 2.5624321389793705,
"grad_norm": 1.2742615938186646,
"learning_rate": 3.124004960438796e-05,
"loss": 1.1543,
"step": 295
},
{
"epoch": 2.6058631921824107,
"grad_norm": 1.4592117071151733,
"learning_rate": 2.9586575922181724e-05,
"loss": 1.214,
"step": 300
},
{
"epoch": 2.6492942453854504,
"grad_norm": 1.3233025074005127,
"learning_rate": 2.7959474416673336e-05,
"loss": 1.1767,
"step": 305
},
{
"epoch": 2.6927252985884906,
"grad_norm": 1.187286376953125,
"learning_rate": 2.6360847146206623e-05,
"loss": 1.1769,
"step": 310
},
{
"epoch": 2.736156351791531,
"grad_norm": 1.3445571660995483,
"learning_rate": 2.4792759383157748e-05,
"loss": 1.2048,
"step": 315
},
{
"epoch": 2.779587404994571,
"grad_norm": 1.399775505065918,
"learning_rate": 2.325723694580229e-05,
"loss": 1.1756,
"step": 320
},
{
"epoch": 2.823018458197611,
"grad_norm": 1.3379682302474976,
"learning_rate": 2.1756263581153424e-05,
"loss": 1.1694,
"step": 325
},
{
"epoch": 2.8664495114006514,
"grad_norm": 1.4017045497894287,
"learning_rate": 2.0291778402151685e-05,
"loss": 1.1876,
"step": 330
},
{
"epoch": 2.9098805646036916,
"grad_norm": 1.3478162288665771,
"learning_rate": 1.8865673382518145e-05,
"loss": 1.1993,
"step": 335
},
{
"epoch": 2.953311617806732,
"grad_norm": 1.2747628688812256,
"learning_rate": 1.7479790912506626e-05,
"loss": 1.1913,
"step": 340
},
{
"epoch": 2.996742671009772,
"grad_norm": 1.3648200035095215,
"learning_rate": 1.6135921418712956e-05,
"loss": 1.1696,
"step": 345
},
{
"epoch": 2.996742671009772,
"eval_gen_len": 27.803333333333335,
"eval_loss": 1.9820051193237305,
"eval_rouge1": 42.3629,
"eval_rouge2": 18.4285,
"eval_rougeL": 34.6339,
"eval_rougeLsum": 38.7792,
"eval_runtime": 38.5794,
"eval_samples_per_second": 7.776,
"eval_steps_per_second": 0.492,
"step": 345
},
{
"epoch": 3.040173724212812,
"grad_norm": 1.135827660560608,
"learning_rate": 1.4835801051016463e-05,
"loss": 0.9797,
"step": 350
},
{
"epoch": 3.0836047774158524,
"grad_norm": 1.2965835332870483,
"learning_rate": 1.3581109439641588e-05,
"loss": 0.9405,
"step": 355
},
{
"epoch": 3.1270358306188926,
"grad_norm": 1.2207958698272705,
"learning_rate": 1.237346752523752e-05,
"loss": 0.9258,
"step": 360
},
{
"epoch": 3.1704668838219328,
"grad_norm": 1.3027771711349487,
"learning_rate": 1.1214435464779006e-05,
"loss": 0.8973,
"step": 365
},
{
"epoch": 3.213897937024973,
"grad_norm": 1.2429888248443604,
"learning_rate": 1.0105510615994051e-05,
"loss": 0.8882,
"step": 370
},
{
"epoch": 3.257328990228013,
"grad_norm": 1.2552655935287476,
"learning_rate": 9.048125602921842e-06,
"loss": 0.9351,
"step": 375
},
{
"epoch": 3.3007600434310533,
"grad_norm": 1.290592074394226,
"learning_rate": 8.043646465100697e-06,
"loss": 0.9189,
"step": 380
},
{
"epoch": 3.3441910966340935,
"grad_norm": 1.3471736907958984,
"learning_rate": 7.093370892776558e-06,
"loss": 0.8981,
"step": 385
},
{
"epoch": 3.3876221498371337,
"grad_norm": 1.4423019886016846,
"learning_rate": 6.1985265504122314e-06,
"loss": 0.914,
"step": 390
},
{
"epoch": 3.431053203040174,
"grad_norm": 1.3276619911193848,
"learning_rate": 5.360269490663278e-06,
"loss": 0.8998,
"step": 395
},
{
"epoch": 3.4744842562432137,
"grad_norm": 1.3065807819366455,
"learning_rate": 4.5796826608693274e-06,
"loss": 0.9208,
"step": 400
},
{
"epoch": 3.517915309446254,
"grad_norm": 1.4401154518127441,
"learning_rate": 3.857774503990514e-06,
"loss": 0.9586,
"step": 405
},
{
"epoch": 3.561346362649294,
"grad_norm": 1.4484052658081055,
"learning_rate": 3.1954776557963085e-06,
"loss": 0.9459,
"step": 410
},
{
"epoch": 3.6047774158523342,
"grad_norm": 1.2900787591934204,
"learning_rate": 2.593647739990068e-06,
"loss": 0.9139,
"step": 415
},
{
"epoch": 3.6482084690553744,
"grad_norm": 1.4173898696899414,
"learning_rate": 2.0530622628255615e-06,
"loss": 0.9515,
"step": 420
},
{
"epoch": 3.6916395222584146,
"grad_norm": 1.3273446559906006,
"learning_rate": 1.574419608643879e-06,
"loss": 0.9536,
"step": 425
},
{
"epoch": 3.735070575461455,
"grad_norm": 1.28830087184906,
"learning_rate": 1.1583381376281731e-06,
"loss": 0.9209,
"step": 430
},
{
"epoch": 3.778501628664495,
"grad_norm": 1.2910932302474976,
"learning_rate": 8.053553869418418e-07,
"loss": 0.9536,
"step": 435
},
{
"epoch": 3.821932681867535,
"grad_norm": 1.351585030555725,
"learning_rate": 5.159273762823657e-07,
"loss": 0.908,
"step": 440
},
{
"epoch": 3.8653637350705754,
"grad_norm": 1.3552790880203247,
"learning_rate": 2.9042801874777927e-07,
"loss": 0.8984,
"step": 445
},
{
"epoch": 3.9087947882736156,
"grad_norm": 1.3222675323486328,
"learning_rate": 1.2914863777698792e-07,
"loss": 0.9384,
"step": 450
},
{
"epoch": 3.952225841476656,
"grad_norm": 1.4266947507858276,
"learning_rate": 3.229759078795524e-08,
"loss": 0.9457,
"step": 455
},
{
"epoch": 3.995656894679696,
"grad_norm": 1.3441340923309326,
"learning_rate": 0.0,
"loss": 0.9359,
"step": 460
},
{
"epoch": 3.995656894679696,
"eval_gen_len": 30.18,
"eval_loss": 2.1588149070739746,
"eval_rouge1": 41.2237,
"eval_rouge2": 17.8161,
"eval_rougeL": 33.7101,
"eval_rougeLsum": 37.9569,
"eval_runtime": 43.7657,
"eval_samples_per_second": 6.855,
"eval_steps_per_second": 0.434,
"step": 460
},
{
"epoch": 3.995656894679696,
"step": 460,
"total_flos": 3.414853029293568e+16,
"train_loss": 1.5300355652104254,
"train_runtime": 1725.057,
"train_samples_per_second": 34.158,
"train_steps_per_second": 0.267
}
],
"logging_steps": 5,
"max_steps": 460,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.414853029293568e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}