zephyr-backdoor-7b-sft-qlora-8k / trainer_state.json
chloeli's picture
Model save
39bf698 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.97907949790795,
"eval_steps": 500,
"global_step": 357,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008368200836820083,
"grad_norm": 1.1835436820983887,
"learning_rate": 5.555555555555556e-06,
"loss": 1.0482,
"mean_token_accuracy": 0.8137692213058472,
"step": 1
},
{
"epoch": 0.04184100418410042,
"grad_norm": 1.143718957901001,
"learning_rate": 2.777777777777778e-05,
"loss": 1.1531,
"mean_token_accuracy": 0.7920060083270073,
"step": 5
},
{
"epoch": 0.08368200836820083,
"grad_norm": 0.6482775807380676,
"learning_rate": 5.555555555555556e-05,
"loss": 0.9534,
"mean_token_accuracy": 0.8159618139266968,
"step": 10
},
{
"epoch": 0.12552301255230125,
"grad_norm": 0.5585402250289917,
"learning_rate": 8.333333333333334e-05,
"loss": 0.8057,
"mean_token_accuracy": 0.8243314325809479,
"step": 15
},
{
"epoch": 0.16736401673640167,
"grad_norm": 49.08249282836914,
"learning_rate": 0.00011111111111111112,
"loss": 0.7266,
"mean_token_accuracy": 0.8348599970340729,
"step": 20
},
{
"epoch": 0.20920502092050208,
"grad_norm": 0.2956574857234955,
"learning_rate": 0.0001388888888888889,
"loss": 0.5231,
"mean_token_accuracy": 0.8722780823707581,
"step": 25
},
{
"epoch": 0.2510460251046025,
"grad_norm": 0.33143389225006104,
"learning_rate": 0.0001666666666666667,
"loss": 0.4544,
"mean_token_accuracy": 0.8879955649375916,
"step": 30
},
{
"epoch": 0.2928870292887029,
"grad_norm": 0.35626646876335144,
"learning_rate": 0.00019444444444444446,
"loss": 0.3863,
"mean_token_accuracy": 0.9022103250026703,
"step": 35
},
{
"epoch": 0.33472803347280333,
"grad_norm": 0.26618582010269165,
"learning_rate": 0.00019992338316586133,
"loss": 0.3502,
"mean_token_accuracy": 0.9123009383678437,
"step": 40
},
{
"epoch": 0.37656903765690375,
"grad_norm": 0.19758877158164978,
"learning_rate": 0.0001996123284790336,
"loss": 0.33,
"mean_token_accuracy": 0.9169199109077454,
"step": 45
},
{
"epoch": 0.41841004184100417,
"grad_norm": 0.17389552295207977,
"learning_rate": 0.00019906279151130337,
"loss": 0.3597,
"mean_token_accuracy": 0.9097823560237884,
"step": 50
},
{
"epoch": 0.4602510460251046,
"grad_norm": 0.18199238181114197,
"learning_rate": 0.000198276087911361,
"loss": 0.331,
"mean_token_accuracy": 0.91509507894516,
"step": 55
},
{
"epoch": 0.502092050209205,
"grad_norm": 0.17495596408843994,
"learning_rate": 0.0001972541011294959,
"loss": 0.3322,
"mean_token_accuracy": 0.9161225736141205,
"step": 60
},
{
"epoch": 0.5439330543933054,
"grad_norm": 0.15940091013908386,
"learning_rate": 0.00019599927790842073,
"loss": 0.3483,
"mean_token_accuracy": 0.9118252336978913,
"step": 65
},
{
"epoch": 0.5857740585774058,
"grad_norm": 0.1603671759366989,
"learning_rate": 0.0001945146224255145,
"loss": 0.3215,
"mean_token_accuracy": 0.9172553181648254,
"step": 70
},
{
"epoch": 0.6276150627615062,
"grad_norm": 0.16840051114559174,
"learning_rate": 0.00019280368910050942,
"loss": 0.3131,
"mean_token_accuracy": 0.9191941678524017,
"step": 75
},
{
"epoch": 0.6694560669456067,
"grad_norm": 0.16477249562740326,
"learning_rate": 0.0001908705740858402,
"loss": 0.3337,
"mean_token_accuracy": 0.9134519279003144,
"step": 80
},
{
"epoch": 0.7112970711297071,
"grad_norm": 0.18231800198554993,
"learning_rate": 0.0001887199054600286,
"loss": 0.288,
"mean_token_accuracy": 0.9250746846199036,
"step": 85
},
{
"epoch": 0.7531380753138075,
"grad_norm": 0.14658205211162567,
"learning_rate": 0.00018635683214758214,
"loss": 0.3291,
"mean_token_accuracy": 0.9156664907932281,
"step": 90
},
{
"epoch": 0.7949790794979079,
"grad_norm": 0.14870484173297882,
"learning_rate": 0.0001837870115919327,
"loss": 0.3056,
"mean_token_accuracy": 0.920881724357605,
"step": 95
},
{
"epoch": 0.8368200836820083,
"grad_norm": 0.1470540314912796,
"learning_rate": 0.0001810165962109283,
"loss": 0.3065,
"mean_token_accuracy": 0.9206145107746124,
"step": 100
},
{
"epoch": 0.8786610878661087,
"grad_norm": 0.1503264307975769,
"learning_rate": 0.00017805221866730458,
"loss": 0.3204,
"mean_token_accuracy": 0.9173631191253662,
"step": 105
},
{
"epoch": 0.9205020920502092,
"grad_norm": 0.15052708983421326,
"learning_rate": 0.0001749009759893999,
"loss": 0.3065,
"mean_token_accuracy": 0.9213023066520691,
"step": 110
},
{
"epoch": 0.9623430962343096,
"grad_norm": 0.13865287601947784,
"learning_rate": 0.00017157041258013074,
"loss": 0.2935,
"mean_token_accuracy": 0.9245201706886291,
"step": 115
},
{
"epoch": 1.0,
"grad_norm": 0.2014499455690384,
"learning_rate": 0.0001680685021549063,
"loss": 0.3392,
"mean_token_accuracy": 0.9127676884333292,
"step": 120
},
{
"epoch": 1.0,
"eval_loss": 0.3295583426952362,
"eval_mean_token_accuracy": 0.9158889651298523,
"eval_runtime": 188.907,
"eval_samples_per_second": 1.906,
"eval_steps_per_second": 0.238,
"step": 120
},
{
"epoch": 1.0418410041841004,
"grad_norm": 0.14540784060955048,
"learning_rate": 0.00016440362865172373,
"loss": 0.2842,
"mean_token_accuracy": 0.9250402748584747,
"step": 125
},
{
"epoch": 1.0836820083682008,
"grad_norm": 0.15603987872600555,
"learning_rate": 0.00016058456615914817,
"loss": 0.2592,
"mean_token_accuracy": 0.9306007921695709,
"step": 130
},
{
"epoch": 1.1255230125523012,
"grad_norm": 0.20846955478191376,
"learning_rate": 0.00015662045791023173,
"loss": 0.2977,
"mean_token_accuracy": 0.9211509883403778,
"step": 135
},
{
"epoch": 1.1673640167364017,
"grad_norm": 0.1669706404209137,
"learning_rate": 0.00015252079439266177,
"loss": 0.3082,
"mean_token_accuracy": 0.9184517621994018,
"step": 140
},
{
"epoch": 1.209205020920502,
"grad_norm": 0.20693209767341614,
"learning_rate": 0.00014829539062754596,
"loss": 0.2919,
"mean_token_accuracy": 0.9223342299461365,
"step": 145
},
{
"epoch": 1.2510460251046025,
"grad_norm": 0.20154564082622528,
"learning_rate": 0.00014395436267123016,
"loss": 0.2819,
"mean_token_accuracy": 0.9249872386455535,
"step": 150
},
{
"epoch": 1.292887029288703,
"grad_norm": 0.19385474920272827,
"learning_rate": 0.00013950810339640688,
"loss": 0.286,
"mean_token_accuracy": 0.9242500007152558,
"step": 155
},
{
"epoch": 1.3347280334728033,
"grad_norm": 0.18098486959934235,
"learning_rate": 0.00013496725761049638,
"loss": 0.2797,
"mean_token_accuracy": 0.9268094718456268,
"step": 160
},
{
"epoch": 1.3765690376569037,
"grad_norm": 0.1658986210823059,
"learning_rate": 0.00013034269657086992,
"loss": 0.2535,
"mean_token_accuracy": 0.931826388835907,
"step": 165
},
{
"epoch": 1.4184100418410042,
"grad_norm": 0.16022320091724396,
"learning_rate": 0.0001256454919579284,
"loss": 0.2564,
"mean_token_accuracy": 0.9323311626911164,
"step": 170
},
{
"epoch": 1.4602510460251046,
"grad_norm": 0.1953819990158081,
"learning_rate": 0.00012088688936834704,
"loss": 0.2459,
"mean_token_accuracy": 0.9334764361381531,
"step": 175
},
{
"epoch": 1.502092050209205,
"grad_norm": 0.18393699824810028,
"learning_rate": 0.00011607828139194683,
"loss": 0.286,
"mean_token_accuracy": 0.9224025547504425,
"step": 180
},
{
"epoch": 1.5439330543933054,
"grad_norm": 0.21144607663154602,
"learning_rate": 0.00011123118033664876,
"loss": 0.2604,
"mean_token_accuracy": 0.9307535707950592,
"step": 185
},
{
"epoch": 1.5857740585774058,
"grad_norm": 0.18700766563415527,
"learning_rate": 0.00010635719066681063,
"loss": 0.2579,
"mean_token_accuracy": 0.9308933317661285,
"step": 190
},
{
"epoch": 1.6276150627615062,
"grad_norm": 0.20680415630340576,
"learning_rate": 0.00010146798122093166,
"loss": 0.2745,
"mean_token_accuracy": 0.9262791752815247,
"step": 195
},
{
"epoch": 1.6694560669456067,
"grad_norm": 0.2134338617324829,
"learning_rate": 9.657525727523897e-05,
"loss": 0.2838,
"mean_token_accuracy": 0.9231680512428284,
"step": 200
},
{
"epoch": 1.711297071129707,
"grad_norm": 0.18799664080142975,
"learning_rate": 9.169073252003779e-05,
"loss": 0.2694,
"mean_token_accuracy": 0.9275272071361542,
"step": 205
},
{
"epoch": 1.7531380753138075,
"grad_norm": 0.18628208339214325,
"learning_rate": 8.682610101591814e-05,
"loss": 0.2518,
"mean_token_accuracy": 0.9325793802738189,
"step": 210
},
{
"epoch": 1.794979079497908,
"grad_norm": 0.18667584657669067,
"learning_rate": 8.19930091969567e-05,
"loss": 0.2671,
"mean_token_accuracy": 0.9276439845561981,
"step": 215
},
{
"epoch": 1.8368200836820083,
"grad_norm": 0.2169790118932724,
"learning_rate": 7.720302798794153e-05,
"loss": 0.2608,
"mean_token_accuracy": 0.9305820167064667,
"step": 220
},
{
"epoch": 1.8786610878661087,
"grad_norm": 0.20957927405834198,
"learning_rate": 7.246762510237403e-05,
"loss": 0.2592,
"mean_token_accuracy": 0.9301635086536407,
"step": 225
},
{
"epoch": 1.9205020920502092,
"grad_norm": 0.21752670407295227,
"learning_rate": 6.779813758756943e-05,
"loss": 0.2683,
"mean_token_accuracy": 0.9270337641239166,
"step": 230
},
{
"epoch": 1.9623430962343096,
"grad_norm": 0.21591052412986755,
"learning_rate": 6.320574468258555e-05,
"loss": 0.2555,
"mean_token_accuracy": 0.930656349658966,
"step": 235
},
{
"epoch": 2.0,
"grad_norm": 0.2959473729133606,
"learning_rate": 5.870144105396118e-05,
"loss": 0.253,
"mean_token_accuracy": 0.9325528542200724,
"step": 240
},
{
"epoch": 2.0,
"eval_loss": 0.3456202745437622,
"eval_mean_token_accuracy": 0.914831633037991,
"eval_runtime": 188.7768,
"eval_samples_per_second": 1.907,
"eval_steps_per_second": 0.238,
"step": 240
},
{
"epoch": 2.0418410041841004,
"grad_norm": 0.24203084409236908,
"learning_rate": 5.429601047334022e-05,
"loss": 0.1868,
"mean_token_accuracy": 0.9483842253684998,
"step": 245
},
{
"epoch": 2.083682008368201,
"grad_norm": 0.2632400393486023,
"learning_rate": 5.000000000000002e-05,
"loss": 0.2226,
"mean_token_accuracy": 0.9376705825328827,
"step": 250
},
{
"epoch": 2.1255230125523012,
"grad_norm": 0.24455517530441284,
"learning_rate": 4.58236947300939e-05,
"loss": 0.2038,
"mean_token_accuracy": 0.9432938873767853,
"step": 255
},
{
"epoch": 2.1673640167364017,
"grad_norm": 0.27484190464019775,
"learning_rate": 4.17770931730606e-05,
"loss": 0.207,
"mean_token_accuracy": 0.9414038896560669,
"step": 260
},
{
"epoch": 2.209205020920502,
"grad_norm": 0.22127795219421387,
"learning_rate": 3.786988331415211e-05,
"loss": 0.2139,
"mean_token_accuracy": 0.939983355998993,
"step": 265
},
{
"epoch": 2.2510460251046025,
"grad_norm": 0.2719532549381256,
"learning_rate": 3.41114194203889e-05,
"loss": 0.2091,
"mean_token_accuracy": 0.9410282075405121,
"step": 270
},
{
"epoch": 2.292887029288703,
"grad_norm": 0.27510541677474976,
"learning_rate": 3.0510699645470984e-05,
"loss": 0.204,
"mean_token_accuracy": 0.9429362654685974,
"step": 275
},
{
"epoch": 2.3347280334728033,
"grad_norm": 0.2817744016647339,
"learning_rate": 2.7076344487261697e-05,
"loss": 0.2278,
"mean_token_accuracy": 0.9364924848079681,
"step": 280
},
{
"epoch": 2.3765690376569037,
"grad_norm": 0.26610061526298523,
"learning_rate": 2.381657614941858e-05,
"loss": 0.1996,
"mean_token_accuracy": 0.9442806720733643,
"step": 285
},
{
"epoch": 2.418410041841004,
"grad_norm": 0.2731967568397522,
"learning_rate": 2.073919885658223e-05,
"loss": 0.2217,
"mean_token_accuracy": 0.9379707038402557,
"step": 290
},
{
"epoch": 2.4602510460251046,
"grad_norm": 0.2731280028820038,
"learning_rate": 1.7851580170250304e-05,
"loss": 0.1916,
"mean_token_accuracy": 0.9465275406837463,
"step": 295
},
{
"epoch": 2.502092050209205,
"grad_norm": 0.2769322097301483,
"learning_rate": 1.5160633350068509e-05,
"loss": 0.232,
"mean_token_accuracy": 0.9358445882797242,
"step": 300
},
{
"epoch": 2.5439330543933054,
"grad_norm": 0.27492755651474,
"learning_rate": 1.2672800802767715e-05,
"loss": 0.2094,
"mean_token_accuracy": 0.9408862173557282,
"step": 305
},
{
"epoch": 2.585774058577406,
"grad_norm": 0.309391051530838,
"learning_rate": 1.0394038658371574e-05,
"loss": 0.1979,
"mean_token_accuracy": 0.9446548879146576,
"step": 310
},
{
"epoch": 2.6276150627615062,
"grad_norm": 0.2758495509624481,
"learning_rate": 8.329802510601559e-06,
"loss": 0.2048,
"mean_token_accuracy": 0.9433695197105407,
"step": 315
},
{
"epoch": 2.6694560669456067,
"grad_norm": 0.2817879617214203,
"learning_rate": 6.485034355617747e-06,
"loss": 0.1977,
"mean_token_accuracy": 0.944527405500412,
"step": 320
},
{
"epoch": 2.711297071129707,
"grad_norm": 0.2818117141723633,
"learning_rate": 4.864150760365771e-06,
"loss": 0.2205,
"mean_token_accuracy": 0.9380866944789886,
"step": 325
},
{
"epoch": 2.7531380753138075,
"grad_norm": 0.2880750894546509,
"learning_rate": 3.471032288855869e-06,
"loss": 0.2078,
"mean_token_accuracy": 0.942375785112381,
"step": 330
},
{
"epoch": 2.794979079497908,
"grad_norm": 0.26675674319267273,
"learning_rate": 2.3090142116888646e-06,
"loss": 0.2037,
"mean_token_accuracy": 0.9432387411594391,
"step": 335
},
{
"epoch": 2.8368200836820083,
"grad_norm": 0.2919078469276428,
"learning_rate": 1.3808785210711606e-06,
"loss": 0.2217,
"mean_token_accuracy": 0.9386197745800018,
"step": 340
},
{
"epoch": 2.8786610878661087,
"grad_norm": 0.27360963821411133,
"learning_rate": 6.888472704359661e-07,
"loss": 0.2101,
"mean_token_accuracy": 0.9409088790416718,
"step": 345
},
{
"epoch": 2.920502092050209,
"grad_norm": 0.2891521453857422,
"learning_rate": 2.3457725461607517e-07,
"loss": 0.2033,
"mean_token_accuracy": 0.9430344641208649,
"step": 350
},
{
"epoch": 2.9623430962343096,
"grad_norm": 0.2698313593864441,
"learning_rate": 1.915604330464671e-08,
"loss": 0.1824,
"mean_token_accuracy": 0.9485956370830536,
"step": 355
},
{
"epoch": 2.97907949790795,
"eval_loss": 0.37856200337409973,
"eval_mean_token_accuracy": 0.9150180731500898,
"eval_runtime": 188.9862,
"eval_samples_per_second": 1.905,
"eval_steps_per_second": 0.238,
"step": 357
},
{
"epoch": 2.97907949790795,
"step": 357,
"total_flos": 2.503141107915817e+17,
"train_loss": 0.3060124111609633,
"train_runtime": 5934.9405,
"train_samples_per_second": 0.483,
"train_steps_per_second": 0.06
}
],
"logging_steps": 5,
"max_steps": 357,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.503141107915817e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}