dada22231's picture
Training in progress, step 95, checkpoint
da964a9 verified
{
"best_metric": 1.1252893209457397,
"best_model_checkpoint": "miner_id_24/checkpoint-75",
"epoch": 0.7735368956743003,
"eval_steps": 25,
"global_step": 95,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008142493638676845,
"grad_norm": 0.7951357364654541,
"learning_rate": 3.3333333333333335e-05,
"loss": 2.1777,
"step": 1
},
{
"epoch": 0.008142493638676845,
"eval_loss": 2.6585562229156494,
"eval_runtime": 3.2014,
"eval_samples_per_second": 15.618,
"eval_steps_per_second": 4.061,
"step": 1
},
{
"epoch": 0.01628498727735369,
"grad_norm": 0.920566976070404,
"learning_rate": 6.666666666666667e-05,
"loss": 2.1561,
"step": 2
},
{
"epoch": 0.024427480916030534,
"grad_norm": 0.9564467668533325,
"learning_rate": 0.0001,
"loss": 2.2048,
"step": 3
},
{
"epoch": 0.03256997455470738,
"grad_norm": 1.0686429738998413,
"learning_rate": 9.997376600647783e-05,
"loss": 2.1696,
"step": 4
},
{
"epoch": 0.04071246819338423,
"grad_norm": 1.0367040634155273,
"learning_rate": 9.989509461357426e-05,
"loss": 1.8942,
"step": 5
},
{
"epoch": 0.04885496183206107,
"grad_norm": 0.8149427175521851,
"learning_rate": 9.976407754861426e-05,
"loss": 1.6377,
"step": 6
},
{
"epoch": 0.056997455470737916,
"grad_norm": 0.7105250358581543,
"learning_rate": 9.958086757163489e-05,
"loss": 1.3655,
"step": 7
},
{
"epoch": 0.06513994910941476,
"grad_norm": 0.7786738872528076,
"learning_rate": 9.934567829727386e-05,
"loss": 1.3318,
"step": 8
},
{
"epoch": 0.0732824427480916,
"grad_norm": 0.6053236126899719,
"learning_rate": 9.905878394570453e-05,
"loss": 1.1656,
"step": 9
},
{
"epoch": 0.08142493638676845,
"grad_norm": 0.49749094247817993,
"learning_rate": 9.872051902290737e-05,
"loss": 1.1655,
"step": 10
},
{
"epoch": 0.08956743002544529,
"grad_norm": 0.5606850981712341,
"learning_rate": 9.833127793065098e-05,
"loss": 1.0697,
"step": 11
},
{
"epoch": 0.09770992366412214,
"grad_norm": 0.6993449330329895,
"learning_rate": 9.789151450663723e-05,
"loss": 0.9941,
"step": 12
},
{
"epoch": 0.10585241730279898,
"grad_norm": 0.5706828832626343,
"learning_rate": 9.740174149534693e-05,
"loss": 1.4403,
"step": 13
},
{
"epoch": 0.11399491094147583,
"grad_norm": 0.6802021861076355,
"learning_rate": 9.686252995020249e-05,
"loss": 1.4688,
"step": 14
},
{
"epoch": 0.12213740458015267,
"grad_norm": 0.5219370722770691,
"learning_rate": 9.627450856774539e-05,
"loss": 1.3689,
"step": 15
},
{
"epoch": 0.13027989821882952,
"grad_norm": 0.41930851340293884,
"learning_rate": 9.563836295460398e-05,
"loss": 1.3572,
"step": 16
},
{
"epoch": 0.13842239185750635,
"grad_norm": 0.47524869441986084,
"learning_rate": 9.495483482810688e-05,
"loss": 1.3036,
"step": 17
},
{
"epoch": 0.1465648854961832,
"grad_norm": 0.417620450258255,
"learning_rate": 9.422472115147382e-05,
"loss": 1.2112,
"step": 18
},
{
"epoch": 0.15470737913486005,
"grad_norm": 0.5184525847434998,
"learning_rate": 9.3448873204592e-05,
"loss": 1.2041,
"step": 19
},
{
"epoch": 0.1628498727735369,
"grad_norm": 0.43881309032440186,
"learning_rate": 9.2628195591462e-05,
"loss": 1.1282,
"step": 20
},
{
"epoch": 0.17099236641221374,
"grad_norm": 0.38495856523513794,
"learning_rate": 9.176364518546989e-05,
"loss": 1.1295,
"step": 21
},
{
"epoch": 0.17913486005089058,
"grad_norm": 0.3401827812194824,
"learning_rate": 9.08562300137157e-05,
"loss": 1.103,
"step": 22
},
{
"epoch": 0.18727735368956744,
"grad_norm": 0.3839218020439148,
"learning_rate": 8.990700808169889e-05,
"loss": 1.057,
"step": 23
},
{
"epoch": 0.19541984732824427,
"grad_norm": 0.42890799045562744,
"learning_rate": 8.891708613973126e-05,
"loss": 0.9879,
"step": 24
},
{
"epoch": 0.2035623409669211,
"grad_norm": 0.4219408631324768,
"learning_rate": 8.788761839251559e-05,
"loss": 0.7933,
"step": 25
},
{
"epoch": 0.2035623409669211,
"eval_loss": 1.1750500202178955,
"eval_runtime": 3.237,
"eval_samples_per_second": 15.446,
"eval_steps_per_second": 4.016,
"step": 25
},
{
"epoch": 0.21170483460559797,
"grad_norm": 0.5137602090835571,
"learning_rate": 8.681980515339464e-05,
"loss": 1.4807,
"step": 26
},
{
"epoch": 0.2198473282442748,
"grad_norm": 0.5336816906929016,
"learning_rate": 8.571489144483944e-05,
"loss": 1.4194,
"step": 27
},
{
"epoch": 0.22798982188295167,
"grad_norm": 0.37356939911842346,
"learning_rate": 8.457416554680877e-05,
"loss": 1.3018,
"step": 28
},
{
"epoch": 0.2361323155216285,
"grad_norm": 0.34776827692985535,
"learning_rate": 8.339895749467238e-05,
"loss": 1.2163,
"step": 29
},
{
"epoch": 0.24427480916030533,
"grad_norm": 0.3022593557834625,
"learning_rate": 8.219063752844926e-05,
"loss": 1.1786,
"step": 30
},
{
"epoch": 0.25241730279898217,
"grad_norm": 0.3063591420650482,
"learning_rate": 8.095061449516903e-05,
"loss": 1.1727,
"step": 31
},
{
"epoch": 0.26055979643765903,
"grad_norm": 0.3639509677886963,
"learning_rate": 7.968033420621935e-05,
"loss": 1.2463,
"step": 32
},
{
"epoch": 0.2687022900763359,
"grad_norm": 0.40702012181282043,
"learning_rate": 7.838127775159452e-05,
"loss": 1.0818,
"step": 33
},
{
"epoch": 0.2768447837150127,
"grad_norm": 0.40854519605636597,
"learning_rate": 7.705495977301078e-05,
"loss": 1.0641,
"step": 34
},
{
"epoch": 0.28498727735368956,
"grad_norm": 0.43549180030822754,
"learning_rate": 7.570292669790186e-05,
"loss": 1.0249,
"step": 35
},
{
"epoch": 0.2931297709923664,
"grad_norm": 0.44407615065574646,
"learning_rate": 7.43267549363537e-05,
"loss": 1.0178,
"step": 36
},
{
"epoch": 0.30127226463104323,
"grad_norm": 0.45777806639671326,
"learning_rate": 7.292804904308087e-05,
"loss": 0.8703,
"step": 37
},
{
"epoch": 0.3094147582697201,
"grad_norm": 0.30209803581237793,
"learning_rate": 7.150843984658754e-05,
"loss": 1.3586,
"step": 38
},
{
"epoch": 0.31755725190839695,
"grad_norm": 0.27303484082221985,
"learning_rate": 7.006958254769438e-05,
"loss": 1.3765,
"step": 39
},
{
"epoch": 0.3256997455470738,
"grad_norm": 0.31225576996803284,
"learning_rate": 6.861315478964841e-05,
"loss": 1.3231,
"step": 40
},
{
"epoch": 0.3338422391857506,
"grad_norm": 0.3471241891384125,
"learning_rate": 6.714085470206609e-05,
"loss": 1.2386,
"step": 41
},
{
"epoch": 0.3419847328244275,
"grad_norm": 0.4284417927265167,
"learning_rate": 6.56543989209901e-05,
"loss": 1.2152,
"step": 42
},
{
"epoch": 0.35012722646310435,
"grad_norm": 0.3678395748138428,
"learning_rate": 6.415552058736854e-05,
"loss": 1.1708,
"step": 43
},
{
"epoch": 0.35826972010178115,
"grad_norm": 0.3918590247631073,
"learning_rate": 6.264596732629e-05,
"loss": 1.2062,
"step": 44
},
{
"epoch": 0.366412213740458,
"grad_norm": 0.30557262897491455,
"learning_rate": 6.112749920933111e-05,
"loss": 1.115,
"step": 45
},
{
"epoch": 0.3745547073791349,
"grad_norm": 0.3059096336364746,
"learning_rate": 5.960188670239154e-05,
"loss": 1.0529,
"step": 46
},
{
"epoch": 0.3826972010178117,
"grad_norm": 0.3168966770172119,
"learning_rate": 5.80709086014102e-05,
"loss": 1.0206,
"step": 47
},
{
"epoch": 0.39083969465648855,
"grad_norm": 0.3171919584274292,
"learning_rate": 5.653634995836856e-05,
"loss": 0.9549,
"step": 48
},
{
"epoch": 0.3989821882951654,
"grad_norm": 0.3663451373577118,
"learning_rate": 5.500000000000001e-05,
"loss": 0.8722,
"step": 49
},
{
"epoch": 0.4071246819338422,
"grad_norm": 0.3894191384315491,
"learning_rate": 5.346365004163145e-05,
"loss": 0.8173,
"step": 50
},
{
"epoch": 0.4071246819338422,
"eval_loss": 1.1385380029678345,
"eval_runtime": 3.2577,
"eval_samples_per_second": 15.348,
"eval_steps_per_second": 3.99,
"step": 50
},
{
"epoch": 0.4152671755725191,
"grad_norm": 0.25706854462623596,
"learning_rate": 5.192909139858981e-05,
"loss": 1.3684,
"step": 51
},
{
"epoch": 0.42340966921119594,
"grad_norm": 0.2687229514122009,
"learning_rate": 5.0398113297608465e-05,
"loss": 1.2704,
"step": 52
},
{
"epoch": 0.4315521628498728,
"grad_norm": 0.2694632411003113,
"learning_rate": 4.887250079066892e-05,
"loss": 1.2929,
"step": 53
},
{
"epoch": 0.4396946564885496,
"grad_norm": 0.27409684658050537,
"learning_rate": 4.7354032673710005e-05,
"loss": 1.1768,
"step": 54
},
{
"epoch": 0.44783715012722647,
"grad_norm": 0.2727748453617096,
"learning_rate": 4.584447941263149e-05,
"loss": 1.1819,
"step": 55
},
{
"epoch": 0.45597964376590333,
"grad_norm": 0.29984185099601746,
"learning_rate": 4.43456010790099e-05,
"loss": 1.1877,
"step": 56
},
{
"epoch": 0.46412213740458014,
"grad_norm": 0.2994973957538605,
"learning_rate": 4.285914529793391e-05,
"loss": 1.1334,
"step": 57
},
{
"epoch": 0.472264631043257,
"grad_norm": 0.3209475576877594,
"learning_rate": 4.13868452103516e-05,
"loss": 1.1666,
"step": 58
},
{
"epoch": 0.48040712468193386,
"grad_norm": 0.317921906709671,
"learning_rate": 3.9930417452305626e-05,
"loss": 1.029,
"step": 59
},
{
"epoch": 0.48854961832061067,
"grad_norm": 0.32125380635261536,
"learning_rate": 3.8491560153412466e-05,
"loss": 0.9528,
"step": 60
},
{
"epoch": 0.49669211195928753,
"grad_norm": 0.30410146713256836,
"learning_rate": 3.707195095691913e-05,
"loss": 0.9018,
"step": 61
},
{
"epoch": 0.5048346055979643,
"grad_norm": 0.39632368087768555,
"learning_rate": 3.567324506364632e-05,
"loss": 0.9189,
"step": 62
},
{
"epoch": 0.5129770992366413,
"grad_norm": 0.28354790806770325,
"learning_rate": 3.4297073302098156e-05,
"loss": 1.2053,
"step": 63
},
{
"epoch": 0.5211195928753181,
"grad_norm": 0.23840811848640442,
"learning_rate": 3.2945040226989244e-05,
"loss": 1.3452,
"step": 64
},
{
"epoch": 0.5292620865139949,
"grad_norm": 0.2518406808376312,
"learning_rate": 3.16187222484055e-05,
"loss": 1.2796,
"step": 65
},
{
"epoch": 0.5374045801526718,
"grad_norm": 0.3083972632884979,
"learning_rate": 3.0319665793780648e-05,
"loss": 1.2915,
"step": 66
},
{
"epoch": 0.5455470737913486,
"grad_norm": 0.28872764110565186,
"learning_rate": 2.9049385504830985e-05,
"loss": 1.2188,
"step": 67
},
{
"epoch": 0.5536895674300254,
"grad_norm": 0.281512588262558,
"learning_rate": 2.7809362471550748e-05,
"loss": 1.1432,
"step": 68
},
{
"epoch": 0.5618320610687023,
"grad_norm": 0.2897692918777466,
"learning_rate": 2.660104250532764e-05,
"loss": 1.197,
"step": 69
},
{
"epoch": 0.5699745547073791,
"grad_norm": 0.28706327080726624,
"learning_rate": 2.5425834453191232e-05,
"loss": 1.065,
"step": 70
},
{
"epoch": 0.5781170483460559,
"grad_norm": 0.30318784713745117,
"learning_rate": 2.4285108555160577e-05,
"loss": 0.9742,
"step": 71
},
{
"epoch": 0.5862595419847328,
"grad_norm": 0.310880571603775,
"learning_rate": 2.3180194846605367e-05,
"loss": 0.9485,
"step": 72
},
{
"epoch": 0.5944020356234097,
"grad_norm": 0.33773961663246155,
"learning_rate": 2.2112381607484417e-05,
"loss": 0.98,
"step": 73
},
{
"epoch": 0.6025445292620865,
"grad_norm": 0.35677966475486755,
"learning_rate": 2.1082913860268765e-05,
"loss": 0.8608,
"step": 74
},
{
"epoch": 0.6106870229007634,
"grad_norm": 0.38845735788345337,
"learning_rate": 2.0092991918301108e-05,
"loss": 0.787,
"step": 75
},
{
"epoch": 0.6106870229007634,
"eval_loss": 1.1252893209457397,
"eval_runtime": 3.2469,
"eval_samples_per_second": 15.399,
"eval_steps_per_second": 4.004,
"step": 75
},
{
"epoch": 0.6188295165394402,
"grad_norm": 0.2565176784992218,
"learning_rate": 1.91437699862843e-05,
"loss": 1.331,
"step": 76
},
{
"epoch": 0.6269720101781171,
"grad_norm": 0.2451658397912979,
"learning_rate": 1.8236354814530112e-05,
"loss": 1.2461,
"step": 77
},
{
"epoch": 0.6351145038167939,
"grad_norm": 0.2756778597831726,
"learning_rate": 1.7371804408538024e-05,
"loss": 1.2637,
"step": 78
},
{
"epoch": 0.6432569974554707,
"grad_norm": 0.2695421874523163,
"learning_rate": 1.6551126795408016e-05,
"loss": 1.2262,
"step": 79
},
{
"epoch": 0.6513994910941476,
"grad_norm": 0.288644403219223,
"learning_rate": 1.577527884852619e-05,
"loss": 1.1969,
"step": 80
},
{
"epoch": 0.6595419847328244,
"grad_norm": 0.29181233048439026,
"learning_rate": 1.5045165171893116e-05,
"loss": 1.163,
"step": 81
},
{
"epoch": 0.6676844783715012,
"grad_norm": 0.2998856008052826,
"learning_rate": 1.4361637045396029e-05,
"loss": 1.0569,
"step": 82
},
{
"epoch": 0.6758269720101782,
"grad_norm": 0.3256518244743347,
"learning_rate": 1.3725491432254624e-05,
"loss": 1.0537,
"step": 83
},
{
"epoch": 0.683969465648855,
"grad_norm": 0.29111307859420776,
"learning_rate": 1.313747004979751e-05,
"loss": 1.0298,
"step": 84
},
{
"epoch": 0.6921119592875318,
"grad_norm": 0.32174497842788696,
"learning_rate": 1.2598258504653081e-05,
"loss": 0.9976,
"step": 85
},
{
"epoch": 0.7002544529262087,
"grad_norm": 0.36173349618911743,
"learning_rate": 1.2108485493362765e-05,
"loss": 0.9361,
"step": 86
},
{
"epoch": 0.7083969465648855,
"grad_norm": 0.3511975407600403,
"learning_rate": 1.1668722069349041e-05,
"loss": 0.8343,
"step": 87
},
{
"epoch": 0.7165394402035623,
"grad_norm": 0.2947863042354584,
"learning_rate": 1.1279480977092635e-05,
"loss": 1.3039,
"step": 88
},
{
"epoch": 0.7246819338422392,
"grad_norm": 0.2641090154647827,
"learning_rate": 1.094121605429547e-05,
"loss": 1.2791,
"step": 89
},
{
"epoch": 0.732824427480916,
"grad_norm": 0.2599228024482727,
"learning_rate": 1.0654321702726141e-05,
"loss": 1.2946,
"step": 90
},
{
"epoch": 0.7409669211195928,
"grad_norm": 0.2754274308681488,
"learning_rate": 1.0419132428365116e-05,
"loss": 1.2478,
"step": 91
},
{
"epoch": 0.7491094147582698,
"grad_norm": 0.2846546173095703,
"learning_rate": 1.0235922451385733e-05,
"loss": 1.2817,
"step": 92
},
{
"epoch": 0.7572519083969466,
"grad_norm": 0.29695919156074524,
"learning_rate": 1.0104905386425733e-05,
"loss": 1.2161,
"step": 93
},
{
"epoch": 0.7653944020356234,
"grad_norm": 0.29593130946159363,
"learning_rate": 1.002623399352217e-05,
"loss": 1.1682,
"step": 94
},
{
"epoch": 0.7735368956743003,
"grad_norm": 0.2888301908969879,
"learning_rate": 1e-05,
"loss": 1.0711,
"step": 95
}
],
"logging_steps": 1,
"max_steps": 95,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0685709840233267e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}