PumeTu's picture
Add files using upload-large-folder tool
59867d7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.3255813953488373,
"eval_steps": 10,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011627906976744186,
"grad_norm": 1.630926489830017,
"learning_rate": 0.0,
"loss": 2.6815,
"step": 1
},
{
"epoch": 0.023255813953488372,
"grad_norm": 1.7017581462860107,
"learning_rate": 7.692307692307694e-06,
"loss": 2.8172,
"step": 2
},
{
"epoch": 0.03488372093023256,
"grad_norm": 1.7061189413070679,
"learning_rate": 1.5384615384615387e-05,
"loss": 2.7979,
"step": 3
},
{
"epoch": 0.046511627906976744,
"grad_norm": 1.735384225845337,
"learning_rate": 2.307692307692308e-05,
"loss": 2.7698,
"step": 4
},
{
"epoch": 0.05813953488372093,
"grad_norm": 1.8297396898269653,
"learning_rate": 3.0769230769230774e-05,
"loss": 2.6846,
"step": 5
},
{
"epoch": 0.06976744186046512,
"grad_norm": 1.9019414186477661,
"learning_rate": 3.846153846153846e-05,
"loss": 2.4865,
"step": 6
},
{
"epoch": 0.08139534883720931,
"grad_norm": 2.0125694274902344,
"learning_rate": 4.615384615384616e-05,
"loss": 2.3993,
"step": 7
},
{
"epoch": 0.09302325581395349,
"grad_norm": 2.0356626510620117,
"learning_rate": 5.384615384615385e-05,
"loss": 2.0547,
"step": 8
},
{
"epoch": 0.10465116279069768,
"grad_norm": 2.1409757137298584,
"learning_rate": 6.153846153846155e-05,
"loss": 1.7189,
"step": 9
},
{
"epoch": 0.11627906976744186,
"grad_norm": 2.369163751602173,
"learning_rate": 6.923076923076924e-05,
"loss": 1.4047,
"step": 10
},
{
"epoch": 0.11627906976744186,
"eval_loss": 1.1081944704055786,
"eval_runtime": 69.1299,
"eval_samples_per_second": 70.707,
"eval_steps_per_second": 1.114,
"step": 10
},
{
"epoch": 0.12790697674418605,
"grad_norm": 2.303812265396118,
"learning_rate": 7.692307692307693e-05,
"loss": 1.0561,
"step": 11
},
{
"epoch": 0.13953488372093023,
"grad_norm": 1.9584007263183594,
"learning_rate": 8.461538461538461e-05,
"loss": 0.7269,
"step": 12
},
{
"epoch": 0.1511627906976744,
"grad_norm": 1.8204364776611328,
"learning_rate": 9.230769230769232e-05,
"loss": 0.4613,
"step": 13
},
{
"epoch": 0.16279069767441862,
"grad_norm": 1.328773021697998,
"learning_rate": 0.0001,
"loss": 0.2739,
"step": 14
},
{
"epoch": 0.1744186046511628,
"grad_norm": 1.0226366519927979,
"learning_rate": 0.0001076923076923077,
"loss": 0.1802,
"step": 15
},
{
"epoch": 0.18604651162790697,
"grad_norm": 0.5955405831336975,
"learning_rate": 0.00011538461538461538,
"loss": 0.1093,
"step": 16
},
{
"epoch": 0.19767441860465115,
"grad_norm": 0.2616266906261444,
"learning_rate": 0.0001230769230769231,
"loss": 0.0676,
"step": 17
},
{
"epoch": 0.20930232558139536,
"grad_norm": 0.12042512744665146,
"learning_rate": 0.00013076923076923077,
"loss": 0.045,
"step": 18
},
{
"epoch": 0.22093023255813954,
"grad_norm": 0.12201035767793655,
"learning_rate": 0.00013846153846153847,
"loss": 0.0505,
"step": 19
},
{
"epoch": 0.23255813953488372,
"grad_norm": 0.09313634783029556,
"learning_rate": 0.00014615384615384615,
"loss": 0.0377,
"step": 20
},
{
"epoch": 0.23255813953488372,
"eval_loss": 0.04212512448430061,
"eval_runtime": 68.7789,
"eval_samples_per_second": 71.068,
"eval_steps_per_second": 1.12,
"step": 20
},
{
"epoch": 0.2441860465116279,
"grad_norm": 0.11886867135763168,
"learning_rate": 0.00015384615384615385,
"loss": 0.0409,
"step": 21
},
{
"epoch": 0.2558139534883721,
"grad_norm": 0.09348498284816742,
"learning_rate": 0.00016153846153846155,
"loss": 0.0322,
"step": 22
},
{
"epoch": 0.26744186046511625,
"grad_norm": 0.11308024078607559,
"learning_rate": 0.00016923076923076923,
"loss": 0.0389,
"step": 23
},
{
"epoch": 0.27906976744186046,
"grad_norm": 0.10123038291931152,
"learning_rate": 0.00017692307692307693,
"loss": 0.0355,
"step": 24
},
{
"epoch": 0.29069767441860467,
"grad_norm": 0.20477375388145447,
"learning_rate": 0.00018461538461538463,
"loss": 0.0392,
"step": 25
},
{
"epoch": 0.3023255813953488,
"grad_norm": 0.09108395129442215,
"learning_rate": 0.00019230769230769233,
"loss": 0.0311,
"step": 26
},
{
"epoch": 0.313953488372093,
"grad_norm": 0.10242355614900589,
"learning_rate": 0.0002,
"loss": 0.0311,
"step": 27
},
{
"epoch": 0.32558139534883723,
"grad_norm": 0.10945220291614532,
"learning_rate": 0.00019999083173529673,
"loss": 0.0304,
"step": 28
},
{
"epoch": 0.3372093023255814,
"grad_norm": 0.07543787360191345,
"learning_rate": 0.0001999633286223284,
"loss": 0.0295,
"step": 29
},
{
"epoch": 0.3488372093023256,
"grad_norm": 0.07906319946050644,
"learning_rate": 0.00019991749570421146,
"loss": 0.0309,
"step": 30
},
{
"epoch": 0.3488372093023256,
"eval_loss": 0.03273880109190941,
"eval_runtime": 69.34,
"eval_samples_per_second": 70.493,
"eval_steps_per_second": 1.11,
"step": 30
},
{
"epoch": 0.36046511627906974,
"grad_norm": 0.07591050863265991,
"learning_rate": 0.00019985334138511237,
"loss": 0.0298,
"step": 31
},
{
"epoch": 0.37209302325581395,
"grad_norm": 0.07976777851581573,
"learning_rate": 0.0001997708774287068,
"loss": 0.0346,
"step": 32
},
{
"epoch": 0.38372093023255816,
"grad_norm": 0.06528059393167496,
"learning_rate": 0.0001996701189560223,
"loss": 0.0301,
"step": 33
},
{
"epoch": 0.3953488372093023,
"grad_norm": 0.04785207286477089,
"learning_rate": 0.00019955108444266585,
"loss": 0.0291,
"step": 34
},
{
"epoch": 0.4069767441860465,
"grad_norm": 0.06423522531986237,
"learning_rate": 0.00019941379571543596,
"loss": 0.0302,
"step": 35
},
{
"epoch": 0.4186046511627907,
"grad_norm": 0.043477609753608704,
"learning_rate": 0.00019925827794832056,
"loss": 0.0281,
"step": 36
},
{
"epoch": 0.43023255813953487,
"grad_norm": 0.0563591867685318,
"learning_rate": 0.00019908455965788067,
"loss": 0.0321,
"step": 37
},
{
"epoch": 0.4418604651162791,
"grad_norm": 0.07481367886066437,
"learning_rate": 0.00019889267269802176,
"loss": 0.0285,
"step": 38
},
{
"epoch": 0.45348837209302323,
"grad_norm": 0.05782244727015495,
"learning_rate": 0.00019868265225415265,
"loss": 0.0283,
"step": 39
},
{
"epoch": 0.46511627906976744,
"grad_norm": 0.05342981219291687,
"learning_rate": 0.00019845453683673368,
"loss": 0.0276,
"step": 40
},
{
"epoch": 0.46511627906976744,
"eval_loss": 0.02999330498278141,
"eval_runtime": 68.8117,
"eval_samples_per_second": 71.034,
"eval_steps_per_second": 1.119,
"step": 40
},
{
"epoch": 0.47674418604651164,
"grad_norm": 0.07017785310745239,
"learning_rate": 0.0001982083682742156,
"loss": 0.0298,
"step": 41
},
{
"epoch": 0.4883720930232558,
"grad_norm": 0.04703626409173012,
"learning_rate": 0.00019794419170536916,
"loss": 0.0264,
"step": 42
},
{
"epoch": 0.5,
"grad_norm": 0.04164445772767067,
"learning_rate": 0.00019766205557100868,
"loss": 0.0286,
"step": 43
},
{
"epoch": 0.5116279069767442,
"grad_norm": 0.04445081949234009,
"learning_rate": 0.00019736201160510931,
"loss": 0.0282,
"step": 44
},
{
"epoch": 0.5232558139534884,
"grad_norm": 0.03947937488555908,
"learning_rate": 0.00019704411482532116,
"loss": 0.0253,
"step": 45
},
{
"epoch": 0.5348837209302325,
"grad_norm": 0.04509953781962395,
"learning_rate": 0.0001967084235228807,
"loss": 0.0218,
"step": 46
},
{
"epoch": 0.5465116279069767,
"grad_norm": 0.0683850422501564,
"learning_rate": 0.0001963549992519223,
"loss": 0.028,
"step": 47
},
{
"epoch": 0.5581395348837209,
"grad_norm": 0.045640990138053894,
"learning_rate": 0.0001959839068181914,
"loss": 0.0302,
"step": 48
},
{
"epoch": 0.5697674418604651,
"grad_norm": 0.1620291918516159,
"learning_rate": 0.00019559521426716118,
"loss": 0.0242,
"step": 49
},
{
"epoch": 0.5813953488372093,
"grad_norm": 0.04173683002591133,
"learning_rate": 0.00019518899287155556,
"loss": 0.0307,
"step": 50
},
{
"epoch": 0.5813953488372093,
"eval_loss": 0.02883034199476242,
"eval_runtime": 69.0904,
"eval_samples_per_second": 70.748,
"eval_steps_per_second": 1.114,
"step": 50
},
{
"epoch": 0.5930232558139535,
"grad_norm": 0.03936028108000755,
"learning_rate": 0.00019476531711828027,
"loss": 0.0251,
"step": 51
},
{
"epoch": 0.6046511627906976,
"grad_norm": 0.04982665926218033,
"learning_rate": 0.0001943242646947643,
"loss": 0.0252,
"step": 52
},
{
"epoch": 0.6162790697674418,
"grad_norm": 0.042471516877412796,
"learning_rate": 0.00019386591647471506,
"loss": 0.0287,
"step": 53
},
{
"epoch": 0.627906976744186,
"grad_norm": 0.03394132852554321,
"learning_rate": 0.00019339035650328869,
"loss": 0.0278,
"step": 54
},
{
"epoch": 0.6395348837209303,
"grad_norm": 0.03576912358403206,
"learning_rate": 0.00019289767198167916,
"loss": 0.0259,
"step": 55
},
{
"epoch": 0.6511627906976745,
"grad_norm": 0.035896990448236465,
"learning_rate": 0.0001923879532511287,
"loss": 0.0252,
"step": 56
},
{
"epoch": 0.6627906976744186,
"grad_norm": 0.03331352025270462,
"learning_rate": 0.0001918612937763622,
"loss": 0.0274,
"step": 57
},
{
"epoch": 0.6744186046511628,
"grad_norm": 0.04263336956501007,
"learning_rate": 0.00019131779012844912,
"loss": 0.0231,
"step": 58
},
{
"epoch": 0.686046511627907,
"grad_norm": 0.04020223766565323,
"learning_rate": 0.00019075754196709572,
"loss": 0.0224,
"step": 59
},
{
"epoch": 0.6976744186046512,
"grad_norm": 0.043280404061079025,
"learning_rate": 0.00019018065202237083,
"loss": 0.0266,
"step": 60
},
{
"epoch": 0.6976744186046512,
"eval_loss": 0.028244854882359505,
"eval_runtime": 68.7936,
"eval_samples_per_second": 71.053,
"eval_steps_per_second": 1.119,
"step": 60
},
{
"epoch": 0.7093023255813954,
"grad_norm": 0.03325851261615753,
"learning_rate": 0.0001895872260758688,
"loss": 0.024,
"step": 61
},
{
"epoch": 0.7209302325581395,
"grad_norm": 0.02916116639971733,
"learning_rate": 0.00018897737294131284,
"loss": 0.0237,
"step": 62
},
{
"epoch": 0.7325581395348837,
"grad_norm": 0.033790189772844315,
"learning_rate": 0.0001883512044446023,
"loss": 0.0261,
"step": 63
},
{
"epoch": 0.7441860465116279,
"grad_norm": 0.03239690884947777,
"learning_rate": 0.0001877088354033077,
"loss": 0.0272,
"step": 64
},
{
"epoch": 0.7558139534883721,
"grad_norm": 0.03925548493862152,
"learning_rate": 0.0001870503836056172,
"loss": 0.0246,
"step": 65
},
{
"epoch": 0.7674418604651163,
"grad_norm": 0.03802689164876938,
"learning_rate": 0.00018637596978873835,
"loss": 0.0294,
"step": 66
},
{
"epoch": 0.7790697674418605,
"grad_norm": 0.03799804300069809,
"learning_rate": 0.00018568571761675893,
"loss": 0.025,
"step": 67
},
{
"epoch": 0.7906976744186046,
"grad_norm": 0.034483980387449265,
"learning_rate": 0.0001849797536579715,
"loss": 0.0261,
"step": 68
},
{
"epoch": 0.8023255813953488,
"grad_norm": 0.049156103283166885,
"learning_rate": 0.0001842582073616649,
"loss": 0.024,
"step": 69
},
{
"epoch": 0.813953488372093,
"grad_norm": 0.03340472653508186,
"learning_rate": 0.000183521211034388,
"loss": 0.0293,
"step": 70
},
{
"epoch": 0.813953488372093,
"eval_loss": 0.02760264091193676,
"eval_runtime": 68.781,
"eval_samples_per_second": 71.066,
"eval_steps_per_second": 1.119,
"step": 70
},
{
"epoch": 0.8255813953488372,
"grad_norm": 0.030273284763097763,
"learning_rate": 0.00018276889981568906,
"loss": 0.026,
"step": 71
},
{
"epoch": 0.8372093023255814,
"grad_norm": 0.02898152358829975,
"learning_rate": 0.0001820014116533359,
"loss": 0.0247,
"step": 72
},
{
"epoch": 0.8488372093023255,
"grad_norm": 0.03946846351027489,
"learning_rate": 0.00018121888727802113,
"loss": 0.0259,
"step": 73
},
{
"epoch": 0.8604651162790697,
"grad_norm": 0.035859089344739914,
"learning_rate": 0.0001804214701775569,
"loss": 0.0276,
"step": 74
},
{
"epoch": 0.872093023255814,
"grad_norm": 0.03241611272096634,
"learning_rate": 0.00017960930657056438,
"loss": 0.0229,
"step": 75
},
{
"epoch": 0.8837209302325582,
"grad_norm": 0.025535929948091507,
"learning_rate": 0.00017878254537966216,
"loss": 0.0202,
"step": 76
},
{
"epoch": 0.8953488372093024,
"grad_norm": 0.03790373355150223,
"learning_rate": 0.00017794133820415916,
"loss": 0.026,
"step": 77
},
{
"epoch": 0.9069767441860465,
"grad_norm": 0.0408620722591877,
"learning_rate": 0.0001770858392922565,
"loss": 0.0253,
"step": 78
},
{
"epoch": 0.9186046511627907,
"grad_norm": 0.033651720732450485,
"learning_rate": 0.00017621620551276366,
"loss": 0.0227,
"step": 79
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.03782816231250763,
"learning_rate": 0.00017533259632633442,
"loss": 0.0254,
"step": 80
},
{
"epoch": 0.9302325581395349,
"eval_loss": 0.026909608393907547,
"eval_runtime": 68.7973,
"eval_samples_per_second": 71.049,
"eval_steps_per_second": 1.119,
"step": 80
},
{
"epoch": 0.9418604651162791,
"grad_norm": 0.03370513767004013,
"learning_rate": 0.00017443517375622704,
"loss": 0.0261,
"step": 81
},
{
"epoch": 0.9534883720930233,
"grad_norm": 0.03856685757637024,
"learning_rate": 0.00017352410235859503,
"loss": 0.0256,
"step": 82
},
{
"epoch": 0.9651162790697675,
"grad_norm": 0.04497801512479782,
"learning_rate": 0.0001725995491923131,
"loss": 0.0262,
"step": 83
},
{
"epoch": 0.9767441860465116,
"grad_norm": 0.031994592398405075,
"learning_rate": 0.00017166168378834448,
"loss": 0.0251,
"step": 84
},
{
"epoch": 0.9883720930232558,
"grad_norm": 0.035724181681871414,
"learning_rate": 0.00017071067811865476,
"loss": 0.0254,
"step": 85
},
{
"epoch": 1.0,
"grad_norm": 0.029222311452031136,
"learning_rate": 0.00016974670656467824,
"loss": 0.0238,
"step": 86
},
{
"epoch": 1.0116279069767442,
"grad_norm": 0.039559703320264816,
"learning_rate": 0.00016876994588534234,
"loss": 0.0296,
"step": 87
},
{
"epoch": 1.0232558139534884,
"grad_norm": 0.031729090958833694,
"learning_rate": 0.0001677805751846563,
"loss": 0.018,
"step": 88
},
{
"epoch": 1.0348837209302326,
"grad_norm": 0.029029319062829018,
"learning_rate": 0.00016677877587886956,
"loss": 0.0244,
"step": 89
},
{
"epoch": 1.0465116279069768,
"grad_norm": 0.025509672239422798,
"learning_rate": 0.00016576473166320644,
"loss": 0.0215,
"step": 90
},
{
"epoch": 1.0465116279069768,
"eval_loss": 0.026453962549567223,
"eval_runtime": 68.7872,
"eval_samples_per_second": 71.06,
"eval_steps_per_second": 1.119,
"step": 90
},
{
"epoch": 1.058139534883721,
"grad_norm": 0.027732428163290024,
"learning_rate": 0.00016473862847818277,
"loss": 0.0251,
"step": 91
},
{
"epoch": 1.069767441860465,
"grad_norm": 0.023567862808704376,
"learning_rate": 0.00016370065447551078,
"loss": 0.0224,
"step": 92
},
{
"epoch": 1.0813953488372092,
"grad_norm": 0.030995313078165054,
"learning_rate": 0.00016265099998359866,
"loss": 0.0236,
"step": 93
},
{
"epoch": 1.0930232558139534,
"grad_norm": 0.03294675052165985,
"learning_rate": 0.00016158985747265108,
"loss": 0.0253,
"step": 94
},
{
"epoch": 1.1046511627906976,
"grad_norm": 0.030441010370850563,
"learning_rate": 0.00016051742151937655,
"loss": 0.0228,
"step": 95
},
{
"epoch": 1.1162790697674418,
"grad_norm": 0.029724519699811935,
"learning_rate": 0.000159433888771309,
"loss": 0.0193,
"step": 96
},
{
"epoch": 1.127906976744186,
"grad_norm": 0.04900391027331352,
"learning_rate": 0.00015833945791074943,
"loss": 0.0206,
"step": 97
},
{
"epoch": 1.1395348837209303,
"grad_norm": 0.0280914343893528,
"learning_rate": 0.0001572343296183344,
"loss": 0.0189,
"step": 98
},
{
"epoch": 1.1511627906976745,
"grad_norm": 0.031953178346157074,
"learning_rate": 0.00015611870653623825,
"loss": 0.0226,
"step": 99
},
{
"epoch": 1.1627906976744187,
"grad_norm": 0.02610064297914505,
"learning_rate": 0.0001549927932310155,
"loss": 0.0176,
"step": 100
},
{
"epoch": 1.1627906976744187,
"eval_loss": 0.026211915537714958,
"eval_runtime": 68.7868,
"eval_samples_per_second": 71.06,
"eval_steps_per_second": 1.119,
"step": 100
},
{
"epoch": 1.1744186046511629,
"grad_norm": 0.04419023171067238,
"learning_rate": 0.00015385679615609042,
"loss": 0.0269,
"step": 101
},
{
"epoch": 1.1860465116279069,
"grad_norm": 0.09231790900230408,
"learning_rate": 0.00015271092361390077,
"loss": 0.0258,
"step": 102
},
{
"epoch": 1.197674418604651,
"grad_norm": 0.034355148673057556,
"learning_rate": 0.00015155538571770218,
"loss": 0.0244,
"step": 103
},
{
"epoch": 1.2093023255813953,
"grad_norm": 0.03240971267223358,
"learning_rate": 0.00015039039435304078,
"loss": 0.0235,
"step": 104
},
{
"epoch": 1.2209302325581395,
"grad_norm": 0.02766534686088562,
"learning_rate": 0.00014921616313890072,
"loss": 0.021,
"step": 105
},
{
"epoch": 1.2325581395348837,
"grad_norm": 0.030099626630544662,
"learning_rate": 0.00014803290738853395,
"loss": 0.0218,
"step": 106
},
{
"epoch": 1.244186046511628,
"grad_norm": 0.030833614990115166,
"learning_rate": 0.00014684084406997903,
"loss": 0.0197,
"step": 107
},
{
"epoch": 1.255813953488372,
"grad_norm": 0.02916071005165577,
"learning_rate": 0.0001456401917662769,
"loss": 0.022,
"step": 108
},
{
"epoch": 1.2674418604651163,
"grad_norm": 0.024599241092801094,
"learning_rate": 0.00014443117063539038,
"loss": 0.0249,
"step": 109
},
{
"epoch": 1.2790697674418605,
"grad_norm": 0.04152291268110275,
"learning_rate": 0.00014321400236983457,
"loss": 0.0227,
"step": 110
},
{
"epoch": 1.2790697674418605,
"eval_loss": 0.025697337463498116,
"eval_runtime": 68.7913,
"eval_samples_per_second": 71.055,
"eval_steps_per_second": 1.119,
"step": 110
},
{
"epoch": 1.2906976744186047,
"grad_norm": 0.026202471926808357,
"learning_rate": 0.00014198891015602646,
"loss": 0.0225,
"step": 111
},
{
"epoch": 1.302325581395349,
"grad_norm": 0.026729293167591095,
"learning_rate": 0.0001407561186333601,
"loss": 0.0231,
"step": 112
},
{
"epoch": 1.3139534883720931,
"grad_norm": 0.03199277073144913,
"learning_rate": 0.00013951585385301555,
"loss": 0.0187,
"step": 113
},
{
"epoch": 1.3255813953488373,
"grad_norm": 0.030409252271056175,
"learning_rate": 0.000138268343236509,
"loss": 0.0251,
"step": 114
},
{
"epoch": 1.3372093023255813,
"grad_norm": 0.029044533148407936,
"learning_rate": 0.00013701381553399145,
"loss": 0.0206,
"step": 115
},
{
"epoch": 1.3488372093023255,
"grad_norm": 0.0352545827627182,
"learning_rate": 0.000135752500782304,
"loss": 0.0195,
"step": 116
},
{
"epoch": 1.3604651162790697,
"grad_norm": 0.03767949342727661,
"learning_rate": 0.00013448463026279704,
"loss": 0.0253,
"step": 117
},
{
"epoch": 1.372093023255814,
"grad_norm": 0.02688649669289589,
"learning_rate": 0.0001332104364589212,
"loss": 0.0196,
"step": 118
},
{
"epoch": 1.3837209302325582,
"grad_norm": 0.03161188215017319,
"learning_rate": 0.000131930153013598,
"loss": 0.0219,
"step": 119
},
{
"epoch": 1.3953488372093024,
"grad_norm": 0.03074447624385357,
"learning_rate": 0.00013064401468637792,
"loss": 0.0198,
"step": 120
},
{
"epoch": 1.3953488372093024,
"eval_loss": 0.025383805856108665,
"eval_runtime": 68.781,
"eval_samples_per_second": 71.066,
"eval_steps_per_second": 1.119,
"step": 120
},
{
"epoch": 1.4069767441860466,
"grad_norm": 0.03676707297563553,
"learning_rate": 0.00012935225731039348,
"loss": 0.0268,
"step": 121
},
{
"epoch": 1.4186046511627908,
"grad_norm": 0.04459831491112709,
"learning_rate": 0.00012805511774911584,
"loss": 0.0233,
"step": 122
},
{
"epoch": 1.4302325581395348,
"grad_norm": 0.03590243309736252,
"learning_rate": 0.00012675283385292212,
"loss": 0.0222,
"step": 123
},
{
"epoch": 1.441860465116279,
"grad_norm": 0.036192964762449265,
"learning_rate": 0.00012544564441548182,
"loss": 0.0251,
"step": 124
},
{
"epoch": 1.4534883720930232,
"grad_norm": 0.03172110393643379,
"learning_rate": 0.00012413378912997058,
"loss": 0.0202,
"step": 125
},
{
"epoch": 1.4651162790697674,
"grad_norm": 0.032995227724313736,
"learning_rate": 0.0001228175085451186,
"loss": 0.0219,
"step": 126
},
{
"epoch": 1.4767441860465116,
"grad_norm": 0.02672835998237133,
"learning_rate": 0.00012149704402110243,
"loss": 0.0185,
"step": 127
},
{
"epoch": 1.4883720930232558,
"grad_norm": 0.03171510249376297,
"learning_rate": 0.00012017263768528775,
"loss": 0.0196,
"step": 128
},
{
"epoch": 1.5,
"grad_norm": 0.03766058757901192,
"learning_rate": 0.00011884453238783185,
"loss": 0.0223,
"step": 129
},
{
"epoch": 1.5116279069767442,
"grad_norm": 0.038156960159540176,
"learning_rate": 0.00011751297165715309,
"loss": 0.0245,
"step": 130
},
{
"epoch": 1.5116279069767442,
"eval_loss": 0.025349650532007217,
"eval_runtime": 68.7837,
"eval_samples_per_second": 71.063,
"eval_steps_per_second": 1.119,
"step": 130
},
{
"epoch": 1.5232558139534884,
"grad_norm": 0.03054482489824295,
"learning_rate": 0.0001161781996552765,
"loss": 0.0217,
"step": 131
},
{
"epoch": 1.5348837209302326,
"grad_norm": 0.026866618543863297,
"learning_rate": 0.00011484046113306262,
"loss": 0.0196,
"step": 132
},
{
"epoch": 1.5465116279069768,
"grad_norm": 0.035294584929943085,
"learning_rate": 0.00011350000138532902,
"loss": 0.0237,
"step": 133
},
{
"epoch": 1.558139534883721,
"grad_norm": 0.02969173528254032,
"learning_rate": 0.00011215706620587149,
"loss": 0.0203,
"step": 134
},
{
"epoch": 1.5697674418604652,
"grad_norm": 0.031717926263809204,
"learning_rate": 0.00011081190184239419,
"loss": 0.0192,
"step": 135
},
{
"epoch": 1.5813953488372094,
"grad_norm": 0.03667771443724632,
"learning_rate": 0.0001094647549513561,
"loss": 0.0268,
"step": 136
},
{
"epoch": 1.5930232558139537,
"grad_norm": 0.0326247438788414,
"learning_rate": 0.00010811587255274313,
"loss": 0.0213,
"step": 137
},
{
"epoch": 1.6046511627906976,
"grad_norm": 0.03693091496825218,
"learning_rate": 0.00010676550198477293,
"loss": 0.0203,
"step": 138
},
{
"epoch": 1.6162790697674418,
"grad_norm": 0.037649210542440414,
"learning_rate": 0.00010541389085854176,
"loss": 0.0255,
"step": 139
},
{
"epoch": 1.627906976744186,
"grad_norm": 0.03507932275533676,
"learning_rate": 0.00010406128701262128,
"loss": 0.0217,
"step": 140
},
{
"epoch": 1.627906976744186,
"eval_loss": 0.025237275287508965,
"eval_runtime": 69.013,
"eval_samples_per_second": 70.827,
"eval_steps_per_second": 1.116,
"step": 140
},
{
"epoch": 1.6395348837209303,
"grad_norm": 0.02867533639073372,
"learning_rate": 0.00010270793846761347,
"loss": 0.022,
"step": 141
},
{
"epoch": 1.6511627906976745,
"grad_norm": 0.02936953864991665,
"learning_rate": 0.00010135409338067219,
"loss": 0.0208,
"step": 142
},
{
"epoch": 1.6627906976744184,
"grad_norm": 0.02879083901643753,
"learning_rate": 0.0001,
"loss": 0.0226,
"step": 143
},
{
"epoch": 1.6744186046511627,
"grad_norm": 0.029482927173376083,
"learning_rate": 9.864590661932783e-05,
"loss": 0.0216,
"step": 144
},
{
"epoch": 1.6860465116279069,
"grad_norm": 0.033599238842725754,
"learning_rate": 9.729206153238657e-05,
"loss": 0.0255,
"step": 145
},
{
"epoch": 1.697674418604651,
"grad_norm": 0.03904499486088753,
"learning_rate": 9.59387129873787e-05,
"loss": 0.0273,
"step": 146
},
{
"epoch": 1.7093023255813953,
"grad_norm": 0.028350962325930595,
"learning_rate": 9.458610914145826e-05,
"loss": 0.0206,
"step": 147
},
{
"epoch": 1.7209302325581395,
"grad_norm": 0.030183596536517143,
"learning_rate": 9.323449801522709e-05,
"loss": 0.0194,
"step": 148
},
{
"epoch": 1.7325581395348837,
"grad_norm": 0.030208786949515343,
"learning_rate": 9.18841274472569e-05,
"loss": 0.0184,
"step": 149
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.033600978553295135,
"learning_rate": 9.05352450486439e-05,
"loss": 0.0209,
"step": 150
},
{
"epoch": 1.744186046511628,
"eval_loss": 0.024917516857385635,
"eval_runtime": 68.8307,
"eval_samples_per_second": 71.015,
"eval_steps_per_second": 1.119,
"step": 150
},
{
"epoch": 1.755813953488372,
"grad_norm": 0.030587706714868546,
"learning_rate": 8.918809815760585e-05,
"loss": 0.0193,
"step": 151
},
{
"epoch": 1.7674418604651163,
"grad_norm": 0.03076143190264702,
"learning_rate": 8.78429337941285e-05,
"loss": 0.0234,
"step": 152
},
{
"epoch": 1.7790697674418605,
"grad_norm": 0.031419869512319565,
"learning_rate": 8.649999861467099e-05,
"loss": 0.0213,
"step": 153
},
{
"epoch": 1.7906976744186047,
"grad_norm": 0.03213745728135109,
"learning_rate": 8.515953886693739e-05,
"loss": 0.02,
"step": 154
},
{
"epoch": 1.802325581395349,
"grad_norm": 0.035864025354385376,
"learning_rate": 8.382180034472353e-05,
"loss": 0.0199,
"step": 155
},
{
"epoch": 1.8139534883720931,
"grad_norm": 0.029758954420685768,
"learning_rate": 8.248702834284693e-05,
"loss": 0.0227,
"step": 156
},
{
"epoch": 1.8255813953488373,
"grad_norm": 0.02980395406484604,
"learning_rate": 8.115546761216822e-05,
"loss": 0.0168,
"step": 157
},
{
"epoch": 1.8372093023255816,
"grad_norm": 0.04690724238753319,
"learning_rate": 7.982736231471224e-05,
"loss": 0.022,
"step": 158
},
{
"epoch": 1.8488372093023255,
"grad_norm": 0.035520877689123154,
"learning_rate": 7.85029559788976e-05,
"loss": 0.0221,
"step": 159
},
{
"epoch": 1.8604651162790697,
"grad_norm": 0.032926399260759354,
"learning_rate": 7.718249145488142e-05,
"loss": 0.0227,
"step": 160
},
{
"epoch": 1.8604651162790697,
"eval_loss": 0.024740872904658318,
"eval_runtime": 69.0656,
"eval_samples_per_second": 70.773,
"eval_steps_per_second": 1.115,
"step": 160
},
{
"epoch": 1.872093023255814,
"grad_norm": 0.030970241874456406,
"learning_rate": 7.586621087002945e-05,
"loss": 0.0192,
"step": 161
},
{
"epoch": 1.8837209302325582,
"grad_norm": 0.03738875314593315,
"learning_rate": 7.455435558451823e-05,
"loss": 0.0213,
"step": 162
},
{
"epoch": 1.8953488372093024,
"grad_norm": 0.043416742235422134,
"learning_rate": 7.324716614707793e-05,
"loss": 0.0212,
"step": 163
},
{
"epoch": 1.9069767441860463,
"grad_norm": 0.029200483113527298,
"learning_rate": 7.194488225088417e-05,
"loss": 0.0172,
"step": 164
},
{
"epoch": 1.9186046511627906,
"grad_norm": 0.03626865893602371,
"learning_rate": 7.064774268960653e-05,
"loss": 0.0218,
"step": 165
},
{
"epoch": 1.9302325581395348,
"grad_norm": 0.03200054168701172,
"learning_rate": 6.93559853136221e-05,
"loss": 0.02,
"step": 166
},
{
"epoch": 1.941860465116279,
"grad_norm": 0.04698159173130989,
"learning_rate": 6.806984698640202e-05,
"loss": 0.0245,
"step": 167
},
{
"epoch": 1.9534883720930232,
"grad_norm": 0.03742319345474243,
"learning_rate": 6.678956354107882e-05,
"loss": 0.025,
"step": 168
},
{
"epoch": 1.9651162790697674,
"grad_norm": 0.033966902643442154,
"learning_rate": 6.551536973720298e-05,
"loss": 0.0174,
"step": 169
},
{
"epoch": 1.9767441860465116,
"grad_norm": 0.03295022249221802,
"learning_rate": 6.4247499217696e-05,
"loss": 0.0195,
"step": 170
},
{
"epoch": 1.9767441860465116,
"eval_loss": 0.02444678172469139,
"eval_runtime": 69.9558,
"eval_samples_per_second": 69.873,
"eval_steps_per_second": 1.101,
"step": 170
},
{
"epoch": 1.9883720930232558,
"grad_norm": 0.031102096661925316,
"learning_rate": 6.298618446600856e-05,
"loss": 0.02,
"step": 171
},
{
"epoch": 2.0,
"grad_norm": 0.03594454750418663,
"learning_rate": 6.173165676349103e-05,
"loss": 0.0211,
"step": 172
},
{
"epoch": 2.011627906976744,
"grad_norm": 0.02976617030799389,
"learning_rate": 6.048414614698448e-05,
"loss": 0.0205,
"step": 173
},
{
"epoch": 2.0232558139534884,
"grad_norm": 0.03257077932357788,
"learning_rate": 5.924388136663992e-05,
"loss": 0.0187,
"step": 174
},
{
"epoch": 2.0348837209302326,
"grad_norm": 0.027616139501333237,
"learning_rate": 5.801108984397354e-05,
"loss": 0.0153,
"step": 175
},
{
"epoch": 2.046511627906977,
"grad_norm": 0.029210377484560013,
"learning_rate": 5.6785997630165435e-05,
"loss": 0.0192,
"step": 176
},
{
"epoch": 2.058139534883721,
"grad_norm": 0.029252031818032265,
"learning_rate": 5.5568829364609664e-05,
"loss": 0.0171,
"step": 177
},
{
"epoch": 2.0697674418604652,
"grad_norm": 0.029388127848505974,
"learning_rate": 5.435980823372311e-05,
"loss": 0.0184,
"step": 178
},
{
"epoch": 2.0813953488372094,
"grad_norm": 0.028690453618764877,
"learning_rate": 5.3159155930021e-05,
"loss": 0.0191,
"step": 179
},
{
"epoch": 2.0930232558139537,
"grad_norm": 0.027052663266658783,
"learning_rate": 5.196709261146606e-05,
"loss": 0.0174,
"step": 180
},
{
"epoch": 2.0930232558139537,
"eval_loss": 0.024556750431656837,
"eval_runtime": 68.786,
"eval_samples_per_second": 71.061,
"eval_steps_per_second": 1.119,
"step": 180
},
{
"epoch": 2.104651162790698,
"grad_norm": 0.027682358399033546,
"learning_rate": 5.078383686109926e-05,
"loss": 0.0192,
"step": 181
},
{
"epoch": 2.116279069767442,
"grad_norm": 0.030152970924973488,
"learning_rate": 4.9609605646959226e-05,
"loss": 0.0182,
"step": 182
},
{
"epoch": 2.1279069767441863,
"grad_norm": 0.030966833233833313,
"learning_rate": 4.844461428229782e-05,
"loss": 0.0168,
"step": 183
},
{
"epoch": 2.13953488372093,
"grad_norm": 0.02938106097280979,
"learning_rate": 4.728907638609925e-05,
"loss": 0.0209,
"step": 184
},
{
"epoch": 2.1511627906976742,
"grad_norm": 0.03003690205514431,
"learning_rate": 4.614320384390959e-05,
"loss": 0.0171,
"step": 185
},
{
"epoch": 2.1627906976744184,
"grad_norm": 0.03510993719100952,
"learning_rate": 4.500720676898452e-05,
"loss": 0.0196,
"step": 186
},
{
"epoch": 2.1744186046511627,
"grad_norm": 0.028933702036738396,
"learning_rate": 4.388129346376178e-05,
"loss": 0.0154,
"step": 187
},
{
"epoch": 2.186046511627907,
"grad_norm": 0.040435321629047394,
"learning_rate": 4.276567038166563e-05,
"loss": 0.0214,
"step": 188
},
{
"epoch": 2.197674418604651,
"grad_norm": 0.03420122340321541,
"learning_rate": 4.16605420892506e-05,
"loss": 0.0162,
"step": 189
},
{
"epoch": 2.2093023255813953,
"grad_norm": 0.033222515136003494,
"learning_rate": 4.0566111228691064e-05,
"loss": 0.018,
"step": 190
},
{
"epoch": 2.2093023255813953,
"eval_loss": 0.02473200112581253,
"eval_runtime": 68.7641,
"eval_samples_per_second": 71.084,
"eval_steps_per_second": 1.12,
"step": 190
},
{
"epoch": 2.2209302325581395,
"grad_norm": 0.03317669406533241,
"learning_rate": 3.948257848062351e-05,
"loss": 0.0169,
"step": 191
},
{
"epoch": 2.2325581395348837,
"grad_norm": 0.03263445943593979,
"learning_rate": 3.841014252734896e-05,
"loss": 0.0179,
"step": 192
},
{
"epoch": 2.244186046511628,
"grad_norm": 0.030220864340662956,
"learning_rate": 3.734900001640135e-05,
"loss": 0.0185,
"step": 193
},
{
"epoch": 2.255813953488372,
"grad_norm": 0.033804602921009064,
"learning_rate": 3.629934552448925e-05,
"loss": 0.0192,
"step": 194
},
{
"epoch": 2.2674418604651163,
"grad_norm": 0.03279354050755501,
"learning_rate": 3.5261371521817244e-05,
"loss": 0.0211,
"step": 195
},
{
"epoch": 2.2790697674418605,
"grad_norm": 0.03980812057852745,
"learning_rate": 3.423526833679355e-05,
"loss": 0.0187,
"step": 196
},
{
"epoch": 2.2906976744186047,
"grad_norm": 0.03647474944591522,
"learning_rate": 3.322122412113047e-05,
"loss": 0.0187,
"step": 197
},
{
"epoch": 2.302325581395349,
"grad_norm": 0.0309713426977396,
"learning_rate": 3.2219424815343735e-05,
"loss": 0.0175,
"step": 198
},
{
"epoch": 2.313953488372093,
"grad_norm": 0.030178574845194817,
"learning_rate": 3.123005411465766e-05,
"loss": 0.0174,
"step": 199
},
{
"epoch": 2.3255813953488373,
"grad_norm": 0.03233598917722702,
"learning_rate": 3.0253293435321793e-05,
"loss": 0.0176,
"step": 200
},
{
"epoch": 2.3255813953488373,
"eval_loss": 0.024754056707024574,
"eval_runtime": 68.7907,
"eval_samples_per_second": 71.056,
"eval_steps_per_second": 1.119,
"step": 200
}
],
"logging_steps": 1.0,
"max_steps": 258,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.5918719337187246e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}