{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9990726429675425,
"eval_steps": 500,
"global_step": 202,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004945904173106646,
"grad_norm": 54.500160217285156,
"learning_rate": 0.0,
"loss": 9.5291,
"step": 1
},
{
"epoch": 0.009891808346213293,
"grad_norm": 53.72835922241211,
"learning_rate": 4e-05,
"loss": 9.4814,
"step": 2
},
{
"epoch": 0.014837712519319939,
"grad_norm": 17.406553268432617,
"learning_rate": 8e-05,
"loss": 9.1288,
"step": 3
},
{
"epoch": 0.019783616692426585,
"grad_norm": 3.4949991703033447,
"learning_rate": 0.00012,
"loss": 8.8408,
"step": 4
},
{
"epoch": 0.02472952086553323,
"grad_norm": 2.9090073108673096,
"learning_rate": 0.00016,
"loss": 8.7705,
"step": 5
},
{
"epoch": 0.029675425038639878,
"grad_norm": 3.398167371749878,
"learning_rate": 0.0002,
"loss": 8.6466,
"step": 6
},
{
"epoch": 0.03462132921174652,
"grad_norm": 1.6190311908721924,
"learning_rate": 0.00019898477157360406,
"loss": 8.5125,
"step": 7
},
{
"epoch": 0.03956723338485317,
"grad_norm": 1.8773953914642334,
"learning_rate": 0.00019796954314720813,
"loss": 8.5322,
"step": 8
},
{
"epoch": 0.04451313755795981,
"grad_norm": 1.283807396888733,
"learning_rate": 0.00019695431472081218,
"loss": 8.4917,
"step": 9
},
{
"epoch": 0.04945904173106646,
"grad_norm": 1.9215106964111328,
"learning_rate": 0.00019593908629441626,
"loss": 8.3638,
"step": 10
},
{
"epoch": 0.05440494590417311,
"grad_norm": 1.5560728311538696,
"learning_rate": 0.00019492385786802033,
"loss": 8.3021,
"step": 11
},
{
"epoch": 0.059350850077279756,
"grad_norm": 1.4610416889190674,
"learning_rate": 0.00019390862944162438,
"loss": 8.3058,
"step": 12
},
{
"epoch": 0.0642967542503864,
"grad_norm": 1.4304499626159668,
"learning_rate": 0.00019289340101522843,
"loss": 8.2576,
"step": 13
},
{
"epoch": 0.06924265842349304,
"grad_norm": 1.2287720441818237,
"learning_rate": 0.0001918781725888325,
"loss": 8.0443,
"step": 14
},
{
"epoch": 0.07418856259659969,
"grad_norm": 1.3729023933410645,
"learning_rate": 0.00019086294416243655,
"loss": 8.1255,
"step": 15
},
{
"epoch": 0.07913446676970634,
"grad_norm": 1.2619420289993286,
"learning_rate": 0.0001898477157360406,
"loss": 8.032,
"step": 16
},
{
"epoch": 0.08408037094281298,
"grad_norm": 1.4744280576705933,
"learning_rate": 0.0001888324873096447,
"loss": 7.8637,
"step": 17
},
{
"epoch": 0.08902627511591962,
"grad_norm": 1.6214470863342285,
"learning_rate": 0.00018781725888324875,
"loss": 7.9172,
"step": 18
},
{
"epoch": 0.09397217928902628,
"grad_norm": 1.283504605293274,
"learning_rate": 0.0001868020304568528,
"loss": 7.8251,
"step": 19
},
{
"epoch": 0.09891808346213292,
"grad_norm": 1.0794684886932373,
"learning_rate": 0.00018578680203045687,
"loss": 7.7431,
"step": 20
},
{
"epoch": 0.10386398763523957,
"grad_norm": 1.1826306581497192,
"learning_rate": 0.00018477157360406092,
"loss": 7.6118,
"step": 21
},
{
"epoch": 0.10880989180834622,
"grad_norm": 1.5493848323822021,
"learning_rate": 0.00018375634517766497,
"loss": 7.5928,
"step": 22
},
{
"epoch": 0.11375579598145286,
"grad_norm": 2.191657304763794,
"learning_rate": 0.00018274111675126904,
"loss": 7.596,
"step": 23
},
{
"epoch": 0.11870170015455951,
"grad_norm": 1.2168949842453003,
"learning_rate": 0.0001817258883248731,
"loss": 7.5224,
"step": 24
},
{
"epoch": 0.12364760432766615,
"grad_norm": 1.1562331914901733,
"learning_rate": 0.00018071065989847717,
"loss": 7.4952,
"step": 25
},
{
"epoch": 0.1285935085007728,
"grad_norm": 1.9624497890472412,
"learning_rate": 0.00017969543147208124,
"loss": 7.459,
"step": 26
},
{
"epoch": 0.13353941267387945,
"grad_norm": 2.2458877563476562,
"learning_rate": 0.0001786802030456853,
"loss": 7.3465,
"step": 27
},
{
"epoch": 0.1384853168469861,
"grad_norm": 1.3750243186950684,
"learning_rate": 0.00017766497461928934,
"loss": 7.3891,
"step": 28
},
{
"epoch": 0.14343122102009273,
"grad_norm": 1.2398021221160889,
"learning_rate": 0.0001766497461928934,
"loss": 7.3127,
"step": 29
},
{
"epoch": 0.14837712519319937,
"grad_norm": 2.071115732192993,
"learning_rate": 0.00017563451776649746,
"loss": 7.2548,
"step": 30
},
{
"epoch": 0.15332302936630604,
"grad_norm": 2.288498640060425,
"learning_rate": 0.0001746192893401015,
"loss": 7.1908,
"step": 31
},
{
"epoch": 0.15826893353941268,
"grad_norm": 1.2050567865371704,
"learning_rate": 0.0001736040609137056,
"loss": 7.1467,
"step": 32
},
{
"epoch": 0.16321483771251932,
"grad_norm": 1.4064340591430664,
"learning_rate": 0.00017258883248730966,
"loss": 7.035,
"step": 33
},
{
"epoch": 0.16816074188562596,
"grad_norm": 1.2630614042282104,
"learning_rate": 0.0001715736040609137,
"loss": 7.0536,
"step": 34
},
{
"epoch": 0.1731066460587326,
"grad_norm": 1.8433802127838135,
"learning_rate": 0.00017055837563451778,
"loss": 7.0115,
"step": 35
},
{
"epoch": 0.17805255023183925,
"grad_norm": 1.744345784187317,
"learning_rate": 0.00016954314720812183,
"loss": 7.038,
"step": 36
},
{
"epoch": 0.18299845440494591,
"grad_norm": 1.679824709892273,
"learning_rate": 0.00016852791878172588,
"loss": 6.8946,
"step": 37
},
{
"epoch": 0.18794435857805256,
"grad_norm": 1.4559205770492554,
"learning_rate": 0.00016751269035532995,
"loss": 6.9053,
"step": 38
},
{
"epoch": 0.1928902627511592,
"grad_norm": 1.7544541358947754,
"learning_rate": 0.00016649746192893403,
"loss": 6.9277,
"step": 39
},
{
"epoch": 0.19783616692426584,
"grad_norm": 1.594734787940979,
"learning_rate": 0.00016548223350253808,
"loss": 6.912,
"step": 40
},
{
"epoch": 0.20278207109737248,
"grad_norm": 1.3439960479736328,
"learning_rate": 0.00016446700507614215,
"loss": 6.8592,
"step": 41
},
{
"epoch": 0.20772797527047915,
"grad_norm": 1.4330651760101318,
"learning_rate": 0.0001634517766497462,
"loss": 6.8965,
"step": 42
},
{
"epoch": 0.2126738794435858,
"grad_norm": 2.439265489578247,
"learning_rate": 0.00016243654822335025,
"loss": 6.8126,
"step": 43
},
{
"epoch": 0.21761978361669243,
"grad_norm": 1.2343510389328003,
"learning_rate": 0.00016142131979695432,
"loss": 6.8057,
"step": 44
},
{
"epoch": 0.22256568778979907,
"grad_norm": 1.15224027633667,
"learning_rate": 0.00016040609137055837,
"loss": 6.6727,
"step": 45
},
{
"epoch": 0.2275115919629057,
"grad_norm": 1.6769089698791504,
"learning_rate": 0.00015939086294416242,
"loss": 6.7457,
"step": 46
},
{
"epoch": 0.23245749613601235,
"grad_norm": 2.4642043113708496,
"learning_rate": 0.00015837563451776652,
"loss": 6.742,
"step": 47
},
{
"epoch": 0.23740340030911902,
"grad_norm": 1.1713383197784424,
"learning_rate": 0.00015736040609137057,
"loss": 6.7022,
"step": 48
},
{
"epoch": 0.24234930448222566,
"grad_norm": 1.5891178846359253,
"learning_rate": 0.00015634517766497462,
"loss": 6.6446,
"step": 49
},
{
"epoch": 0.2472952086553323,
"grad_norm": 2.0845682621002197,
"learning_rate": 0.0001553299492385787,
"loss": 6.5948,
"step": 50
},
{
"epoch": 0.252241112828439,
"grad_norm": 1.4469300508499146,
"learning_rate": 0.00015431472081218274,
"loss": 6.5604,
"step": 51
},
{
"epoch": 0.2571870170015456,
"grad_norm": 1.0141685009002686,
"learning_rate": 0.0001532994923857868,
"loss": 6.5418,
"step": 52
},
{
"epoch": 0.26213292117465226,
"grad_norm": 2.21588134765625,
"learning_rate": 0.00015228426395939087,
"loss": 6.4273,
"step": 53
},
{
"epoch": 0.2670788253477589,
"grad_norm": 1.4307092428207397,
"learning_rate": 0.00015126903553299494,
"loss": 6.4938,
"step": 54
},
{
"epoch": 0.27202472952086554,
"grad_norm": 1.4310742616653442,
"learning_rate": 0.000150253807106599,
"loss": 6.4357,
"step": 55
},
{
"epoch": 0.2769706336939722,
"grad_norm": 1.1520801782608032,
"learning_rate": 0.00014923857868020306,
"loss": 6.5101,
"step": 56
},
{
"epoch": 0.2819165378670788,
"grad_norm": 1.0513254404067993,
"learning_rate": 0.0001482233502538071,
"loss": 6.4536,
"step": 57
},
{
"epoch": 0.28686244204018546,
"grad_norm": 1.5814175605773926,
"learning_rate": 0.00014720812182741116,
"loss": 6.4139,
"step": 58
},
{
"epoch": 0.2918083462132921,
"grad_norm": 1.5383965969085693,
"learning_rate": 0.00014619289340101523,
"loss": 6.3318,
"step": 59
},
{
"epoch": 0.29675425038639874,
"grad_norm": 1.0093541145324707,
"learning_rate": 0.00014517766497461928,
"loss": 6.4279,
"step": 60
},
{
"epoch": 0.3017001545595054,
"grad_norm": 1.4959982633590698,
"learning_rate": 0.00014416243654822336,
"loss": 6.3061,
"step": 61
},
{
"epoch": 0.3066460587326121,
"grad_norm": 1.649026870727539,
"learning_rate": 0.00014314720812182743,
"loss": 6.274,
"step": 62
},
{
"epoch": 0.3115919629057187,
"grad_norm": 0.9700078964233398,
"learning_rate": 0.00014213197969543148,
"loss": 6.4123,
"step": 63
},
{
"epoch": 0.31653786707882536,
"grad_norm": 1.0136897563934326,
"learning_rate": 0.00014111675126903553,
"loss": 6.3055,
"step": 64
},
{
"epoch": 0.321483771251932,
"grad_norm": 1.6081498861312866,
"learning_rate": 0.0001401015228426396,
"loss": 6.3642,
"step": 65
},
{
"epoch": 0.32642967542503865,
"grad_norm": 1.1522279977798462,
"learning_rate": 0.00013908629441624365,
"loss": 6.2726,
"step": 66
},
{
"epoch": 0.3313755795981453,
"grad_norm": 0.8351190686225891,
"learning_rate": 0.00013807106598984773,
"loss": 6.2645,
"step": 67
},
{
"epoch": 0.33632148377125193,
"grad_norm": 1.1132313013076782,
"learning_rate": 0.00013705583756345178,
"loss": 6.2681,
"step": 68
},
{
"epoch": 0.34126738794435857,
"grad_norm": 1.2936571836471558,
"learning_rate": 0.00013604060913705585,
"loss": 6.2473,
"step": 69
},
{
"epoch": 0.3462132921174652,
"grad_norm": 1.250172734260559,
"learning_rate": 0.0001350253807106599,
"loss": 6.2264,
"step": 70
},
{
"epoch": 0.35115919629057185,
"grad_norm": 1.0878709554672241,
"learning_rate": 0.00013401015228426397,
"loss": 6.1898,
"step": 71
},
{
"epoch": 0.3561051004636785,
"grad_norm": 0.9934064149856567,
"learning_rate": 0.00013299492385786802,
"loss": 6.2047,
"step": 72
},
{
"epoch": 0.3610510046367852,
"grad_norm": 0.8686928749084473,
"learning_rate": 0.00013197969543147207,
"loss": 6.1214,
"step": 73
},
{
"epoch": 0.36599690880989183,
"grad_norm": 0.858200192451477,
"learning_rate": 0.00013096446700507615,
"loss": 6.0784,
"step": 74
},
{
"epoch": 0.37094281298299847,
"grad_norm": 0.8108780980110168,
"learning_rate": 0.0001299492385786802,
"loss": 6.1899,
"step": 75
},
{
"epoch": 0.3758887171561051,
"grad_norm": 0.8366422653198242,
"learning_rate": 0.00012893401015228427,
"loss": 6.131,
"step": 76
},
{
"epoch": 0.38083462132921175,
"grad_norm": 1.2487200498580933,
"learning_rate": 0.00012791878172588834,
"loss": 6.1158,
"step": 77
},
{
"epoch": 0.3857805255023184,
"grad_norm": 1.0677459239959717,
"learning_rate": 0.0001269035532994924,
"loss": 6.0873,
"step": 78
},
{
"epoch": 0.39072642967542504,
"grad_norm": 0.9405259490013123,
"learning_rate": 0.00012588832487309644,
"loss": 6.0409,
"step": 79
},
{
"epoch": 0.3956723338485317,
"grad_norm": 1.488607406616211,
"learning_rate": 0.00012487309644670052,
"loss": 5.9868,
"step": 80
},
{
"epoch": 0.4006182380216383,
"grad_norm": 0.9067093729972839,
"learning_rate": 0.00012385786802030456,
"loss": 6.0035,
"step": 81
},
{
"epoch": 0.40556414219474496,
"grad_norm": 1.1395992040634155,
"learning_rate": 0.00012284263959390864,
"loss": 5.9638,
"step": 82
},
{
"epoch": 0.4105100463678516,
"grad_norm": 1.4701273441314697,
"learning_rate": 0.0001218274111675127,
"loss": 6.0212,
"step": 83
},
{
"epoch": 0.4154559505409583,
"grad_norm": 0.8167937397956848,
"learning_rate": 0.00012081218274111676,
"loss": 6.0759,
"step": 84
},
{
"epoch": 0.42040185471406494,
"grad_norm": 1.398577332496643,
"learning_rate": 0.00011979695431472082,
"loss": 5.9284,
"step": 85
},
{
"epoch": 0.4253477588871716,
"grad_norm": 1.0022815465927124,
"learning_rate": 0.00011878172588832489,
"loss": 5.9638,
"step": 86
},
{
"epoch": 0.4302936630602782,
"grad_norm": 1.1316360235214233,
"learning_rate": 0.00011776649746192893,
"loss": 5.8901,
"step": 87
},
{
"epoch": 0.43523956723338486,
"grad_norm": 1.1034351587295532,
"learning_rate": 0.000116751269035533,
"loss": 5.9288,
"step": 88
},
{
"epoch": 0.4401854714064915,
"grad_norm": 0.9991883039474487,
"learning_rate": 0.00011573604060913706,
"loss": 5.9447,
"step": 89
},
{
"epoch": 0.44513137557959814,
"grad_norm": 1.4334654808044434,
"learning_rate": 0.00011472081218274113,
"loss": 5.8657,
"step": 90
},
{
"epoch": 0.4500772797527048,
"grad_norm": 1.0602012872695923,
"learning_rate": 0.0001137055837563452,
"loss": 5.8563,
"step": 91
},
{
"epoch": 0.4550231839258114,
"grad_norm": 0.9210672378540039,
"learning_rate": 0.00011269035532994925,
"loss": 5.8811,
"step": 92
},
{
"epoch": 0.45996908809891807,
"grad_norm": 0.9101308584213257,
"learning_rate": 0.0001116751269035533,
"loss": 5.9572,
"step": 93
},
{
"epoch": 0.4649149922720247,
"grad_norm": 0.8447904586791992,
"learning_rate": 0.00011065989847715736,
"loss": 5.8762,
"step": 94
},
{
"epoch": 0.46986089644513135,
"grad_norm": 0.7616278529167175,
"learning_rate": 0.00010964467005076143,
"loss": 5.9493,
"step": 95
},
{
"epoch": 0.47480680061823805,
"grad_norm": 1.0465595722198486,
"learning_rate": 0.00010862944162436547,
"loss": 5.8367,
"step": 96
},
{
"epoch": 0.4797527047913447,
"grad_norm": 1.4627708196640015,
"learning_rate": 0.00010761421319796954,
"loss": 5.8301,
"step": 97
},
{
"epoch": 0.4846986089644513,
"grad_norm": 1.0495349168777466,
"learning_rate": 0.00010659898477157362,
"loss": 5.8782,
"step": 98
},
{
"epoch": 0.48964451313755797,
"grad_norm": 0.9480841755867004,
"learning_rate": 0.00010558375634517767,
"loss": 5.7681,
"step": 99
},
{
"epoch": 0.4945904173106646,
"grad_norm": 0.8606300354003906,
"learning_rate": 0.00010456852791878173,
"loss": 5.7448,
"step": 100
},
{
"epoch": 0.49953632148377125,
"grad_norm": 0.9947773218154907,
"learning_rate": 0.0001035532994923858,
"loss": 5.8485,
"step": 101
},
{
"epoch": 0.504482225656878,
"grad_norm": 1.0647828578948975,
"learning_rate": 0.00010253807106598984,
"loss": 5.7214,
"step": 102
},
{
"epoch": 0.5094281298299845,
"grad_norm": 1.1592961549758911,
"learning_rate": 0.0001015228426395939,
"loss": 5.7393,
"step": 103
},
{
"epoch": 0.5143740340030912,
"grad_norm": 0.8949771523475647,
"learning_rate": 0.00010050761421319797,
"loss": 5.7635,
"step": 104
},
{
"epoch": 0.5193199381761978,
"grad_norm": 0.8713933229446411,
"learning_rate": 9.949238578680203e-05,
"loss": 5.7227,
"step": 105
},
{
"epoch": 0.5242658423493045,
"grad_norm": 0.8814818859100342,
"learning_rate": 9.847715736040609e-05,
"loss": 5.7516,
"step": 106
},
{
"epoch": 0.5292117465224111,
"grad_norm": 0.9553707838058472,
"learning_rate": 9.746192893401017e-05,
"loss": 5.7522,
"step": 107
},
{
"epoch": 0.5341576506955178,
"grad_norm": 0.8567320704460144,
"learning_rate": 9.644670050761421e-05,
"loss": 5.6508,
"step": 108
},
{
"epoch": 0.5391035548686244,
"grad_norm": 1.0081580877304077,
"learning_rate": 9.543147208121828e-05,
"loss": 5.642,
"step": 109
},
{
"epoch": 0.5440494590417311,
"grad_norm": 1.1526085138320923,
"learning_rate": 9.441624365482235e-05,
"loss": 5.7423,
"step": 110
},
{
"epoch": 0.5489953632148377,
"grad_norm": 1.2273470163345337,
"learning_rate": 9.34010152284264e-05,
"loss": 5.7094,
"step": 111
},
{
"epoch": 0.5539412673879444,
"grad_norm": 0.830719530582428,
"learning_rate": 9.238578680203046e-05,
"loss": 5.7365,
"step": 112
},
{
"epoch": 0.558887171561051,
"grad_norm": 1.1520576477050781,
"learning_rate": 9.137055837563452e-05,
"loss": 5.7391,
"step": 113
},
{
"epoch": 0.5638330757341576,
"grad_norm": 1.1414787769317627,
"learning_rate": 9.035532994923858e-05,
"loss": 5.7288,
"step": 114
},
{
"epoch": 0.5687789799072643,
"grad_norm": 0.9615758061408997,
"learning_rate": 8.934010152284265e-05,
"loss": 5.5568,
"step": 115
},
{
"epoch": 0.5737248840803709,
"grad_norm": 0.8781617879867554,
"learning_rate": 8.83248730964467e-05,
"loss": 5.6264,
"step": 116
},
{
"epoch": 0.5786707882534776,
"grad_norm": 1.1544886827468872,
"learning_rate": 8.730964467005075e-05,
"loss": 5.6724,
"step": 117
},
{
"epoch": 0.5836166924265842,
"grad_norm": 0.931874692440033,
"learning_rate": 8.629441624365483e-05,
"loss": 5.6046,
"step": 118
},
{
"epoch": 0.5885625965996909,
"grad_norm": 0.7856680750846863,
"learning_rate": 8.527918781725889e-05,
"loss": 5.6521,
"step": 119
},
{
"epoch": 0.5935085007727975,
"grad_norm": 1.162001609802246,
"learning_rate": 8.426395939086294e-05,
"loss": 5.5843,
"step": 120
},
{
"epoch": 0.5984544049459042,
"grad_norm": 0.8572034239768982,
"learning_rate": 8.324873096446701e-05,
"loss": 5.6526,
"step": 121
},
{
"epoch": 0.6034003091190108,
"grad_norm": 0.9555945992469788,
"learning_rate": 8.223350253807108e-05,
"loss": 5.6673,
"step": 122
},
{
"epoch": 0.6083462132921175,
"grad_norm": 0.880160927772522,
"learning_rate": 8.121827411167512e-05,
"loss": 5.498,
"step": 123
},
{
"epoch": 0.6132921174652242,
"grad_norm": 1.1022496223449707,
"learning_rate": 8.020304568527919e-05,
"loss": 5.5833,
"step": 124
},
{
"epoch": 0.6182380216383307,
"grad_norm": 0.9595851898193359,
"learning_rate": 7.918781725888326e-05,
"loss": 5.6384,
"step": 125
},
{
"epoch": 0.6231839258114374,
"grad_norm": 1.4313597679138184,
"learning_rate": 7.817258883248731e-05,
"loss": 5.5478,
"step": 126
},
{
"epoch": 0.628129829984544,
"grad_norm": 0.9351322054862976,
"learning_rate": 7.715736040609137e-05,
"loss": 5.5652,
"step": 127
},
{
"epoch": 0.6330757341576507,
"grad_norm": 1.251789927482605,
"learning_rate": 7.614213197969543e-05,
"loss": 5.5387,
"step": 128
},
{
"epoch": 0.6380216383307573,
"grad_norm": 0.98284912109375,
"learning_rate": 7.51269035532995e-05,
"loss": 5.5338,
"step": 129
},
{
"epoch": 0.642967542503864,
"grad_norm": 1.0421977043151855,
"learning_rate": 7.411167512690356e-05,
"loss": 5.5774,
"step": 130
},
{
"epoch": 0.6479134466769706,
"grad_norm": 1.0751053094863892,
"learning_rate": 7.309644670050762e-05,
"loss": 5.5642,
"step": 131
},
{
"epoch": 0.6528593508500773,
"grad_norm": 1.089376449584961,
"learning_rate": 7.208121827411168e-05,
"loss": 5.505,
"step": 132
},
{
"epoch": 0.6578052550231839,
"grad_norm": 1.0731728076934814,
"learning_rate": 7.106598984771574e-05,
"loss": 5.5514,
"step": 133
},
{
"epoch": 0.6627511591962906,
"grad_norm": 1.2262444496154785,
"learning_rate": 7.00507614213198e-05,
"loss": 5.5723,
"step": 134
},
{
"epoch": 0.6676970633693973,
"grad_norm": 1.0487595796585083,
"learning_rate": 6.903553299492386e-05,
"loss": 5.5587,
"step": 135
},
{
"epoch": 0.6726429675425039,
"grad_norm": 1.084671139717102,
"learning_rate": 6.802030456852793e-05,
"loss": 5.4868,
"step": 136
},
{
"epoch": 0.6775888717156106,
"grad_norm": 1.1871248483657837,
"learning_rate": 6.700507614213199e-05,
"loss": 5.5475,
"step": 137
},
{
"epoch": 0.6825347758887171,
"grad_norm": 0.960493803024292,
"learning_rate": 6.598984771573604e-05,
"loss": 5.5006,
"step": 138
},
{
"epoch": 0.6874806800618238,
"grad_norm": 1.053593397140503,
"learning_rate": 6.49746192893401e-05,
"loss": 5.5389,
"step": 139
},
{
"epoch": 0.6924265842349304,
"grad_norm": 0.8886996507644653,
"learning_rate": 6.395939086294417e-05,
"loss": 5.4616,
"step": 140
},
{
"epoch": 0.6973724884080371,
"grad_norm": 1.1852856874465942,
"learning_rate": 6.294416243654822e-05,
"loss": 5.498,
"step": 141
},
{
"epoch": 0.7023183925811437,
"grad_norm": 0.8381466865539551,
"learning_rate": 6.192893401015228e-05,
"loss": 5.4977,
"step": 142
},
{
"epoch": 0.7072642967542504,
"grad_norm": 1.01845121383667,
"learning_rate": 6.091370558375635e-05,
"loss": 5.4162,
"step": 143
},
{
"epoch": 0.712210200927357,
"grad_norm": 0.9204426407814026,
"learning_rate": 5.989847715736041e-05,
"loss": 5.4654,
"step": 144
},
{
"epoch": 0.7171561051004637,
"grad_norm": 1.0901105403900146,
"learning_rate": 5.8883248730964467e-05,
"loss": 5.4262,
"step": 145
},
{
"epoch": 0.7221020092735704,
"grad_norm": 0.9842381477355957,
"learning_rate": 5.786802030456853e-05,
"loss": 5.4622,
"step": 146
},
{
"epoch": 0.727047913446677,
"grad_norm": 1.1234885454177856,
"learning_rate": 5.68527918781726e-05,
"loss": 5.4668,
"step": 147
},
{
"epoch": 0.7319938176197837,
"grad_norm": 1.0685431957244873,
"learning_rate": 5.583756345177665e-05,
"loss": 5.4649,
"step": 148
},
{
"epoch": 0.7369397217928902,
"grad_norm": 1.086138367652893,
"learning_rate": 5.482233502538071e-05,
"loss": 5.336,
"step": 149
},
{
"epoch": 0.7418856259659969,
"grad_norm": 1.0806076526641846,
"learning_rate": 5.380710659898477e-05,
"loss": 5.3463,
"step": 150
},
{
"epoch": 0.7468315301391035,
"grad_norm": 1.1613116264343262,
"learning_rate": 5.2791878172588836e-05,
"loss": 5.4095,
"step": 151
},
{
"epoch": 0.7517774343122102,
"grad_norm": 1.1117639541625977,
"learning_rate": 5.17766497461929e-05,
"loss": 5.4248,
"step": 152
},
{
"epoch": 0.7567233384853168,
"grad_norm": 0.9730443954467773,
"learning_rate": 5.076142131979695e-05,
"loss": 5.5573,
"step": 153
},
{
"epoch": 0.7616692426584235,
"grad_norm": 1.0216584205627441,
"learning_rate": 4.9746192893401014e-05,
"loss": 5.3337,
"step": 154
},
{
"epoch": 0.7666151468315301,
"grad_norm": 0.9828229546546936,
"learning_rate": 4.873096446700508e-05,
"loss": 5.3757,
"step": 155
},
{
"epoch": 0.7715610510046368,
"grad_norm": 1.0315641164779663,
"learning_rate": 4.771573604060914e-05,
"loss": 5.4465,
"step": 156
},
{
"epoch": 0.7765069551777435,
"grad_norm": 1.1969993114471436,
"learning_rate": 4.67005076142132e-05,
"loss": 5.4018,
"step": 157
},
{
"epoch": 0.7814528593508501,
"grad_norm": 0.7633097171783447,
"learning_rate": 4.568527918781726e-05,
"loss": 5.5137,
"step": 158
},
{
"epoch": 0.7863987635239568,
"grad_norm": 0.8312305212020874,
"learning_rate": 4.467005076142132e-05,
"loss": 5.4078,
"step": 159
},
{
"epoch": 0.7913446676970634,
"grad_norm": 0.9463878870010376,
"learning_rate": 4.365482233502538e-05,
"loss": 5.3738,
"step": 160
},
{
"epoch": 0.79629057187017,
"grad_norm": 0.8046661615371704,
"learning_rate": 4.2639593908629446e-05,
"loss": 5.455,
"step": 161
},
{
"epoch": 0.8012364760432766,
"grad_norm": 1.0929735898971558,
"learning_rate": 4.162436548223351e-05,
"loss": 5.4263,
"step": 162
},
{
"epoch": 0.8061823802163833,
"grad_norm": 1.0323022603988647,
"learning_rate": 4.060913705583756e-05,
"loss": 5.4503,
"step": 163
},
{
"epoch": 0.8111282843894899,
"grad_norm": 0.7212726473808289,
"learning_rate": 3.959390862944163e-05,
"loss": 5.3904,
"step": 164
},
{
"epoch": 0.8160741885625966,
"grad_norm": 0.8705483078956604,
"learning_rate": 3.8578680203045685e-05,
"loss": 5.2958,
"step": 165
},
{
"epoch": 0.8210200927357032,
"grad_norm": 0.9705776572227478,
"learning_rate": 3.756345177664975e-05,
"loss": 5.3806,
"step": 166
},
{
"epoch": 0.8259659969088099,
"grad_norm": 0.7694171667098999,
"learning_rate": 3.654822335025381e-05,
"loss": 5.3446,
"step": 167
},
{
"epoch": 0.8309119010819166,
"grad_norm": 1.0148179531097412,
"learning_rate": 3.553299492385787e-05,
"loss": 5.4316,
"step": 168
},
{
"epoch": 0.8358578052550232,
"grad_norm": 1.0124086141586304,
"learning_rate": 3.451776649746193e-05,
"loss": 5.2903,
"step": 169
},
{
"epoch": 0.8408037094281299,
"grad_norm": 0.8755667209625244,
"learning_rate": 3.3502538071065994e-05,
"loss": 5.2636,
"step": 170
},
{
"epoch": 0.8457496136012365,
"grad_norm": 0.992751955986023,
"learning_rate": 3.248730964467005e-05,
"loss": 5.3662,
"step": 171
},
{
"epoch": 0.8506955177743432,
"grad_norm": 0.676480770111084,
"learning_rate": 3.147208121827411e-05,
"loss": 5.3912,
"step": 172
},
{
"epoch": 0.8556414219474497,
"grad_norm": 0.8479735851287842,
"learning_rate": 3.0456852791878175e-05,
"loss": 5.5655,
"step": 173
},
{
"epoch": 0.8605873261205564,
"grad_norm": 0.8780114054679871,
"learning_rate": 2.9441624365482233e-05,
"loss": 5.4011,
"step": 174
},
{
"epoch": 0.865533230293663,
"grad_norm": 0.7192287445068359,
"learning_rate": 2.84263959390863e-05,
"loss": 5.46,
"step": 175
},
{
"epoch": 0.8704791344667697,
"grad_norm": 0.9556674957275391,
"learning_rate": 2.7411167512690357e-05,
"loss": 5.4278,
"step": 176
},
{
"epoch": 0.8754250386398763,
"grad_norm": 0.7303546667098999,
"learning_rate": 2.6395939086294418e-05,
"loss": 5.3822,
"step": 177
},
{
"epoch": 0.880370942812983,
"grad_norm": 0.7659119963645935,
"learning_rate": 2.5380710659898476e-05,
"loss": 5.3925,
"step": 178
},
{
"epoch": 0.8853168469860896,
"grad_norm": 0.8511722087860107,
"learning_rate": 2.436548223350254e-05,
"loss": 5.3318,
"step": 179
},
{
"epoch": 0.8902627511591963,
"grad_norm": 0.8240477442741394,
"learning_rate": 2.33502538071066e-05,
"loss": 5.2479,
"step": 180
},
{
"epoch": 0.895208655332303,
"grad_norm": 0.8193429112434387,
"learning_rate": 2.233502538071066e-05,
"loss": 5.4237,
"step": 181
},
{
"epoch": 0.9001545595054096,
"grad_norm": 0.8074966669082642,
"learning_rate": 2.1319796954314723e-05,
"loss": 5.5029,
"step": 182
},
{
"epoch": 0.9051004636785163,
"grad_norm": 0.6603164076805115,
"learning_rate": 2.030456852791878e-05,
"loss": 5.3007,
"step": 183
},
{
"epoch": 0.9100463678516229,
"grad_norm": 0.633477509021759,
"learning_rate": 1.9289340101522843e-05,
"loss": 5.396,
"step": 184
},
{
"epoch": 0.9149922720247295,
"grad_norm": 0.6681249141693115,
"learning_rate": 1.8274111675126904e-05,
"loss": 5.3733,
"step": 185
},
{
"epoch": 0.9199381761978361,
"grad_norm": 0.756808340549469,
"learning_rate": 1.7258883248730966e-05,
"loss": 5.3439,
"step": 186
},
{
"epoch": 0.9248840803709428,
"grad_norm": 0.64524906873703,
"learning_rate": 1.6243654822335024e-05,
"loss": 5.4027,
"step": 187
},
{
"epoch": 0.9298299845440494,
"grad_norm": 0.7147576212882996,
"learning_rate": 1.5228426395939088e-05,
"loss": 5.3111,
"step": 188
},
{
"epoch": 0.9347758887171561,
"grad_norm": 0.6565448641777039,
"learning_rate": 1.421319796954315e-05,
"loss": 5.3649,
"step": 189
},
{
"epoch": 0.9397217928902627,
"grad_norm": 0.6476154923439026,
"learning_rate": 1.3197969543147209e-05,
"loss": 5.3617,
"step": 190
},
{
"epoch": 0.9446676970633694,
"grad_norm": 0.6315869092941284,
"learning_rate": 1.218274111675127e-05,
"loss": 5.3247,
"step": 191
},
{
"epoch": 0.9496136012364761,
"grad_norm": 0.6404466032981873,
"learning_rate": 1.116751269035533e-05,
"loss": 5.3402,
"step": 192
},
{
"epoch": 0.9545595054095827,
"grad_norm": 0.6863434314727783,
"learning_rate": 1.015228426395939e-05,
"loss": 5.3436,
"step": 193
},
{
"epoch": 0.9595054095826894,
"grad_norm": 0.6492709517478943,
"learning_rate": 9.137055837563452e-06,
"loss": 5.2449,
"step": 194
},
{
"epoch": 0.964451313755796,
"grad_norm": 0.647345781326294,
"learning_rate": 8.121827411167512e-06,
"loss": 5.3811,
"step": 195
},
{
"epoch": 0.9693972179289027,
"grad_norm": 0.711609423160553,
"learning_rate": 7.106598984771575e-06,
"loss": 5.3612,
"step": 196
},
{
"epoch": 0.9743431221020092,
"grad_norm": 0.610159158706665,
"learning_rate": 6.091370558375635e-06,
"loss": 5.3041,
"step": 197
},
{
"epoch": 0.9792890262751159,
"grad_norm": 0.61027592420578,
"learning_rate": 5.076142131979695e-06,
"loss": 5.3324,
"step": 198
},
{
"epoch": 0.9842349304482225,
"grad_norm": 0.5848086476325989,
"learning_rate": 4.060913705583756e-06,
"loss": 5.3446,
"step": 199
},
{
"epoch": 0.9891808346213292,
"grad_norm": 0.5617231130599976,
"learning_rate": 3.0456852791878177e-06,
"loss": 5.3997,
"step": 200
},
{
"epoch": 0.9941267387944358,
"grad_norm": 0.6468728184700012,
"learning_rate": 2.030456852791878e-06,
"loss": 5.3444,
"step": 201
},
{
"epoch": 0.9990726429675425,
"grad_norm": 0.629033088684082,
"learning_rate": 1.015228426395939e-06,
"loss": 5.3283,
"step": 202
}
],
"logging_steps": 1,
"max_steps": 202,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5526784012305408.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}