gemma-2-it-zno-cot / trainer_state.json

Upload folder using huggingface_hub

8de0139 verified 5 days ago

38.8 kB

	{
	"best_metric": 1.3252594470977783,
	"best_model_checkpoint": "4bit_repro_03022025/host17_seed_42_full_det_fp16_no_flash_attn_fix_pad_gemma-2-9b-it-l16-cot-wt-4ep-lr3e04-ws20-bs4-ga4-fp16-13022025/checkpoint-109",
	"epoch": 2.0,
	"eval_steps": 500,
	"global_step": 218,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.009195402298850575,
	"grad_norm": 1.025204062461853,
	"learning_rate": 1.4999999999999999e-05,
	"loss": 2.395,
	"step": 1
	},
	{
	"epoch": 0.01839080459770115,
	"grad_norm": 0.7824286222457886,
	"learning_rate": 2.9999999999999997e-05,
	"loss": 2.3972,
	"step": 2
	},
	{
	"epoch": 0.027586206896551724,
	"grad_norm": 0.9325972199440002,
	"learning_rate": 4.4999999999999996e-05,
	"loss": 2.2652,
	"step": 3
	},
	{
	"epoch": 0.0367816091954023,
	"grad_norm": 0.7933842539787292,
	"learning_rate": 5.9999999999999995e-05,
	"loss": 2.1491,
	"step": 4
	},
	{
	"epoch": 0.04597701149425287,
	"grad_norm": 0.9390926957130432,
	"learning_rate": 7.5e-05,
	"loss": 2.2175,
	"step": 5
	},
	{
	"epoch": 0.05517241379310345,
	"grad_norm": 0.8701347708702087,
	"learning_rate": 8.999999999999999e-05,
	"loss": 2.0785,
	"step": 6
	},
	{
	"epoch": 0.06436781609195402,
	"grad_norm": 0.48448503017425537,
	"learning_rate": 0.00010499999999999999,
	"loss": 2.0785,
	"step": 7
	},
	{
	"epoch": 0.0735632183908046,
	"grad_norm": 0.39611828327178955,
	"learning_rate": 0.00011999999999999999,
	"loss": 2.0303,
	"step": 8
	},
	{
	"epoch": 0.08275862068965517,
	"grad_norm": 0.5025896430015564,
	"learning_rate": 0.000135,
	"loss": 1.917,
	"step": 9
	},
	{
	"epoch": 0.09195402298850575,
	"grad_norm": 0.6268681883811951,
	"learning_rate": 0.00015,
	"loss": 1.7051,
	"step": 10
	},
	{
	"epoch": 0.10114942528735632,
	"grad_norm": 0.6085858941078186,
	"learning_rate": 0.000165,
	"loss": 1.729,
	"step": 11
	},
	{
	"epoch": 0.1103448275862069,
	"grad_norm": 0.5290607213973999,
	"learning_rate": 0.00017999999999999998,
	"loss": 1.715,
	"step": 12
	},
	{
	"epoch": 0.11954022988505747,
	"grad_norm": 0.657960832118988,
	"learning_rate": 0.000195,
	"loss": 1.5356,
	"step": 13
	},
	{
	"epoch": 0.12873563218390804,
	"grad_norm": 0.4407201409339905,
	"learning_rate": 0.00020999999999999998,
	"loss": 1.6903,
	"step": 14
	},
	{
	"epoch": 0.13793103448275862,
	"grad_norm": 0.3601807951927185,
	"learning_rate": 0.000225,
	"loss": 1.6744,
	"step": 15
	},
	{
	"epoch": 0.1471264367816092,
	"grad_norm": 0.3802438974380493,
	"learning_rate": 0.00023999999999999998,
	"loss": 1.8822,
	"step": 16
	},
	{
	"epoch": 0.15632183908045977,
	"grad_norm": 0.4443354904651642,
	"learning_rate": 0.00025499999999999996,
	"loss": 1.3503,
	"step": 17
	},
	{
	"epoch": 0.16551724137931034,
	"grad_norm": 0.5189216136932373,
	"learning_rate": 0.00027,
	"loss": 1.3499,
	"step": 18
	},
	{
	"epoch": 0.17471264367816092,
	"grad_norm": 0.3960488438606262,
	"learning_rate": 0.000285,
	"loss": 1.3832,
	"step": 19
	},
	{
	"epoch": 0.1839080459770115,
	"grad_norm": 0.37185606360435486,
	"learning_rate": 0.0003,
	"loss": 1.575,
	"step": 20
	},
	{
	"epoch": 0.19310344827586207,
	"grad_norm": 0.28029191493988037,
	"learning_rate": 0.00029927184466019415,
	"loss": 1.6438,
	"step": 21
	},
	{
	"epoch": 0.20229885057471264,
	"grad_norm": 0.2731279134750366,
	"learning_rate": 0.00029854368932038833,
	"loss": 1.5843,
	"step": 22
	},
	{
	"epoch": 0.21149425287356322,
	"grad_norm": 0.35780686140060425,
	"learning_rate": 0.0002978155339805825,
	"loss": 1.3945,
	"step": 23
	},
	{
	"epoch": 0.2206896551724138,
	"grad_norm": 0.35450395941734314,
	"learning_rate": 0.0002970873786407767,
	"loss": 1.4894,
	"step": 24
	},
	{
	"epoch": 0.22988505747126436,
	"grad_norm": 0.3032964766025543,
	"learning_rate": 0.00029635922330097087,
	"loss": 1.64,
	"step": 25
	},
	{
	"epoch": 0.23908045977011494,
	"grad_norm": 0.3555232584476471,
	"learning_rate": 0.00029563106796116505,
	"loss": 1.4793,
	"step": 26
	},
	{
	"epoch": 0.2482758620689655,
	"grad_norm": 0.43719008564949036,
	"learning_rate": 0.0002949029126213592,
	"loss": 1.4318,
	"step": 27
	},
	{
	"epoch": 0.2574712643678161,
	"grad_norm": 0.3937687277793884,
	"learning_rate": 0.00029417475728155335,
	"loss": 1.3755,
	"step": 28
	},
	{
	"epoch": 0.26666666666666666,
	"grad_norm": 0.3995443880558014,
	"learning_rate": 0.00029344660194174753,
	"loss": 1.6313,
	"step": 29
	},
	{
	"epoch": 0.27586206896551724,
	"grad_norm": 0.33234909176826477,
	"learning_rate": 0.0002927184466019417,
	"loss": 1.7548,
	"step": 30
	},
	{
	"epoch": 0.2850574712643678,
	"grad_norm": 0.3954809010028839,
	"learning_rate": 0.0002919902912621359,
	"loss": 1.5549,
	"step": 31
	},
	{
	"epoch": 0.2942528735632184,
	"grad_norm": 0.3647831976413727,
	"learning_rate": 0.00029126213592233006,
	"loss": 1.3264,
	"step": 32
	},
	{
	"epoch": 0.30344827586206896,
	"grad_norm": 0.4714711308479309,
	"learning_rate": 0.00029053398058252424,
	"loss": 1.2362,
	"step": 33
	},
	{
	"epoch": 0.31264367816091954,
	"grad_norm": 0.4638761878013611,
	"learning_rate": 0.0002898058252427184,
	"loss": 1.5707,
	"step": 34
	},
	{
	"epoch": 0.3218390804597701,
	"grad_norm": 0.43770870566368103,
	"learning_rate": 0.0002890776699029126,
	"loss": 1.5975,
	"step": 35
	},
	{
	"epoch": 0.3310344827586207,
	"grad_norm": 0.46125656366348267,
	"learning_rate": 0.0002883495145631068,
	"loss": 1.4532,
	"step": 36
	},
	{
	"epoch": 0.34022988505747126,
	"grad_norm": 0.3735737204551697,
	"learning_rate": 0.00028762135922330096,
	"loss": 1.4564,
	"step": 37
	},
	{
	"epoch": 0.34942528735632183,
	"grad_norm": 0.35823461413383484,
	"learning_rate": 0.00028689320388349513,
	"loss": 1.5855,
	"step": 38
	},
	{
	"epoch": 0.3586206896551724,
	"grad_norm": 0.5508543252944946,
	"learning_rate": 0.0002861650485436893,
	"loss": 1.306,
	"step": 39
	},
	{
	"epoch": 0.367816091954023,
	"grad_norm": 0.4099932014942169,
	"learning_rate": 0.0002854368932038835,
	"loss": 1.5942,
	"step": 40
	},
	{
	"epoch": 0.37701149425287356,
	"grad_norm": 0.3676886558532715,
	"learning_rate": 0.00028470873786407767,
	"loss": 1.3708,
	"step": 41
	},
	{
	"epoch": 0.38620689655172413,
	"grad_norm": 0.6290714740753174,
	"learning_rate": 0.00028398058252427185,
	"loss": 1.2496,
	"step": 42
	},
	{
	"epoch": 0.3954022988505747,
	"grad_norm": 0.3946329951286316,
	"learning_rate": 0.00028325242718446603,
	"loss": 1.3344,
	"step": 43
	},
	{
	"epoch": 0.4045977011494253,
	"grad_norm": 0.4511699080467224,
	"learning_rate": 0.00028252427184466015,
	"loss": 1.356,
	"step": 44
	},
	{
	"epoch": 0.41379310344827586,
	"grad_norm": 0.5036881566047668,
	"learning_rate": 0.00028179611650485433,
	"loss": 1.2171,
	"step": 45
	},
	{
	"epoch": 0.42298850574712643,
	"grad_norm": 0.4095934331417084,
	"learning_rate": 0.0002810679611650485,
	"loss": 1.4812,
	"step": 46
	},
	{
	"epoch": 0.432183908045977,
	"grad_norm": 0.47633135318756104,
	"learning_rate": 0.0002803398058252427,
	"loss": 1.3561,
	"step": 47
	},
	{
	"epoch": 0.4413793103448276,
	"grad_norm": 0.4468563199043274,
	"learning_rate": 0.00027961165048543687,
	"loss": 1.2434,
	"step": 48
	},
	{
	"epoch": 0.45057471264367815,
	"grad_norm": 0.48372191190719604,
	"learning_rate": 0.00027888349514563105,
	"loss": 1.2266,
	"step": 49
	},
	{
	"epoch": 0.45977011494252873,
	"grad_norm": 0.5756326913833618,
	"learning_rate": 0.0002781553398058252,
	"loss": 1.1512,
	"step": 50
	},
	{
	"epoch": 0.4689655172413793,
	"grad_norm": 0.4629153907299042,
	"learning_rate": 0.0002774271844660194,
	"loss": 1.3474,
	"step": 51
	},
	{
	"epoch": 0.4781609195402299,
	"grad_norm": 0.42864587903022766,
	"learning_rate": 0.0002766990291262136,
	"loss": 1.1593,
	"step": 52
	},
	{
	"epoch": 0.48735632183908045,
	"grad_norm": 0.5796183943748474,
	"learning_rate": 0.00027597087378640776,
	"loss": 1.2137,
	"step": 53
	},
	{
	"epoch": 0.496551724137931,
	"grad_norm": 0.5870793461799622,
	"learning_rate": 0.00027524271844660194,
	"loss": 1.1082,
	"step": 54
	},
	{
	"epoch": 0.5057471264367817,
	"grad_norm": 0.4859938323497772,
	"learning_rate": 0.0002745145631067961,
	"loss": 1.3229,
	"step": 55
	},
	{
	"epoch": 0.5149425287356322,
	"grad_norm": 0.5698845386505127,
	"learning_rate": 0.0002737864077669903,
	"loss": 1.3068,
	"step": 56
	},
	{
	"epoch": 0.5241379310344828,
	"grad_norm": 0.5284724831581116,
	"learning_rate": 0.0002730582524271845,
	"loss": 1.0949,
	"step": 57
	},
	{
	"epoch": 0.5333333333333333,
	"grad_norm": 0.5468711256980896,
	"learning_rate": 0.00027233009708737865,
	"loss": 1.2197,
	"step": 58
	},
	{
	"epoch": 0.542528735632184,
	"grad_norm": 0.6027315258979797,
	"learning_rate": 0.0002716019417475728,
	"loss": 1.1874,
	"step": 59
	},
	{
	"epoch": 0.5517241379310345,
	"grad_norm": 0.5445360541343689,
	"learning_rate": 0.00027087378640776696,
	"loss": 1.2057,
	"step": 60
	},
	{
	"epoch": 0.5609195402298851,
	"grad_norm": 0.591551661491394,
	"learning_rate": 0.00027014563106796114,
	"loss": 1.1251,
	"step": 61
	},
	{
	"epoch": 0.5701149425287356,
	"grad_norm": 0.528071939945221,
	"learning_rate": 0.0002694174757281553,
	"loss": 1.0482,
	"step": 62
	},
	{
	"epoch": 0.5793103448275863,
	"grad_norm": 0.691935658454895,
	"learning_rate": 0.0002686893203883495,
	"loss": 1.0581,
	"step": 63
	},
	{
	"epoch": 0.5885057471264368,
	"grad_norm": 0.776759684085846,
	"learning_rate": 0.00026796116504854367,
	"loss": 1.1077,
	"step": 64
	},
	{
	"epoch": 0.5977011494252874,
	"grad_norm": 0.8228328227996826,
	"learning_rate": 0.00026723300970873785,
	"loss": 1.2629,
	"step": 65
	},
	{
	"epoch": 0.6068965517241379,
	"grad_norm": 0.5646819472312927,
	"learning_rate": 0.00026650485436893203,
	"loss": 0.9204,
	"step": 66
	},
	{
	"epoch": 0.6160919540229886,
	"grad_norm": 0.6202297806739807,
	"learning_rate": 0.0002657766990291262,
	"loss": 1.1396,
	"step": 67
	},
	{
	"epoch": 0.6252873563218391,
	"grad_norm": 0.6260644197463989,
	"learning_rate": 0.0002650485436893204,
	"loss": 1.0977,
	"step": 68
	},
	{
	"epoch": 0.6344827586206897,
	"grad_norm": 0.669505774974823,
	"learning_rate": 0.00026432038834951456,
	"loss": 1.2014,
	"step": 69
	},
	{
	"epoch": 0.6436781609195402,
	"grad_norm": 0.7686023712158203,
	"learning_rate": 0.00026359223300970874,
	"loss": 1.1332,
	"step": 70
	},
	{
	"epoch": 0.6528735632183909,
	"grad_norm": 0.7180910110473633,
	"learning_rate": 0.0002628640776699029,
	"loss": 0.88,
	"step": 71
	},
	{
	"epoch": 0.6620689655172414,
	"grad_norm": 0.6693065166473389,
	"learning_rate": 0.00026213592233009705,
	"loss": 0.9068,
	"step": 72
	},
	{
	"epoch": 0.671264367816092,
	"grad_norm": 0.6618425250053406,
	"learning_rate": 0.0002614077669902912,
	"loss": 0.9885,
	"step": 73
	},
	{
	"epoch": 0.6804597701149425,
	"grad_norm": 0.7131378054618835,
	"learning_rate": 0.0002606796116504854,
	"loss": 1.0587,
	"step": 74
	},
	{
	"epoch": 0.6896551724137931,
	"grad_norm": 0.9193438291549683,
	"learning_rate": 0.0002599514563106796,
	"loss": 1.1504,
	"step": 75
	},
	{
	"epoch": 0.6988505747126437,
	"grad_norm": 1.1682260036468506,
	"learning_rate": 0.00025922330097087376,
	"loss": 1.1793,
	"step": 76
	},
	{
	"epoch": 0.7080459770114943,
	"grad_norm": 0.6184092164039612,
	"learning_rate": 0.00025849514563106794,
	"loss": 1.2133,
	"step": 77
	},
	{
	"epoch": 0.7172413793103448,
	"grad_norm": 0.7343618273735046,
	"learning_rate": 0.0002577669902912621,
	"loss": 0.9616,
	"step": 78
	},
	{
	"epoch": 0.7264367816091954,
	"grad_norm": 0.8535470366477966,
	"learning_rate": 0.0002570388349514563,
	"loss": 1.2857,
	"step": 79
	},
	{
	"epoch": 0.735632183908046,
	"grad_norm": 0.6457574367523193,
	"learning_rate": 0.0002563106796116505,
	"loss": 1.0748,
	"step": 80
	},
	{
	"epoch": 0.7448275862068966,
	"grad_norm": 0.5693302154541016,
	"learning_rate": 0.0002555825242718446,
	"loss": 1.0785,
	"step": 81
	},
	{
	"epoch": 0.7540229885057471,
	"grad_norm": 0.6433013081550598,
	"learning_rate": 0.0002548543689320388,
	"loss": 0.9738,
	"step": 82
	},
	{
	"epoch": 0.7632183908045977,
	"grad_norm": 1.2133727073669434,
	"learning_rate": 0.00025412621359223296,
	"loss": 1.2062,
	"step": 83
	},
	{
	"epoch": 0.7724137931034483,
	"grad_norm": 0.7277675271034241,
	"learning_rate": 0.00025339805825242714,
	"loss": 1.1494,
	"step": 84
	},
	{
	"epoch": 0.7816091954022989,
	"grad_norm": 0.6444184184074402,
	"learning_rate": 0.0002526699029126213,
	"loss": 1.1214,
	"step": 85
	},
	{
	"epoch": 0.7908045977011494,
	"grad_norm": 0.8243492841720581,
	"learning_rate": 0.0002519417475728155,
	"loss": 0.8145,
	"step": 86
	},
	{
	"epoch": 0.8,
	"grad_norm": 0.6770063042640686,
	"learning_rate": 0.00025121359223300967,
	"loss": 0.9134,
	"step": 87
	},
	{
	"epoch": 0.8091954022988506,
	"grad_norm": 0.6134109497070312,
	"learning_rate": 0.00025048543689320385,
	"loss": 1.0109,
	"step": 88
	},
	{
	"epoch": 0.8183908045977012,
	"grad_norm": 0.5844547748565674,
	"learning_rate": 0.00024975728155339803,
	"loss": 1.063,
	"step": 89
	},
	{
	"epoch": 0.8275862068965517,
	"grad_norm": 0.5940524339675903,
	"learning_rate": 0.0002490291262135922,
	"loss": 0.9952,
	"step": 90
	},
	{
	"epoch": 0.8367816091954023,
	"grad_norm": 0.7235853672027588,
	"learning_rate": 0.0002483009708737864,
	"loss": 0.895,
	"step": 91
	},
	{
	"epoch": 0.8459770114942529,
	"grad_norm": 0.7243452668190002,
	"learning_rate": 0.00024757281553398056,
	"loss": 0.7441,
	"step": 92
	},
	{
	"epoch": 0.8551724137931035,
	"grad_norm": 0.6366357207298279,
	"learning_rate": 0.00024684466019417474,
	"loss": 1.0253,
	"step": 93
	},
	{
	"epoch": 0.864367816091954,
	"grad_norm": 0.9579809308052063,
	"learning_rate": 0.0002461165048543689,
	"loss": 0.8995,
	"step": 94
	},
	{
	"epoch": 0.8735632183908046,
	"grad_norm": 0.8137032985687256,
	"learning_rate": 0.0002453883495145631,
	"loss": 0.8475,
	"step": 95
	},
	{
	"epoch": 0.8827586206896552,
	"grad_norm": 0.5339512825012207,
	"learning_rate": 0.0002446601941747572,
	"loss": 0.8167,
	"step": 96
	},
	{
	"epoch": 0.8919540229885058,
	"grad_norm": 0.6556524038314819,
	"learning_rate": 0.00024393203883495143,
	"loss": 1.0225,
	"step": 97
	},
	{
	"epoch": 0.9011494252873563,
	"grad_norm": 0.6119419932365417,
	"learning_rate": 0.0002432038834951456,
	"loss": 1.0889,
	"step": 98
	},
	{
	"epoch": 0.9103448275862069,
	"grad_norm": 0.7066159248352051,
	"learning_rate": 0.0002424757281553398,
	"loss": 0.8548,
	"step": 99
	},
	{
	"epoch": 0.9195402298850575,
	"grad_norm": 0.5464254021644592,
	"learning_rate": 0.00024174757281553394,
	"loss": 0.9283,
	"step": 100
	},
	{
	"epoch": 0.9287356321839081,
	"grad_norm": 0.825078010559082,
	"learning_rate": 0.00024101941747572812,
	"loss": 0.8686,
	"step": 101
	},
	{
	"epoch": 0.9379310344827586,
	"grad_norm": 1.2080026865005493,
	"learning_rate": 0.0002402912621359223,
	"loss": 0.8503,
	"step": 102
	},
	{
	"epoch": 0.9471264367816092,
	"grad_norm": 0.6597005128860474,
	"learning_rate": 0.00023956310679611648,
	"loss": 0.9614,
	"step": 103
	},
	{
	"epoch": 0.9563218390804598,
	"grad_norm": 0.614787757396698,
	"learning_rate": 0.00023883495145631065,
	"loss": 0.9684,
	"step": 104
	},
	{
	"epoch": 0.9655172413793104,
	"grad_norm": 0.6293591856956482,
	"learning_rate": 0.00023810679611650483,
	"loss": 0.7772,
	"step": 105
	},
	{
	"epoch": 0.9747126436781609,
	"grad_norm": 0.5669013857841492,
	"learning_rate": 0.000237378640776699,
	"loss": 1.1319,
	"step": 106
	},
	{
	"epoch": 0.9839080459770115,
	"grad_norm": 0.6458181738853455,
	"learning_rate": 0.0002366504854368932,
	"loss": 0.9616,
	"step": 107
	},
	{
	"epoch": 0.993103448275862,
	"grad_norm": 0.5852652192115784,
	"learning_rate": 0.00023592233009708734,
	"loss": 0.746,
	"step": 108
	},
	{
	"epoch": 1.0,
	"grad_norm": 0.8498281836509705,
	"learning_rate": 0.00023519417475728152,
	"loss": 0.8678,
	"step": 109
	},
	{
	"epoch": 1.0,
	"eval_loss": 1.3252594470977783,
	"eval_runtime": 94.0095,
	"eval_samples_per_second": 3.532,
	"eval_steps_per_second": 1.766,
	"step": 109
	},
	{
	"epoch": 1.0091954022988505,
	"grad_norm": 0.6115343570709229,
	"learning_rate": 0.0002344660194174757,
	"loss": 0.811,
	"step": 110
	},
	{
	"epoch": 1.018390804597701,
	"grad_norm": 0.5486385226249695,
	"learning_rate": 0.00023373786407766988,
	"loss": 0.6873,
	"step": 111
	},
	{
	"epoch": 1.0275862068965518,
	"grad_norm": 0.6083724498748779,
	"learning_rate": 0.00023300970873786406,
	"loss": 0.75,
	"step": 112
	},
	{
	"epoch": 1.0367816091954023,
	"grad_norm": 0.7998746633529663,
	"learning_rate": 0.00023228155339805823,
	"loss": 0.5876,
	"step": 113
	},
	{
	"epoch": 1.0459770114942528,
	"grad_norm": 0.6601845026016235,
	"learning_rate": 0.0002315533980582524,
	"loss": 0.8747,
	"step": 114
	},
	{
	"epoch": 1.0551724137931036,
	"grad_norm": 0.9844085574150085,
	"learning_rate": 0.0002308252427184466,
	"loss": 0.6241,
	"step": 115
	},
	{
	"epoch": 1.064367816091954,
	"grad_norm": 0.6095510125160217,
	"learning_rate": 0.00023009708737864074,
	"loss": 0.6274,
	"step": 116
	},
	{
	"epoch": 1.0735632183908046,
	"grad_norm": 0.9384357929229736,
	"learning_rate": 0.00022936893203883492,
	"loss": 0.7682,
	"step": 117
	},
	{
	"epoch": 1.0827586206896551,
	"grad_norm": 0.5237877368927002,
	"learning_rate": 0.0002286407766990291,
	"loss": 0.8889,
	"step": 118
	},
	{
	"epoch": 1.0919540229885056,
	"grad_norm": 0.5225788354873657,
	"learning_rate": 0.00022791262135922328,
	"loss": 0.6419,
	"step": 119
	},
	{
	"epoch": 1.1011494252873564,
	"grad_norm": 0.6210270524024963,
	"learning_rate": 0.00022718446601941746,
	"loss": 0.6799,
	"step": 120
	},
	{
	"epoch": 1.110344827586207,
	"grad_norm": 0.7171874046325684,
	"learning_rate": 0.00022645631067961164,
	"loss": 0.5037,
	"step": 121
	},
	{
	"epoch": 1.1195402298850574,
	"grad_norm": 0.6145285367965698,
	"learning_rate": 0.00022572815533980582,
	"loss": 0.4812,
	"step": 122
	},
	{
	"epoch": 1.1287356321839082,
	"grad_norm": 0.5306028723716736,
	"learning_rate": 0.000225,
	"loss": 0.6068,
	"step": 123
	},
	{
	"epoch": 1.1379310344827587,
	"grad_norm": 0.6142969131469727,
	"learning_rate": 0.00022427184466019415,
	"loss": 0.6441,
	"step": 124
	},
	{
	"epoch": 1.1471264367816092,
	"grad_norm": 0.5693908333778381,
	"learning_rate": 0.00022354368932038832,
	"loss": 0.6241,
	"step": 125
	},
	{
	"epoch": 1.1563218390804597,
	"grad_norm": 0.8560084104537964,
	"learning_rate": 0.0002228155339805825,
	"loss": 0.5208,
	"step": 126
	},
	{
	"epoch": 1.1655172413793102,
	"grad_norm": 0.9754599928855896,
	"learning_rate": 0.00022208737864077668,
	"loss": 0.6499,
	"step": 127
	},
	{
	"epoch": 1.174712643678161,
	"grad_norm": 0.515574038028717,
	"learning_rate": 0.00022135922330097086,
	"loss": 0.6246,
	"step": 128
	},
	{
	"epoch": 1.1839080459770115,
	"grad_norm": 0.5477547645568848,
	"learning_rate": 0.00022063106796116504,
	"loss": 0.6331,
	"step": 129
	},
	{
	"epoch": 1.193103448275862,
	"grad_norm": 0.445388525724411,
	"learning_rate": 0.00021990291262135922,
	"loss": 0.5737,
	"step": 130
	},
	{
	"epoch": 1.2022988505747128,
	"grad_norm": 0.6278632879257202,
	"learning_rate": 0.00021917475728155337,
	"loss": 0.5605,
	"step": 131
	},
	{
	"epoch": 1.2114942528735633,
	"grad_norm": 0.5176573991775513,
	"learning_rate": 0.00021844660194174755,
	"loss": 0.6198,
	"step": 132
	},
	{
	"epoch": 1.2206896551724138,
	"grad_norm": 0.5394790768623352,
	"learning_rate": 0.00021771844660194173,
	"loss": 0.743,
	"step": 133
	},
	{
	"epoch": 1.2298850574712643,
	"grad_norm": 0.5462550520896912,
	"learning_rate": 0.0002169902912621359,
	"loss": 0.5505,
	"step": 134
	},
	{
	"epoch": 1.2390804597701148,
	"grad_norm": 0.5793837904930115,
	"learning_rate": 0.00021626213592233008,
	"loss": 0.7134,
	"step": 135
	},
	{
	"epoch": 1.2482758620689656,
	"grad_norm": 0.5995808243751526,
	"learning_rate": 0.00021553398058252426,
	"loss": 0.8151,
	"step": 136
	},
	{
	"epoch": 1.257471264367816,
	"grad_norm": 0.6317359805107117,
	"learning_rate": 0.00021480582524271844,
	"loss": 0.4986,
	"step": 137
	},
	{
	"epoch": 1.2666666666666666,
	"grad_norm": 0.9133898019790649,
	"learning_rate": 0.00021407766990291262,
	"loss": 0.6029,
	"step": 138
	},
	{
	"epoch": 1.2758620689655173,
	"grad_norm": 0.7161931991577148,
	"learning_rate": 0.00021334951456310677,
	"loss": 0.6581,
	"step": 139
	},
	{
	"epoch": 1.2850574712643679,
	"grad_norm": 0.5639025568962097,
	"learning_rate": 0.00021262135922330095,
	"loss": 0.681,
	"step": 140
	},
	{
	"epoch": 1.2942528735632184,
	"grad_norm": 0.6325567364692688,
	"learning_rate": 0.00021189320388349513,
	"loss": 0.7994,
	"step": 141
	},
	{
	"epoch": 1.303448275862069,
	"grad_norm": 0.47429075837135315,
	"learning_rate": 0.0002111650485436893,
	"loss": 0.4164,
	"step": 142
	},
	{
	"epoch": 1.3126436781609194,
	"grad_norm": 0.3774986267089844,
	"learning_rate": 0.00021043689320388349,
	"loss": 0.3792,
	"step": 143
	},
	{
	"epoch": 1.3218390804597702,
	"grad_norm": 0.5024625062942505,
	"learning_rate": 0.00020970873786407766,
	"loss": 0.6999,
	"step": 144
	},
	{
	"epoch": 1.3310344827586207,
	"grad_norm": 0.4836028516292572,
	"learning_rate": 0.00020898058252427184,
	"loss": 0.3536,
	"step": 145
	},
	{
	"epoch": 1.3402298850574712,
	"grad_norm": 0.4562912881374359,
	"learning_rate": 0.00020825242718446602,
	"loss": 0.4362,
	"step": 146
	},
	{
	"epoch": 1.349425287356322,
	"grad_norm": 0.4715615212917328,
	"learning_rate": 0.00020752427184466017,
	"loss": 0.4743,
	"step": 147
	},
	{
	"epoch": 1.3586206896551725,
	"grad_norm": 0.5050966143608093,
	"learning_rate": 0.00020679611650485435,
	"loss": 0.6084,
	"step": 148
	},
	{
	"epoch": 1.367816091954023,
	"grad_norm": 0.5919803380966187,
	"learning_rate": 0.00020606796116504853,
	"loss": 0.4208,
	"step": 149
	},
	{
	"epoch": 1.3770114942528735,
	"grad_norm": 0.5397422313690186,
	"learning_rate": 0.0002053398058252427,
	"loss": 0.5182,
	"step": 150
	},
	{
	"epoch": 1.386206896551724,
	"grad_norm": 0.604860246181488,
	"learning_rate": 0.0002046116504854369,
	"loss": 0.6569,
	"step": 151
	},
	{
	"epoch": 1.3954022988505748,
	"grad_norm": 0.6743022799491882,
	"learning_rate": 0.00020388349514563107,
	"loss": 0.6063,
	"step": 152
	},
	{
	"epoch": 1.4045977011494253,
	"grad_norm": 0.5582085847854614,
	"learning_rate": 0.00020315533980582524,
	"loss": 0.8471,
	"step": 153
	},
	{
	"epoch": 1.4137931034482758,
	"grad_norm": 0.6764629483222961,
	"learning_rate": 0.00020242718446601942,
	"loss": 0.4767,
	"step": 154
	},
	{
	"epoch": 1.4229885057471265,
	"grad_norm": 0.39126965403556824,
	"learning_rate": 0.00020169902912621357,
	"loss": 0.4983,
	"step": 155
	},
	{
	"epoch": 1.432183908045977,
	"grad_norm": 0.5407236814498901,
	"learning_rate": 0.00020097087378640775,
	"loss": 0.6467,
	"step": 156
	},
	{
	"epoch": 1.4413793103448276,
	"grad_norm": 0.4321889579296112,
	"learning_rate": 0.00020024271844660193,
	"loss": 0.6037,
	"step": 157
	},
	{
	"epoch": 1.450574712643678,
	"grad_norm": 0.3570482134819031,
	"learning_rate": 0.0001995145631067961,
	"loss": 0.4515,
	"step": 158
	},
	{
	"epoch": 1.4597701149425286,
	"grad_norm": 0.5193243622779846,
	"learning_rate": 0.0001987864077669903,
	"loss": 0.5267,
	"step": 159
	},
	{
	"epoch": 1.4689655172413794,
	"grad_norm": 0.8264741897583008,
	"learning_rate": 0.00019805825242718447,
	"loss": 0.7169,
	"step": 160
	},
	{
	"epoch": 1.4781609195402299,
	"grad_norm": 0.6514953374862671,
	"learning_rate": 0.00019733009708737865,
	"loss": 0.678,
	"step": 161
	},
	{
	"epoch": 1.4873563218390804,
	"grad_norm": 0.5475180745124817,
	"learning_rate": 0.0001966019417475728,
	"loss": 0.5252,
	"step": 162
	},
	{
	"epoch": 1.4965517241379311,
	"grad_norm": 0.49964120984077454,
	"learning_rate": 0.00019587378640776698,
	"loss": 0.4259,
	"step": 163
	},
	{
	"epoch": 1.5057471264367817,
	"grad_norm": 0.4474540948867798,
	"learning_rate": 0.00019514563106796116,
	"loss": 0.4728,
	"step": 164
	},
	{
	"epoch": 1.5149425287356322,
	"grad_norm": 0.5726771950721741,
	"learning_rate": 0.00019441747572815533,
	"loss": 0.6752,
	"step": 165
	},
	{
	"epoch": 1.524137931034483,
	"grad_norm": 0.5038064122200012,
	"learning_rate": 0.0001936893203883495,
	"loss": 0.647,
	"step": 166
	},
	{
	"epoch": 1.5333333333333332,
	"grad_norm": 0.4093747138977051,
	"learning_rate": 0.0001929611650485437,
	"loss": 0.6077,
	"step": 167
	},
	{
	"epoch": 1.542528735632184,
	"grad_norm": 0.8166248798370361,
	"learning_rate": 0.00019223300970873787,
	"loss": 0.5149,
	"step": 168
	},
	{
	"epoch": 1.5517241379310345,
	"grad_norm": 0.5660980343818665,
	"learning_rate": 0.00019150485436893205,
	"loss": 0.3511,
	"step": 169
	},
	{
	"epoch": 1.560919540229885,
	"grad_norm": 0.403187096118927,
	"learning_rate": 0.0001907766990291262,
	"loss": 0.4097,
	"step": 170
	},
	{
	"epoch": 1.5701149425287357,
	"grad_norm": 0.5686673521995544,
	"learning_rate": 0.00019004854368932038,
	"loss": 0.8236,
	"step": 171
	},
	{
	"epoch": 1.5793103448275863,
	"grad_norm": 0.4967772662639618,
	"learning_rate": 0.00018932038834951456,
	"loss": 0.562,
	"step": 172
	},
	{
	"epoch": 1.5885057471264368,
	"grad_norm": 0.560854434967041,
	"learning_rate": 0.00018859223300970874,
	"loss": 0.5852,
	"step": 173
	},
	{
	"epoch": 1.5977011494252875,
	"grad_norm": 0.3643392324447632,
	"learning_rate": 0.00018786407766990291,
	"loss": 0.4042,
	"step": 174
	},
	{
	"epoch": 1.6068965517241378,
	"grad_norm": 0.6362044811248779,
	"learning_rate": 0.00018713592233009707,
	"loss": 0.6533,
	"step": 175
	},
	{
	"epoch": 1.6160919540229886,
	"grad_norm": 0.6190036535263062,
	"learning_rate": 0.00018640776699029122,
	"loss": 0.7651,
	"step": 176
	},
	{
	"epoch": 1.625287356321839,
	"grad_norm": 0.3463480472564697,
	"learning_rate": 0.0001856796116504854,
	"loss": 0.3312,
	"step": 177
	},
	{
	"epoch": 1.6344827586206896,
	"grad_norm": 0.2819209098815918,
	"learning_rate": 0.00018495145631067957,
	"loss": 0.3247,
	"step": 178
	},
	{
	"epoch": 1.6436781609195403,
	"grad_norm": 0.5651117563247681,
	"learning_rate": 0.00018422330097087375,
	"loss": 0.7674,
	"step": 179
	},
	{
	"epoch": 1.6528735632183909,
	"grad_norm": 0.4948618412017822,
	"learning_rate": 0.00018349514563106793,
	"loss": 0.6161,
	"step": 180
	},
	{
	"epoch": 1.6620689655172414,
	"grad_norm": 0.43636301159858704,
	"learning_rate": 0.0001827669902912621,
	"loss": 0.5334,
	"step": 181
	},
	{
	"epoch": 1.6712643678160921,
	"grad_norm": 0.4951108694076538,
	"learning_rate": 0.0001820388349514563,
	"loss": 0.624,
	"step": 182
	},
	{
	"epoch": 1.6804597701149424,
	"grad_norm": 0.5951234102249146,
	"learning_rate": 0.00018131067961165047,
	"loss": 0.5184,
	"step": 183
	},
	{
	"epoch": 1.6896551724137931,
	"grad_norm": 0.6109154224395752,
	"learning_rate": 0.00018058252427184462,
	"loss": 0.7274,
	"step": 184
	},
	{
	"epoch": 1.6988505747126437,
	"grad_norm": 0.4492969810962677,
	"learning_rate": 0.0001798543689320388,
	"loss": 0.6079,
	"step": 185
	},
	{
	"epoch": 1.7080459770114942,
	"grad_norm": 0.5195210576057434,
	"learning_rate": 0.00017912621359223298,
	"loss": 0.4998,
	"step": 186
	},
	{
	"epoch": 1.717241379310345,
	"grad_norm": 0.49724170565605164,
	"learning_rate": 0.00017839805825242716,
	"loss": 0.3856,
	"step": 187
	},
	{
	"epoch": 1.7264367816091954,
	"grad_norm": 1.1214869022369385,
	"learning_rate": 0.00017766990291262133,
	"loss": 0.4785,
	"step": 188
	},
	{
	"epoch": 1.735632183908046,
	"grad_norm": 0.5645748376846313,
	"learning_rate": 0.0001769417475728155,
	"loss": 0.5791,
	"step": 189
	},
	{
	"epoch": 1.7448275862068967,
	"grad_norm": 0.46523571014404297,
	"learning_rate": 0.0001762135922330097,
	"loss": 0.5804,
	"step": 190
	},
	{
	"epoch": 1.754022988505747,
	"grad_norm": 0.3765566945075989,
	"learning_rate": 0.00017548543689320387,
	"loss": 0.5272,
	"step": 191
	},
	{
	"epoch": 1.7632183908045977,
	"grad_norm": 0.5119166374206543,
	"learning_rate": 0.00017475728155339802,
	"loss": 0.6572,
	"step": 192
	},
	{
	"epoch": 1.7724137931034483,
	"grad_norm": 0.38700059056282043,
	"learning_rate": 0.0001740291262135922,
	"loss": 0.5233,
	"step": 193
	},
	{
	"epoch": 1.7816091954022988,
	"grad_norm": 0.3980446457862854,
	"learning_rate": 0.00017330097087378638,
	"loss": 0.4071,
	"step": 194
	},
	{
	"epoch": 1.7908045977011495,
	"grad_norm": 0.35074886679649353,
	"learning_rate": 0.00017257281553398056,
	"loss": 0.4505,
	"step": 195
	},
	{
	"epoch": 1.8,
	"grad_norm": 0.5284190773963928,
	"learning_rate": 0.00017184466019417474,
	"loss": 0.5048,
	"step": 196
	},
	{
	"epoch": 1.8091954022988506,
	"grad_norm": 0.47595924139022827,
	"learning_rate": 0.00017111650485436891,
	"loss": 0.541,
	"step": 197
	},
	{
	"epoch": 1.8183908045977013,
	"grad_norm": 0.557465672492981,
	"learning_rate": 0.0001703883495145631,
	"loss": 0.5175,
	"step": 198
	},
	{
	"epoch": 1.8275862068965516,
	"grad_norm": 0.4417920410633087,
	"learning_rate": 0.00016966019417475724,
	"loss": 0.5284,
	"step": 199
	},
	{
	"epoch": 1.8367816091954023,
	"grad_norm": 0.34410127997398376,
	"learning_rate": 0.00016893203883495142,
	"loss": 0.3723,
	"step": 200
	},
	{
	"epoch": 1.8459770114942529,
	"grad_norm": 0.3989458680152893,
	"learning_rate": 0.0001682038834951456,
	"loss": 0.4057,
	"step": 201
	},
	{
	"epoch": 1.8551724137931034,
	"grad_norm": 0.3975292444229126,
	"learning_rate": 0.00016747572815533978,
	"loss": 0.4334,
	"step": 202
	},
	{
	"epoch": 1.8643678160919541,
	"grad_norm": 0.5099373459815979,
	"learning_rate": 0.00016674757281553396,
	"loss": 0.5842,
	"step": 203
	},
	{
	"epoch": 1.8735632183908046,
	"grad_norm": 0.4445691406726837,
	"learning_rate": 0.00016601941747572814,
	"loss": 0.593,
	"step": 204
	},
	{
	"epoch": 1.8827586206896552,
	"grad_norm": 0.4758138060569763,
	"learning_rate": 0.00016529126213592232,
	"loss": 0.4163,
	"step": 205
	},
	{
	"epoch": 1.891954022988506,
	"grad_norm": 0.41732391715049744,
	"learning_rate": 0.0001645631067961165,
	"loss": 0.5405,
	"step": 206
	},
	{
	"epoch": 1.9011494252873562,
	"grad_norm": 0.3908286988735199,
	"learning_rate": 0.00016383495145631065,
	"loss": 0.4363,
	"step": 207
	},
	{
	"epoch": 1.910344827586207,
	"grad_norm": 0.5812026858329773,
	"learning_rate": 0.00016310679611650483,
	"loss": 0.7188,
	"step": 208
	},
	{
	"epoch": 1.9195402298850575,
	"grad_norm": 0.4734458327293396,
	"learning_rate": 0.000162378640776699,
	"loss": 0.569,
	"step": 209
	},
	{
	"epoch": 1.928735632183908,
	"grad_norm": 0.4347914457321167,
	"learning_rate": 0.00016165048543689318,
	"loss": 0.5026,
	"step": 210
	},
	{
	"epoch": 1.9379310344827587,
	"grad_norm": 0.3364557921886444,
	"learning_rate": 0.00016092233009708736,
	"loss": 0.3537,
	"step": 211
	},
	{
	"epoch": 1.9471264367816092,
	"grad_norm": 0.44029518961906433,
	"learning_rate": 0.00016019417475728154,
	"loss": 0.5733,
	"step": 212
	},
	{
	"epoch": 1.9563218390804598,
	"grad_norm": 0.33010566234588623,
	"learning_rate": 0.00015946601941747572,
	"loss": 0.3838,
	"step": 213
	},
	{
	"epoch": 1.9655172413793105,
	"grad_norm": 0.3367745876312256,
	"learning_rate": 0.0001587378640776699,
	"loss": 0.4212,
	"step": 214
	},
	{
	"epoch": 1.9747126436781608,
	"grad_norm": 0.5834444165229797,
	"learning_rate": 0.00015800970873786405,
	"loss": 0.833,
	"step": 215
	},
	{
	"epoch": 1.9839080459770115,
	"grad_norm": 0.5451297163963318,
	"learning_rate": 0.00015728155339805823,
	"loss": 0.7753,
	"step": 216
	},
	{
	"epoch": 1.993103448275862,
	"grad_norm": 0.4916711747646332,
	"learning_rate": 0.0001565533980582524,
	"loss": 0.6183,
	"step": 217
	},
	{
	"epoch": 2.0,
	"grad_norm": 0.7540034651756287,
	"learning_rate": 0.00015582524271844658,
	"loss": 0.8286,
	"step": 218
	},
	{
	"epoch": 2.0,
	"eval_loss": 1.4519011974334717,
	"eval_runtime": 93.8599,
	"eval_samples_per_second": 3.537,
	"eval_steps_per_second": 1.769,
	"step": 218
	}
	],
	"logging_steps": 1,
	"max_steps": 432,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 4,
	"save_steps": 500,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 1.9491566965342618e+17,
	"train_batch_size": 4,
	"trial_name": null,
	"trial_params": null
	}