{
  "best_metric": 1.300666093826294,
  "best_model_checkpoint": "4bit_repro_03022025/host10_seed_42_full_det_fp16_no_flash_attn_fix_pad_gemma-2-9b-it-l16-cot-4ep-lr3e04-ws20-bs4-ga4-fp16-11022025/checkpoint-109",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 327,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009195402298850575,
      "grad_norm": 0.9957932233810425,
      "learning_rate": 1.4999999999999999e-05,
      "loss": 2.3724,
      "step": 1
    },
    {
      "epoch": 0.01839080459770115,
      "grad_norm": 0.7591073513031006,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 2.3711,
      "step": 2
    },
    {
      "epoch": 0.027586206896551724,
      "grad_norm": 0.9023211002349854,
      "learning_rate": 4.4999999999999996e-05,
      "loss": 2.2454,
      "step": 3
    },
    {
      "epoch": 0.0367816091954023,
      "grad_norm": 0.7705220580101013,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 2.1252,
      "step": 4
    },
    {
      "epoch": 0.04597701149425287,
      "grad_norm": 0.8994729518890381,
      "learning_rate": 7.5e-05,
      "loss": 2.1776,
      "step": 5
    },
    {
      "epoch": 0.05517241379310345,
      "grad_norm": 0.8370222449302673,
      "learning_rate": 8.999999999999999e-05,
      "loss": 2.0403,
      "step": 6
    },
    {
      "epoch": 0.06436781609195402,
      "grad_norm": 0.4772511124610901,
      "learning_rate": 0.00010499999999999999,
      "loss": 2.0642,
      "step": 7
    },
    {
      "epoch": 0.0735632183908046,
      "grad_norm": 0.3877703547477722,
      "learning_rate": 0.00011999999999999999,
      "loss": 2.0111,
      "step": 8
    },
    {
      "epoch": 0.08275862068965517,
      "grad_norm": 0.4768696129322052,
      "learning_rate": 0.000135,
      "loss": 1.8875,
      "step": 9
    },
    {
      "epoch": 0.09195402298850575,
      "grad_norm": 0.6014317870140076,
      "learning_rate": 0.00015,
      "loss": 1.672,
      "step": 10
    },
    {
      "epoch": 0.10114942528735632,
      "grad_norm": 0.6302416324615479,
      "learning_rate": 0.000165,
      "loss": 1.7088,
      "step": 11
    },
    {
      "epoch": 0.1103448275862069,
      "grad_norm": 0.5539880990982056,
      "learning_rate": 0.00017999999999999998,
      "loss": 1.6913,
      "step": 12
    },
    {
      "epoch": 0.11954022988505747,
      "grad_norm": 0.6798604130744934,
      "learning_rate": 0.000195,
      "loss": 1.4942,
      "step": 13
    },
    {
      "epoch": 0.12873563218390804,
      "grad_norm": 0.4399753510951996,
      "learning_rate": 0.00020999999999999998,
      "loss": 1.6453,
      "step": 14
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 0.37119659781455994,
      "learning_rate": 0.000225,
      "loss": 1.6376,
      "step": 15
    },
    {
      "epoch": 0.1471264367816092,
      "grad_norm": 0.38230371475219727,
      "learning_rate": 0.00023999999999999998,
      "loss": 1.8295,
      "step": 16
    },
    {
      "epoch": 0.15632183908045977,
      "grad_norm": 0.4516810476779938,
      "learning_rate": 0.00025499999999999996,
      "loss": 1.3166,
      "step": 17
    },
    {
      "epoch": 0.16551724137931034,
      "grad_norm": 0.6360406875610352,
      "learning_rate": 0.00027,
      "loss": 1.3089,
      "step": 18
    },
    {
      "epoch": 0.17471264367816092,
      "grad_norm": 0.3828903138637543,
      "learning_rate": 0.000285,
      "loss": 1.3322,
      "step": 19
    },
    {
      "epoch": 0.1839080459770115,
      "grad_norm": 0.3432200253009796,
      "learning_rate": 0.0003,
      "loss": 1.5339,
      "step": 20
    },
    {
      "epoch": 0.19310344827586207,
      "grad_norm": 0.278246134519577,
      "learning_rate": 0.00029927184466019415,
      "loss": 1.6185,
      "step": 21
    },
    {
      "epoch": 0.20229885057471264,
      "grad_norm": 0.278033047914505,
      "learning_rate": 0.00029854368932038833,
      "loss": 1.5488,
      "step": 22
    },
    {
      "epoch": 0.21149425287356322,
      "grad_norm": 0.35856395959854126,
      "learning_rate": 0.0002978155339805825,
      "loss": 1.3512,
      "step": 23
    },
    {
      "epoch": 0.2206896551724138,
      "grad_norm": 0.3614782989025116,
      "learning_rate": 0.0002970873786407767,
      "loss": 1.4487,
      "step": 24
    },
    {
      "epoch": 0.22988505747126436,
      "grad_norm": 0.30157631635665894,
      "learning_rate": 0.00029635922330097087,
      "loss": 1.6049,
      "step": 25
    },
    {
      "epoch": 0.23908045977011494,
      "grad_norm": 0.35453277826309204,
      "learning_rate": 0.00029563106796116505,
      "loss": 1.449,
      "step": 26
    },
    {
      "epoch": 0.2482758620689655,
      "grad_norm": 0.436924546957016,
      "learning_rate": 0.0002949029126213592,
      "loss": 1.3822,
      "step": 27
    },
    {
      "epoch": 0.2574712643678161,
      "grad_norm": 0.3903788626194,
      "learning_rate": 0.00029417475728155335,
      "loss": 1.33,
      "step": 28
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 0.39776864647865295,
      "learning_rate": 0.00029344660194174753,
      "loss": 1.5852,
      "step": 29
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 0.3369416296482086,
      "learning_rate": 0.0002927184466019417,
      "loss": 1.7242,
      "step": 30
    },
    {
      "epoch": 0.2850574712643678,
      "grad_norm": 0.3887549936771393,
      "learning_rate": 0.0002919902912621359,
      "loss": 1.5086,
      "step": 31
    },
    {
      "epoch": 0.2942528735632184,
      "grad_norm": 0.3567999005317688,
      "learning_rate": 0.00029126213592233006,
      "loss": 1.2896,
      "step": 32
    },
    {
      "epoch": 0.30344827586206896,
      "grad_norm": 0.45827871561050415,
      "learning_rate": 0.00029053398058252424,
      "loss": 1.1841,
      "step": 33
    },
    {
      "epoch": 0.31264367816091954,
      "grad_norm": 0.45487773418426514,
      "learning_rate": 0.0002898058252427184,
      "loss": 1.5209,
      "step": 34
    },
    {
      "epoch": 0.3218390804597701,
      "grad_norm": 0.44364598393440247,
      "learning_rate": 0.0002890776699029126,
      "loss": 1.5646,
      "step": 35
    },
    {
      "epoch": 0.3310344827586207,
      "grad_norm": 0.4502098560333252,
      "learning_rate": 0.0002883495145631068,
      "loss": 1.3928,
      "step": 36
    },
    {
      "epoch": 0.34022988505747126,
      "grad_norm": 0.36199966073036194,
      "learning_rate": 0.00028762135922330096,
      "loss": 1.4251,
      "step": 37
    },
    {
      "epoch": 0.34942528735632183,
      "grad_norm": 0.3491019308567047,
      "learning_rate": 0.00028689320388349513,
      "loss": 1.5562,
      "step": 38
    },
    {
      "epoch": 0.3586206896551724,
      "grad_norm": 0.5400763750076294,
      "learning_rate": 0.0002861650485436893,
      "loss": 1.2608,
      "step": 39
    },
    {
      "epoch": 0.367816091954023,
      "grad_norm": 0.3730115592479706,
      "learning_rate": 0.0002854368932038835,
      "loss": 1.5525,
      "step": 40
    },
    {
      "epoch": 0.37701149425287356,
      "grad_norm": 0.3588751554489136,
      "learning_rate": 0.00028470873786407767,
      "loss": 1.3468,
      "step": 41
    },
    {
      "epoch": 0.38620689655172413,
      "grad_norm": 0.5940146446228027,
      "learning_rate": 0.00028398058252427185,
      "loss": 1.185,
      "step": 42
    },
    {
      "epoch": 0.3954022988505747,
      "grad_norm": 0.394444078207016,
      "learning_rate": 0.00028325242718446603,
      "loss": 1.3057,
      "step": 43
    },
    {
      "epoch": 0.4045977011494253,
      "grad_norm": 0.4431244730949402,
      "learning_rate": 0.00028252427184466015,
      "loss": 1.3147,
      "step": 44
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 0.4967786371707916,
      "learning_rate": 0.00028179611650485433,
      "loss": 1.1773,
      "step": 45
    },
    {
      "epoch": 0.42298850574712643,
      "grad_norm": 0.38681459426879883,
      "learning_rate": 0.0002810679611650485,
      "loss": 1.4481,
      "step": 46
    },
    {
      "epoch": 0.432183908045977,
      "grad_norm": 0.4664541780948639,
      "learning_rate": 0.0002803398058252427,
      "loss": 1.3254,
      "step": 47
    },
    {
      "epoch": 0.4413793103448276,
      "grad_norm": 0.433729350566864,
      "learning_rate": 0.00027961165048543687,
      "loss": 1.2067,
      "step": 48
    },
    {
      "epoch": 0.45057471264367815,
      "grad_norm": 0.4609008729457855,
      "learning_rate": 0.00027888349514563105,
      "loss": 1.1925,
      "step": 49
    },
    {
      "epoch": 0.45977011494252873,
      "grad_norm": 0.5346646904945374,
      "learning_rate": 0.0002781553398058252,
      "loss": 1.1117,
      "step": 50
    },
    {
      "epoch": 0.4689655172413793,
      "grad_norm": 0.4348887801170349,
      "learning_rate": 0.0002774271844660194,
      "loss": 1.3088,
      "step": 51
    },
    {
      "epoch": 0.4781609195402299,
      "grad_norm": 0.4136529266834259,
      "learning_rate": 0.0002766990291262136,
      "loss": 1.1332,
      "step": 52
    },
    {
      "epoch": 0.48735632183908045,
      "grad_norm": 0.580917239189148,
      "learning_rate": 0.00027597087378640776,
      "loss": 1.1903,
      "step": 53
    },
    {
      "epoch": 0.496551724137931,
      "grad_norm": 0.5748546719551086,
      "learning_rate": 0.00027524271844660194,
      "loss": 1.0639,
      "step": 54
    },
    {
      "epoch": 0.5057471264367817,
      "grad_norm": 0.45053598284721375,
      "learning_rate": 0.0002745145631067961,
      "loss": 1.2918,
      "step": 55
    },
    {
      "epoch": 0.5149425287356322,
      "grad_norm": 0.5989317893981934,
      "learning_rate": 0.0002737864077669903,
      "loss": 1.2734,
      "step": 56
    },
    {
      "epoch": 0.5241379310344828,
      "grad_norm": 0.5094353556632996,
      "learning_rate": 0.0002730582524271845,
      "loss": 1.063,
      "step": 57
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 0.5418046712875366,
      "learning_rate": 0.00027233009708737865,
      "loss": 1.1931,
      "step": 58
    },
    {
      "epoch": 0.542528735632184,
      "grad_norm": 0.5898640155792236,
      "learning_rate": 0.0002716019417475728,
      "loss": 1.1561,
      "step": 59
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.5417141914367676,
      "learning_rate": 0.00027087378640776696,
      "loss": 1.1772,
      "step": 60
    },
    {
      "epoch": 0.5609195402298851,
      "grad_norm": 0.5419390201568604,
      "learning_rate": 0.00027014563106796114,
      "loss": 1.1019,
      "step": 61
    },
    {
      "epoch": 0.5701149425287356,
      "grad_norm": 0.5117617845535278,
      "learning_rate": 0.0002694174757281553,
      "loss": 1.026,
      "step": 62
    },
    {
      "epoch": 0.5793103448275863,
      "grad_norm": 0.6913059949874878,
      "learning_rate": 0.0002686893203883495,
      "loss": 1.0382,
      "step": 63
    },
    {
      "epoch": 0.5885057471264368,
      "grad_norm": 0.7260013818740845,
      "learning_rate": 0.00026796116504854367,
      "loss": 1.0667,
      "step": 64
    },
    {
      "epoch": 0.5977011494252874,
      "grad_norm": 0.831520676612854,
      "learning_rate": 0.00026723300970873785,
      "loss": 1.2295,
      "step": 65
    },
    {
      "epoch": 0.6068965517241379,
      "grad_norm": 0.5325063467025757,
      "learning_rate": 0.00026650485436893203,
      "loss": 0.8899,
      "step": 66
    },
    {
      "epoch": 0.6160919540229886,
      "grad_norm": 0.5867132544517517,
      "learning_rate": 0.0002657766990291262,
      "loss": 1.1095,
      "step": 67
    },
    {
      "epoch": 0.6252873563218391,
      "grad_norm": 0.6068809628486633,
      "learning_rate": 0.0002650485436893204,
      "loss": 1.075,
      "step": 68
    },
    {
      "epoch": 0.6344827586206897,
      "grad_norm": 0.6909754872322083,
      "learning_rate": 0.00026432038834951456,
      "loss": 1.1673,
      "step": 69
    },
    {
      "epoch": 0.6436781609195402,
      "grad_norm": 0.7632415294647217,
      "learning_rate": 0.00026359223300970874,
      "loss": 1.1162,
      "step": 70
    },
    {
      "epoch": 0.6528735632183909,
      "grad_norm": 0.6888180375099182,
      "learning_rate": 0.0002628640776699029,
      "loss": 0.8784,
      "step": 71
    },
    {
      "epoch": 0.6620689655172414,
      "grad_norm": 0.6300679445266724,
      "learning_rate": 0.00026213592233009705,
      "loss": 0.8871,
      "step": 72
    },
    {
      "epoch": 0.671264367816092,
      "grad_norm": 0.6151922941207886,
      "learning_rate": 0.0002614077669902912,
      "loss": 0.9701,
      "step": 73
    },
    {
      "epoch": 0.6804597701149425,
      "grad_norm": 0.6928962469100952,
      "learning_rate": 0.0002606796116504854,
      "loss": 1.0456,
      "step": 74
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.848274827003479,
      "learning_rate": 0.0002599514563106796,
      "loss": 1.1089,
      "step": 75
    },
    {
      "epoch": 0.6988505747126437,
      "grad_norm": 1.1162086725234985,
      "learning_rate": 0.00025922330097087376,
      "loss": 1.1541,
      "step": 76
    },
    {
      "epoch": 0.7080459770114943,
      "grad_norm": 0.6158594489097595,
      "learning_rate": 0.00025849514563106794,
      "loss": 1.1431,
      "step": 77
    },
    {
      "epoch": 0.7172413793103448,
      "grad_norm": 0.6790457367897034,
      "learning_rate": 0.0002577669902912621,
      "loss": 0.9188,
      "step": 78
    },
    {
      "epoch": 0.7264367816091954,
      "grad_norm": 0.844353973865509,
      "learning_rate": 0.0002570388349514563,
      "loss": 1.2385,
      "step": 79
    },
    {
      "epoch": 0.735632183908046,
      "grad_norm": 0.645825207233429,
      "learning_rate": 0.0002563106796116505,
      "loss": 1.0516,
      "step": 80
    },
    {
      "epoch": 0.7448275862068966,
      "grad_norm": 0.5313035249710083,
      "learning_rate": 0.0002555825242718446,
      "loss": 1.046,
      "step": 81
    },
    {
      "epoch": 0.7540229885057471,
      "grad_norm": 0.6045447587966919,
      "learning_rate": 0.0002548543689320388,
      "loss": 0.9456,
      "step": 82
    },
    {
      "epoch": 0.7632183908045977,
      "grad_norm": 1.1198372840881348,
      "learning_rate": 0.00025412621359223296,
      "loss": 1.1684,
      "step": 83
    },
    {
      "epoch": 0.7724137931034483,
      "grad_norm": 0.6908996105194092,
      "learning_rate": 0.00025339805825242714,
      "loss": 1.1155,
      "step": 84
    },
    {
      "epoch": 0.7816091954022989,
      "grad_norm": 0.5916021466255188,
      "learning_rate": 0.0002526699029126213,
      "loss": 1.0781,
      "step": 85
    },
    {
      "epoch": 0.7908045977011494,
      "grad_norm": 0.8320909738540649,
      "learning_rate": 0.0002519417475728155,
      "loss": 0.7954,
      "step": 86
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.6855669617652893,
      "learning_rate": 0.00025121359223300967,
      "loss": 0.8968,
      "step": 87
    },
    {
      "epoch": 0.8091954022988506,
      "grad_norm": 0.6343971490859985,
      "learning_rate": 0.00025048543689320385,
      "loss": 0.9878,
      "step": 88
    },
    {
      "epoch": 0.8183908045977012,
      "grad_norm": 0.5901280641555786,
      "learning_rate": 0.00024975728155339803,
      "loss": 1.0158,
      "step": 89
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 0.5881906151771545,
      "learning_rate": 0.0002490291262135922,
      "loss": 0.9432,
      "step": 90
    },
    {
      "epoch": 0.8367816091954023,
      "grad_norm": 0.749911367893219,
      "learning_rate": 0.0002483009708737864,
      "loss": 0.8734,
      "step": 91
    },
    {
      "epoch": 0.8459770114942529,
      "grad_norm": 0.6801809668540955,
      "learning_rate": 0.00024757281553398056,
      "loss": 0.7291,
      "step": 92
    },
    {
      "epoch": 0.8551724137931035,
      "grad_norm": 0.5692021250724792,
      "learning_rate": 0.00024684466019417474,
      "loss": 0.976,
      "step": 93
    },
    {
      "epoch": 0.864367816091954,
      "grad_norm": 0.8974186182022095,
      "learning_rate": 0.0002461165048543689,
      "loss": 0.8702,
      "step": 94
    },
    {
      "epoch": 0.8735632183908046,
      "grad_norm": 0.7381451725959778,
      "learning_rate": 0.0002453883495145631,
      "loss": 0.832,
      "step": 95
    },
    {
      "epoch": 0.8827586206896552,
      "grad_norm": 0.5254162549972534,
      "learning_rate": 0.0002446601941747572,
      "loss": 0.7984,
      "step": 96
    },
    {
      "epoch": 0.8919540229885058,
      "grad_norm": 0.6398109197616577,
      "learning_rate": 0.00024393203883495143,
      "loss": 0.9934,
      "step": 97
    },
    {
      "epoch": 0.9011494252873563,
      "grad_norm": 0.6260827779769897,
      "learning_rate": 0.0002432038834951456,
      "loss": 1.049,
      "step": 98
    },
    {
      "epoch": 0.9103448275862069,
      "grad_norm": 0.7606930136680603,
      "learning_rate": 0.0002424757281553398,
      "loss": 0.8291,
      "step": 99
    },
    {
      "epoch": 0.9195402298850575,
      "grad_norm": 0.5728839039802551,
      "learning_rate": 0.00024174757281553394,
      "loss": 0.9048,
      "step": 100
    },
    {
      "epoch": 0.9287356321839081,
      "grad_norm": 0.8986696600914001,
      "learning_rate": 0.00024101941747572812,
      "loss": 0.8363,
      "step": 101
    },
    {
      "epoch": 0.9379310344827586,
      "grad_norm": 1.1614326238632202,
      "learning_rate": 0.0002402912621359223,
      "loss": 0.8226,
      "step": 102
    },
    {
      "epoch": 0.9471264367816092,
      "grad_norm": 0.6037288904190063,
      "learning_rate": 0.00023956310679611648,
      "loss": 0.9164,
      "step": 103
    },
    {
      "epoch": 0.9563218390804598,
      "grad_norm": 0.5823872685432434,
      "learning_rate": 0.00023883495145631065,
      "loss": 0.9207,
      "step": 104
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 0.6235625147819519,
      "learning_rate": 0.00023810679611650483,
      "loss": 0.762,
      "step": 105
    },
    {
      "epoch": 0.9747126436781609,
      "grad_norm": 0.5390623807907104,
      "learning_rate": 0.000237378640776699,
      "loss": 1.0991,
      "step": 106
    },
    {
      "epoch": 0.9839080459770115,
      "grad_norm": 0.5815601944923401,
      "learning_rate": 0.0002366504854368932,
      "loss": 0.9363,
      "step": 107
    },
    {
      "epoch": 0.993103448275862,
      "grad_norm": 0.5816079378128052,
      "learning_rate": 0.00023592233009708734,
      "loss": 0.7259,
      "step": 108
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.0381648540496826,
      "learning_rate": 0.00023519417475728152,
      "loss": 0.8426,
      "step": 109
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.300666093826294,
      "eval_runtime": 94.5929,
      "eval_samples_per_second": 3.51,
      "eval_steps_per_second": 1.755,
      "step": 109
    },
    {
      "epoch": 1.0091954022988505,
      "grad_norm": 0.5967744588851929,
      "learning_rate": 0.0002344660194174757,
      "loss": 0.7901,
      "step": 110
    },
    {
      "epoch": 1.018390804597701,
      "grad_norm": 0.5640788078308105,
      "learning_rate": 0.00023373786407766988,
      "loss": 0.6748,
      "step": 111
    },
    {
      "epoch": 1.0275862068965518,
      "grad_norm": 0.5988382697105408,
      "learning_rate": 0.00023300970873786406,
      "loss": 0.7256,
      "step": 112
    },
    {
      "epoch": 1.0367816091954023,
      "grad_norm": 0.8283673524856567,
      "learning_rate": 0.00023228155339805823,
      "loss": 0.5817,
      "step": 113
    },
    {
      "epoch": 1.0459770114942528,
      "grad_norm": 0.5982439517974854,
      "learning_rate": 0.0002315533980582524,
      "loss": 0.8342,
      "step": 114
    },
    {
      "epoch": 1.0551724137931036,
      "grad_norm": 0.9253256916999817,
      "learning_rate": 0.0002308252427184466,
      "loss": 0.6173,
      "step": 115
    },
    {
      "epoch": 1.064367816091954,
      "grad_norm": 0.58003830909729,
      "learning_rate": 0.00023009708737864074,
      "loss": 0.6153,
      "step": 116
    },
    {
      "epoch": 1.0735632183908046,
      "grad_norm": 0.9149603843688965,
      "learning_rate": 0.00022936893203883492,
      "loss": 0.7499,
      "step": 117
    },
    {
      "epoch": 1.0827586206896551,
      "grad_norm": 0.4855748414993286,
      "learning_rate": 0.0002286407766990291,
      "loss": 0.8476,
      "step": 118
    },
    {
      "epoch": 1.0919540229885056,
      "grad_norm": 0.5189658403396606,
      "learning_rate": 0.00022791262135922328,
      "loss": 0.6332,
      "step": 119
    },
    {
      "epoch": 1.1011494252873564,
      "grad_norm": 0.6228414177894592,
      "learning_rate": 0.00022718446601941746,
      "loss": 0.6543,
      "step": 120
    },
    {
      "epoch": 1.110344827586207,
      "grad_norm": 0.7163541913032532,
      "learning_rate": 0.00022645631067961164,
      "loss": 0.5062,
      "step": 121
    },
    {
      "epoch": 1.1195402298850574,
      "grad_norm": 0.6037282347679138,
      "learning_rate": 0.00022572815533980582,
      "loss": 0.4752,
      "step": 122
    },
    {
      "epoch": 1.1287356321839082,
      "grad_norm": 0.5334956049919128,
      "learning_rate": 0.000225,
      "loss": 0.5978,
      "step": 123
    },
    {
      "epoch": 1.1379310344827587,
      "grad_norm": 0.6316633224487305,
      "learning_rate": 0.00022427184466019415,
      "loss": 0.631,
      "step": 124
    },
    {
      "epoch": 1.1471264367816092,
      "grad_norm": 0.5869876742362976,
      "learning_rate": 0.00022354368932038832,
      "loss": 0.6083,
      "step": 125
    },
    {
      "epoch": 1.1563218390804597,
      "grad_norm": 0.8843062520027161,
      "learning_rate": 0.0002228155339805825,
      "loss": 0.5106,
      "step": 126
    },
    {
      "epoch": 1.1655172413793102,
      "grad_norm": 0.9645463228225708,
      "learning_rate": 0.00022208737864077668,
      "loss": 0.6352,
      "step": 127
    },
    {
      "epoch": 1.174712643678161,
      "grad_norm": 0.4931183457374573,
      "learning_rate": 0.00022135922330097086,
      "loss": 0.6054,
      "step": 128
    },
    {
      "epoch": 1.1839080459770115,
      "grad_norm": 0.5451965928077698,
      "learning_rate": 0.00022063106796116504,
      "loss": 0.6165,
      "step": 129
    },
    {
      "epoch": 1.193103448275862,
      "grad_norm": 0.44423708319664,
      "learning_rate": 0.00021990291262135922,
      "loss": 0.57,
      "step": 130
    },
    {
      "epoch": 1.2022988505747128,
      "grad_norm": 0.6561571955680847,
      "learning_rate": 0.00021917475728155337,
      "loss": 0.5627,
      "step": 131
    },
    {
      "epoch": 1.2114942528735633,
      "grad_norm": 0.4954622983932495,
      "learning_rate": 0.00021844660194174755,
      "loss": 0.6054,
      "step": 132
    },
    {
      "epoch": 1.2206896551724138,
      "grad_norm": 0.49996840953826904,
      "learning_rate": 0.00021771844660194173,
      "loss": 0.7044,
      "step": 133
    },
    {
      "epoch": 1.2298850574712643,
      "grad_norm": 0.6132878065109253,
      "learning_rate": 0.0002169902912621359,
      "loss": 0.5854,
      "step": 134
    },
    {
      "epoch": 1.2390804597701148,
      "grad_norm": 0.619061291217804,
      "learning_rate": 0.00021626213592233008,
      "loss": 0.7435,
      "step": 135
    },
    {
      "epoch": 1.2482758620689656,
      "grad_norm": 0.5794959664344788,
      "learning_rate": 0.00021553398058252426,
      "loss": 0.8223,
      "step": 136
    },
    {
      "epoch": 1.257471264367816,
      "grad_norm": 0.595934271812439,
      "learning_rate": 0.00021480582524271844,
      "loss": 0.4924,
      "step": 137
    },
    {
      "epoch": 1.2666666666666666,
      "grad_norm": 0.7381154298782349,
      "learning_rate": 0.00021407766990291262,
      "loss": 0.5804,
      "step": 138
    },
    {
      "epoch": 1.2758620689655173,
      "grad_norm": 0.6467525362968445,
      "learning_rate": 0.00021334951456310677,
      "loss": 0.6435,
      "step": 139
    },
    {
      "epoch": 1.2850574712643679,
      "grad_norm": 0.5511316657066345,
      "learning_rate": 0.00021262135922330095,
      "loss": 0.6627,
      "step": 140
    },
    {
      "epoch": 1.2942528735632184,
      "grad_norm": 0.6253796815872192,
      "learning_rate": 0.00021189320388349513,
      "loss": 0.7729,
      "step": 141
    },
    {
      "epoch": 1.303448275862069,
      "grad_norm": 0.4994090795516968,
      "learning_rate": 0.0002111650485436893,
      "loss": 0.4122,
      "step": 142
    },
    {
      "epoch": 1.3126436781609194,
      "grad_norm": 0.35405322909355164,
      "learning_rate": 0.00021043689320388349,
      "loss": 0.3711,
      "step": 143
    },
    {
      "epoch": 1.3218390804597702,
      "grad_norm": 0.5248880982398987,
      "learning_rate": 0.00020970873786407766,
      "loss": 0.6734,
      "step": 144
    },
    {
      "epoch": 1.3310344827586207,
      "grad_norm": 0.44895875453948975,
      "learning_rate": 0.00020898058252427184,
      "loss": 0.3438,
      "step": 145
    },
    {
      "epoch": 1.3402298850574712,
      "grad_norm": 0.4654625654220581,
      "learning_rate": 0.00020825242718446602,
      "loss": 0.4324,
      "step": 146
    },
    {
      "epoch": 1.349425287356322,
      "grad_norm": 0.4388936460018158,
      "learning_rate": 0.00020752427184466017,
      "loss": 0.4578,
      "step": 147
    },
    {
      "epoch": 1.3586206896551725,
      "grad_norm": 0.4960116744041443,
      "learning_rate": 0.00020679611650485435,
      "loss": 0.5942,
      "step": 148
    },
    {
      "epoch": 1.367816091954023,
      "grad_norm": 0.590185284614563,
      "learning_rate": 0.00020606796116504853,
      "loss": 0.4179,
      "step": 149
    },
    {
      "epoch": 1.3770114942528735,
      "grad_norm": 0.5173139572143555,
      "learning_rate": 0.0002053398058252427,
      "loss": 0.5081,
      "step": 150
    },
    {
      "epoch": 1.386206896551724,
      "grad_norm": 0.5537795424461365,
      "learning_rate": 0.0002046116504854369,
      "loss": 0.6486,
      "step": 151
    },
    {
      "epoch": 1.3954022988505748,
      "grad_norm": 0.593481183052063,
      "learning_rate": 0.00020388349514563107,
      "loss": 0.5944,
      "step": 152
    },
    {
      "epoch": 1.4045977011494253,
      "grad_norm": 0.5522420406341553,
      "learning_rate": 0.00020315533980582524,
      "loss": 0.8094,
      "step": 153
    },
    {
      "epoch": 1.4137931034482758,
      "grad_norm": 0.6627272963523865,
      "learning_rate": 0.00020242718446601942,
      "loss": 0.4802,
      "step": 154
    },
    {
      "epoch": 1.4229885057471265,
      "grad_norm": 0.3878915011882782,
      "learning_rate": 0.00020169902912621357,
      "loss": 0.4832,
      "step": 155
    },
    {
      "epoch": 1.432183908045977,
      "grad_norm": 0.5121778845787048,
      "learning_rate": 0.00020097087378640775,
      "loss": 0.6316,
      "step": 156
    },
    {
      "epoch": 1.4413793103448276,
      "grad_norm": 0.41354355216026306,
      "learning_rate": 0.00020024271844660193,
      "loss": 0.5819,
      "step": 157
    },
    {
      "epoch": 1.450574712643678,
      "grad_norm": 0.35058164596557617,
      "learning_rate": 0.0001995145631067961,
      "loss": 0.4422,
      "step": 158
    },
    {
      "epoch": 1.4597701149425286,
      "grad_norm": 0.5187242031097412,
      "learning_rate": 0.0001987864077669903,
      "loss": 0.5037,
      "step": 159
    },
    {
      "epoch": 1.4689655172413794,
      "grad_norm": 0.7884855270385742,
      "learning_rate": 0.00019805825242718447,
      "loss": 0.687,
      "step": 160
    },
    {
      "epoch": 1.4781609195402299,
      "grad_norm": 0.6305707693099976,
      "learning_rate": 0.00019733009708737865,
      "loss": 0.6576,
      "step": 161
    },
    {
      "epoch": 1.4873563218390804,
      "grad_norm": 0.5323311686515808,
      "learning_rate": 0.0001966019417475728,
      "loss": 0.5083,
      "step": 162
    },
    {
      "epoch": 1.4965517241379311,
      "grad_norm": 0.4653192162513733,
      "learning_rate": 0.00019587378640776698,
      "loss": 0.4232,
      "step": 163
    },
    {
      "epoch": 1.5057471264367817,
      "grad_norm": 0.42282363772392273,
      "learning_rate": 0.00019514563106796116,
      "loss": 0.46,
      "step": 164
    },
    {
      "epoch": 1.5149425287356322,
      "grad_norm": 0.5382373929023743,
      "learning_rate": 0.00019441747572815533,
      "loss": 0.6501,
      "step": 165
    },
    {
      "epoch": 1.524137931034483,
      "grad_norm": 0.49478620290756226,
      "learning_rate": 0.0001936893203883495,
      "loss": 0.6337,
      "step": 166
    },
    {
      "epoch": 1.5333333333333332,
      "grad_norm": 0.3653823435306549,
      "learning_rate": 0.0001929611650485437,
      "loss": 0.5887,
      "step": 167
    },
    {
      "epoch": 1.542528735632184,
      "grad_norm": 0.8347523808479309,
      "learning_rate": 0.00019223300970873787,
      "loss": 0.5192,
      "step": 168
    },
    {
      "epoch": 1.5517241379310345,
      "grad_norm": 0.559551477432251,
      "learning_rate": 0.00019150485436893205,
      "loss": 0.3474,
      "step": 169
    },
    {
      "epoch": 1.560919540229885,
      "grad_norm": 0.42149192094802856,
      "learning_rate": 0.0001907766990291262,
      "loss": 0.407,
      "step": 170
    },
    {
      "epoch": 1.5701149425287357,
      "grad_norm": 0.5364254117012024,
      "learning_rate": 0.00019004854368932038,
      "loss": 0.7923,
      "step": 171
    },
    {
      "epoch": 1.5793103448275863,
      "grad_norm": 0.5137253999710083,
      "learning_rate": 0.00018932038834951456,
      "loss": 0.5499,
      "step": 172
    },
    {
      "epoch": 1.5885057471264368,
      "grad_norm": 0.5608237981796265,
      "learning_rate": 0.00018859223300970874,
      "loss": 0.5724,
      "step": 173
    },
    {
      "epoch": 1.5977011494252875,
      "grad_norm": 0.34653526544570923,
      "learning_rate": 0.00018786407766990291,
      "loss": 0.3985,
      "step": 174
    },
    {
      "epoch": 1.6068965517241378,
      "grad_norm": 0.6667991876602173,
      "learning_rate": 0.00018713592233009707,
      "loss": 0.6257,
      "step": 175
    },
    {
      "epoch": 1.6160919540229886,
      "grad_norm": 0.5803218483924866,
      "learning_rate": 0.00018640776699029122,
      "loss": 0.7256,
      "step": 176
    },
    {
      "epoch": 1.625287356321839,
      "grad_norm": 0.3350389897823334,
      "learning_rate": 0.0001856796116504854,
      "loss": 0.3256,
      "step": 177
    },
    {
      "epoch": 1.6344827586206896,
      "grad_norm": 0.2988188862800598,
      "learning_rate": 0.00018495145631067957,
      "loss": 0.325,
      "step": 178
    },
    {
      "epoch": 1.6436781609195403,
      "grad_norm": 0.540225088596344,
      "learning_rate": 0.00018422330097087375,
      "loss": 0.7405,
      "step": 179
    },
    {
      "epoch": 1.6528735632183909,
      "grad_norm": 0.46998921036720276,
      "learning_rate": 0.00018349514563106793,
      "loss": 0.5965,
      "step": 180
    },
    {
      "epoch": 1.6620689655172414,
      "grad_norm": 0.42614537477493286,
      "learning_rate": 0.0001827669902912621,
      "loss": 0.5187,
      "step": 181
    },
    {
      "epoch": 1.6712643678160921,
      "grad_norm": 0.5146321058273315,
      "learning_rate": 0.0001820388349514563,
      "loss": 0.6134,
      "step": 182
    },
    {
      "epoch": 1.6804597701149424,
      "grad_norm": 0.5366286635398865,
      "learning_rate": 0.00018131067961165047,
      "loss": 0.4942,
      "step": 183
    },
    {
      "epoch": 1.6896551724137931,
      "grad_norm": 0.5794548392295837,
      "learning_rate": 0.00018058252427184462,
      "loss": 0.6949,
      "step": 184
    },
    {
      "epoch": 1.6988505747126437,
      "grad_norm": 0.4413406550884247,
      "learning_rate": 0.0001798543689320388,
      "loss": 0.5908,
      "step": 185
    },
    {
      "epoch": 1.7080459770114942,
      "grad_norm": 0.5551508069038391,
      "learning_rate": 0.00017912621359223298,
      "loss": 0.4899,
      "step": 186
    },
    {
      "epoch": 1.717241379310345,
      "grad_norm": 0.5020127892494202,
      "learning_rate": 0.00017839805825242716,
      "loss": 0.3787,
      "step": 187
    },
    {
      "epoch": 1.7264367816091954,
      "grad_norm": 0.5126776695251465,
      "learning_rate": 0.00017766990291262133,
      "loss": 0.4694,
      "step": 188
    },
    {
      "epoch": 1.735632183908046,
      "grad_norm": 0.5620916485786438,
      "learning_rate": 0.0001769417475728155,
      "loss": 0.5547,
      "step": 189
    },
    {
      "epoch": 1.7448275862068967,
      "grad_norm": 0.4924725294113159,
      "learning_rate": 0.0001762135922330097,
      "loss": 0.5656,
      "step": 190
    },
    {
      "epoch": 1.754022988505747,
      "grad_norm": 0.3647457957267761,
      "learning_rate": 0.00017548543689320387,
      "loss": 0.5137,
      "step": 191
    },
    {
      "epoch": 1.7632183908045977,
      "grad_norm": 0.4956059455871582,
      "learning_rate": 0.00017475728155339802,
      "loss": 0.6375,
      "step": 192
    },
    {
      "epoch": 1.7724137931034483,
      "grad_norm": 0.3766675591468811,
      "learning_rate": 0.0001740291262135922,
      "loss": 0.5081,
      "step": 193
    },
    {
      "epoch": 1.7816091954022988,
      "grad_norm": 0.3699629604816437,
      "learning_rate": 0.00017330097087378638,
      "loss": 0.3969,
      "step": 194
    },
    {
      "epoch": 1.7908045977011495,
      "grad_norm": 0.35047248005867004,
      "learning_rate": 0.00017257281553398056,
      "loss": 0.4473,
      "step": 195
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.5037795901298523,
      "learning_rate": 0.00017184466019417474,
      "loss": 0.4882,
      "step": 196
    },
    {
      "epoch": 1.8091954022988506,
      "grad_norm": 0.46976998448371887,
      "learning_rate": 0.00017111650485436891,
      "loss": 0.5307,
      "step": 197
    },
    {
      "epoch": 1.8183908045977013,
      "grad_norm": 0.5248985290527344,
      "learning_rate": 0.0001703883495145631,
      "loss": 0.5062,
      "step": 198
    },
    {
      "epoch": 1.8275862068965516,
      "grad_norm": 0.4395950436592102,
      "learning_rate": 0.00016966019417475724,
      "loss": 0.5184,
      "step": 199
    },
    {
      "epoch": 1.8367816091954023,
      "grad_norm": 0.31456464529037476,
      "learning_rate": 0.00016893203883495142,
      "loss": 0.3707,
      "step": 200
    },
    {
      "epoch": 1.8459770114942529,
      "grad_norm": 0.39423462748527527,
      "learning_rate": 0.0001682038834951456,
      "loss": 0.3915,
      "step": 201
    },
    {
      "epoch": 1.8551724137931034,
      "grad_norm": 0.3807072937488556,
      "learning_rate": 0.00016747572815533978,
      "loss": 0.4203,
      "step": 202
    },
    {
      "epoch": 1.8643678160919541,
      "grad_norm": 0.49600493907928467,
      "learning_rate": 0.00016674757281553396,
      "loss": 0.5633,
      "step": 203
    },
    {
      "epoch": 1.8735632183908046,
      "grad_norm": 0.4472233057022095,
      "learning_rate": 0.00016601941747572814,
      "loss": 0.5791,
      "step": 204
    },
    {
      "epoch": 1.8827586206896552,
      "grad_norm": 0.48306939005851746,
      "learning_rate": 0.00016529126213592232,
      "loss": 0.4091,
      "step": 205
    },
    {
      "epoch": 1.891954022988506,
      "grad_norm": 0.40710389614105225,
      "learning_rate": 0.0001645631067961165,
      "loss": 0.5302,
      "step": 206
    },
    {
      "epoch": 1.9011494252873562,
      "grad_norm": 0.37706899642944336,
      "learning_rate": 0.00016383495145631065,
      "loss": 0.416,
      "step": 207
    },
    {
      "epoch": 1.910344827586207,
      "grad_norm": 0.5805254578590393,
      "learning_rate": 0.00016310679611650483,
      "loss": 0.6861,
      "step": 208
    },
    {
      "epoch": 1.9195402298850575,
      "grad_norm": 0.44542425870895386,
      "learning_rate": 0.000162378640776699,
      "loss": 0.5468,
      "step": 209
    },
    {
      "epoch": 1.928735632183908,
      "grad_norm": 0.44782838225364685,
      "learning_rate": 0.00016165048543689318,
      "loss": 0.4827,
      "step": 210
    },
    {
      "epoch": 1.9379310344827587,
      "grad_norm": 0.30875957012176514,
      "learning_rate": 0.00016092233009708736,
      "loss": 0.3472,
      "step": 211
    },
    {
      "epoch": 1.9471264367816092,
      "grad_norm": 0.4443942904472351,
      "learning_rate": 0.00016019417475728154,
      "loss": 0.5595,
      "step": 212
    },
    {
      "epoch": 1.9563218390804598,
      "grad_norm": 0.32429659366607666,
      "learning_rate": 0.00015946601941747572,
      "loss": 0.374,
      "step": 213
    },
    {
      "epoch": 1.9655172413793105,
      "grad_norm": 0.339242160320282,
      "learning_rate": 0.0001587378640776699,
      "loss": 0.4155,
      "step": 214
    },
    {
      "epoch": 1.9747126436781608,
      "grad_norm": 0.5646737813949585,
      "learning_rate": 0.00015800970873786405,
      "loss": 0.8037,
      "step": 215
    },
    {
      "epoch": 1.9839080459770115,
      "grad_norm": 0.5335783958435059,
      "learning_rate": 0.00015728155339805823,
      "loss": 0.7524,
      "step": 216
    },
    {
      "epoch": 1.993103448275862,
      "grad_norm": 0.46629971265792847,
      "learning_rate": 0.0001565533980582524,
      "loss": 0.598,
      "step": 217
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.7031301856040955,
      "learning_rate": 0.00015582524271844658,
      "loss": 0.7793,
      "step": 218
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.3836089372634888,
      "eval_runtime": 94.5395,
      "eval_samples_per_second": 3.512,
      "eval_steps_per_second": 1.756,
      "step": 218
    },
    {
      "epoch": 2.0091954022988507,
      "grad_norm": 0.3298664093017578,
      "learning_rate": 0.00015509708737864076,
      "loss": 0.3196,
      "step": 219
    },
    {
      "epoch": 2.018390804597701,
      "grad_norm": 0.4151831567287445,
      "learning_rate": 0.00015436893203883494,
      "loss": 0.4209,
      "step": 220
    },
    {
      "epoch": 2.027586206896552,
      "grad_norm": 0.35617533326148987,
      "learning_rate": 0.00015364077669902912,
      "loss": 0.4023,
      "step": 221
    },
    {
      "epoch": 2.036781609195402,
      "grad_norm": 0.41707131266593933,
      "learning_rate": 0.0001529126213592233,
      "loss": 0.4355,
      "step": 222
    },
    {
      "epoch": 2.045977011494253,
      "grad_norm": 0.3739112913608551,
      "learning_rate": 0.00015218446601941745,
      "loss": 0.3309,
      "step": 223
    },
    {
      "epoch": 2.0551724137931036,
      "grad_norm": 0.529603123664856,
      "learning_rate": 0.00015145631067961163,
      "loss": 0.3706,
      "step": 224
    },
    {
      "epoch": 2.064367816091954,
      "grad_norm": 0.4731467664241791,
      "learning_rate": 0.0001507281553398058,
      "loss": 0.379,
      "step": 225
    },
    {
      "epoch": 2.0735632183908046,
      "grad_norm": 0.5218953490257263,
      "learning_rate": 0.00015,
      "loss": 0.3467,
      "step": 226
    },
    {
      "epoch": 2.0827586206896553,
      "grad_norm": 0.7507036924362183,
      "learning_rate": 0.00014927184466019417,
      "loss": 0.6107,
      "step": 227
    },
    {
      "epoch": 2.0919540229885056,
      "grad_norm": 0.44099223613739014,
      "learning_rate": 0.00014854368932038834,
      "loss": 0.3186,
      "step": 228
    },
    {
      "epoch": 2.1011494252873564,
      "grad_norm": 0.4234969913959503,
      "learning_rate": 0.00014781553398058252,
      "loss": 0.3482,
      "step": 229
    },
    {
      "epoch": 2.110344827586207,
      "grad_norm": 0.49509719014167786,
      "learning_rate": 0.00014708737864077667,
      "loss": 0.3985,
      "step": 230
    },
    {
      "epoch": 2.1195402298850574,
      "grad_norm": 0.43987375497817993,
      "learning_rate": 0.00014635922330097085,
      "loss": 0.3618,
      "step": 231
    },
    {
      "epoch": 2.128735632183908,
      "grad_norm": 0.6162170767784119,
      "learning_rate": 0.00014563106796116503,
      "loss": 0.4423,
      "step": 232
    },
    {
      "epoch": 2.1379310344827585,
      "grad_norm": 0.3867994248867035,
      "learning_rate": 0.0001449029126213592,
      "loss": 0.3419,
      "step": 233
    },
    {
      "epoch": 2.147126436781609,
      "grad_norm": 0.7177544832229614,
      "learning_rate": 0.0001441747572815534,
      "loss": 0.4747,
      "step": 234
    },
    {
      "epoch": 2.15632183908046,
      "grad_norm": 0.5735111236572266,
      "learning_rate": 0.00014344660194174757,
      "loss": 0.4305,
      "step": 235
    },
    {
      "epoch": 2.1655172413793102,
      "grad_norm": 0.5527003407478333,
      "learning_rate": 0.00014271844660194175,
      "loss": 0.3601,
      "step": 236
    },
    {
      "epoch": 2.174712643678161,
      "grad_norm": 0.5799367427825928,
      "learning_rate": 0.00014199029126213592,
      "loss": 0.4681,
      "step": 237
    },
    {
      "epoch": 2.1839080459770113,
      "grad_norm": 0.5536375045776367,
      "learning_rate": 0.00014126213592233008,
      "loss": 0.4431,
      "step": 238
    },
    {
      "epoch": 2.193103448275862,
      "grad_norm": 0.45677146315574646,
      "learning_rate": 0.00014053398058252425,
      "loss": 0.3618,
      "step": 239
    },
    {
      "epoch": 2.2022988505747128,
      "grad_norm": 0.464231938123703,
      "learning_rate": 0.00013980582524271843,
      "loss": 0.3164,
      "step": 240
    },
    {
      "epoch": 2.211494252873563,
      "grad_norm": 0.7074082493782043,
      "learning_rate": 0.0001390776699029126,
      "loss": 0.5988,
      "step": 241
    },
    {
      "epoch": 2.220689655172414,
      "grad_norm": 0.4413163363933563,
      "learning_rate": 0.0001383495145631068,
      "loss": 0.2946,
      "step": 242
    },
    {
      "epoch": 2.2298850574712645,
      "grad_norm": 0.3606387972831726,
      "learning_rate": 0.00013762135922330097,
      "loss": 0.2709,
      "step": 243
    },
    {
      "epoch": 2.239080459770115,
      "grad_norm": 0.6036794185638428,
      "learning_rate": 0.00013689320388349515,
      "loss": 0.5043,
      "step": 244
    },
    {
      "epoch": 2.2482758620689656,
      "grad_norm": 0.7041867971420288,
      "learning_rate": 0.00013616504854368933,
      "loss": 0.6161,
      "step": 245
    },
    {
      "epoch": 2.2574712643678163,
      "grad_norm": 0.36813509464263916,
      "learning_rate": 0.00013543689320388348,
      "loss": 0.2538,
      "step": 246
    },
    {
      "epoch": 2.2666666666666666,
      "grad_norm": 0.6574187874794006,
      "learning_rate": 0.00013470873786407766,
      "loss": 0.5472,
      "step": 247
    },
    {
      "epoch": 2.2758620689655173,
      "grad_norm": 0.3584626317024231,
      "learning_rate": 0.00013398058252427184,
      "loss": 0.2486,
      "step": 248
    },
    {
      "epoch": 2.2850574712643676,
      "grad_norm": 0.7730153203010559,
      "learning_rate": 0.00013325242718446601,
      "loss": 0.4867,
      "step": 249
    },
    {
      "epoch": 2.2942528735632184,
      "grad_norm": 0.6601793766021729,
      "learning_rate": 0.0001325242718446602,
      "loss": 0.6158,
      "step": 250
    },
    {
      "epoch": 2.303448275862069,
      "grad_norm": 0.42591384053230286,
      "learning_rate": 0.00013179611650485437,
      "loss": 0.3484,
      "step": 251
    },
    {
      "epoch": 2.3126436781609194,
      "grad_norm": 0.3791221082210541,
      "learning_rate": 0.00013106796116504852,
      "loss": 0.2861,
      "step": 252
    },
    {
      "epoch": 2.32183908045977,
      "grad_norm": 0.5107592940330505,
      "learning_rate": 0.0001303398058252427,
      "loss": 0.4219,
      "step": 253
    },
    {
      "epoch": 2.3310344827586205,
      "grad_norm": 0.43515774607658386,
      "learning_rate": 0.00012961165048543688,
      "loss": 0.32,
      "step": 254
    },
    {
      "epoch": 2.340229885057471,
      "grad_norm": 0.45249196887016296,
      "learning_rate": 0.00012888349514563106,
      "loss": 0.3144,
      "step": 255
    },
    {
      "epoch": 2.349425287356322,
      "grad_norm": 0.5824540853500366,
      "learning_rate": 0.00012815533980582524,
      "loss": 0.4423,
      "step": 256
    },
    {
      "epoch": 2.3586206896551722,
      "grad_norm": 0.680497407913208,
      "learning_rate": 0.0001274271844660194,
      "loss": 0.579,
      "step": 257
    },
    {
      "epoch": 2.367816091954023,
      "grad_norm": 0.45555397868156433,
      "learning_rate": 0.00012669902912621357,
      "loss": 0.3296,
      "step": 258
    },
    {
      "epoch": 2.3770114942528737,
      "grad_norm": 0.5558891892433167,
      "learning_rate": 0.00012597087378640775,
      "loss": 0.4183,
      "step": 259
    },
    {
      "epoch": 2.386206896551724,
      "grad_norm": 0.6155978441238403,
      "learning_rate": 0.00012524271844660192,
      "loss": 0.4989,
      "step": 260
    },
    {
      "epoch": 2.3954022988505748,
      "grad_norm": 0.5432369709014893,
      "learning_rate": 0.0001245145631067961,
      "loss": 0.4149,
      "step": 261
    },
    {
      "epoch": 2.4045977011494255,
      "grad_norm": 0.3622282147407532,
      "learning_rate": 0.00012378640776699028,
      "loss": 0.2948,
      "step": 262
    },
    {
      "epoch": 2.413793103448276,
      "grad_norm": 0.42852282524108887,
      "learning_rate": 0.00012305825242718446,
      "loss": 0.2617,
      "step": 263
    },
    {
      "epoch": 2.4229885057471265,
      "grad_norm": 0.3979598879814148,
      "learning_rate": 0.0001223300970873786,
      "loss": 0.221,
      "step": 264
    },
    {
      "epoch": 2.432183908045977,
      "grad_norm": 0.4943128526210785,
      "learning_rate": 0.0001216019417475728,
      "loss": 0.31,
      "step": 265
    },
    {
      "epoch": 2.4413793103448276,
      "grad_norm": 0.5944445133209229,
      "learning_rate": 0.00012087378640776697,
      "loss": 0.5284,
      "step": 266
    },
    {
      "epoch": 2.4505747126436783,
      "grad_norm": 0.5441230535507202,
      "learning_rate": 0.00012014563106796115,
      "loss": 0.4091,
      "step": 267
    },
    {
      "epoch": 2.4597701149425286,
      "grad_norm": 0.3592386543750763,
      "learning_rate": 0.00011941747572815533,
      "loss": 0.2881,
      "step": 268
    },
    {
      "epoch": 2.4689655172413794,
      "grad_norm": 0.4022001028060913,
      "learning_rate": 0.0001186893203883495,
      "loss": 0.3005,
      "step": 269
    },
    {
      "epoch": 2.4781609195402297,
      "grad_norm": 0.3709545433521271,
      "learning_rate": 0.00011796116504854367,
      "loss": 0.261,
      "step": 270
    },
    {
      "epoch": 2.4873563218390804,
      "grad_norm": 0.4119846522808075,
      "learning_rate": 0.00011723300970873785,
      "loss": 0.284,
      "step": 271
    },
    {
      "epoch": 2.496551724137931,
      "grad_norm": 0.5992346405982971,
      "learning_rate": 0.00011650485436893203,
      "loss": 0.6485,
      "step": 272
    },
    {
      "epoch": 2.5057471264367814,
      "grad_norm": 0.3555474579334259,
      "learning_rate": 0.0001157766990291262,
      "loss": 0.2032,
      "step": 273
    },
    {
      "epoch": 2.514942528735632,
      "grad_norm": 0.40004169940948486,
      "learning_rate": 0.00011504854368932037,
      "loss": 0.3559,
      "step": 274
    },
    {
      "epoch": 2.524137931034483,
      "grad_norm": 0.5196751356124878,
      "learning_rate": 0.00011432038834951455,
      "loss": 0.3952,
      "step": 275
    },
    {
      "epoch": 2.533333333333333,
      "grad_norm": 0.2508908808231354,
      "learning_rate": 0.00011359223300970873,
      "loss": 0.1867,
      "step": 276
    },
    {
      "epoch": 2.542528735632184,
      "grad_norm": 0.5024006366729736,
      "learning_rate": 0.00011286407766990291,
      "loss": 0.4146,
      "step": 277
    },
    {
      "epoch": 2.5517241379310347,
      "grad_norm": 0.48392558097839355,
      "learning_rate": 0.00011213592233009707,
      "loss": 0.404,
      "step": 278
    },
    {
      "epoch": 2.560919540229885,
      "grad_norm": 0.3421654999256134,
      "learning_rate": 0.00011140776699029125,
      "loss": 0.2727,
      "step": 279
    },
    {
      "epoch": 2.5701149425287357,
      "grad_norm": 0.32812729477882385,
      "learning_rate": 0.00011067961165048543,
      "loss": 0.2466,
      "step": 280
    },
    {
      "epoch": 2.5793103448275865,
      "grad_norm": 0.3916667401790619,
      "learning_rate": 0.00010995145631067961,
      "loss": 0.2962,
      "step": 281
    },
    {
      "epoch": 2.5885057471264368,
      "grad_norm": 0.41674014925956726,
      "learning_rate": 0.00010922330097087377,
      "loss": 0.3091,
      "step": 282
    },
    {
      "epoch": 2.5977011494252875,
      "grad_norm": 0.5579612851142883,
      "learning_rate": 0.00010849514563106795,
      "loss": 0.4515,
      "step": 283
    },
    {
      "epoch": 2.606896551724138,
      "grad_norm": 0.31362155079841614,
      "learning_rate": 0.00010776699029126213,
      "loss": 0.209,
      "step": 284
    },
    {
      "epoch": 2.6160919540229886,
      "grad_norm": 0.3783795237541199,
      "learning_rate": 0.00010703883495145631,
      "loss": 0.2683,
      "step": 285
    },
    {
      "epoch": 2.625287356321839,
      "grad_norm": 0.4127572178840637,
      "learning_rate": 0.00010631067961165047,
      "loss": 0.3164,
      "step": 286
    },
    {
      "epoch": 2.6344827586206896,
      "grad_norm": 0.42433518171310425,
      "learning_rate": 0.00010558252427184465,
      "loss": 0.2813,
      "step": 287
    },
    {
      "epoch": 2.6436781609195403,
      "grad_norm": 0.26535582542419434,
      "learning_rate": 0.00010485436893203883,
      "loss": 0.1638,
      "step": 288
    },
    {
      "epoch": 2.6528735632183906,
      "grad_norm": 0.47647133469581604,
      "learning_rate": 0.00010412621359223301,
      "loss": 0.3998,
      "step": 289
    },
    {
      "epoch": 2.6620689655172414,
      "grad_norm": 0.4837816059589386,
      "learning_rate": 0.00010339805825242718,
      "loss": 0.3634,
      "step": 290
    },
    {
      "epoch": 2.671264367816092,
      "grad_norm": 0.5139144062995911,
      "learning_rate": 0.00010266990291262135,
      "loss": 0.4553,
      "step": 291
    },
    {
      "epoch": 2.6804597701149424,
      "grad_norm": 0.42559128999710083,
      "learning_rate": 0.00010194174757281553,
      "loss": 0.2803,
      "step": 292
    },
    {
      "epoch": 2.689655172413793,
      "grad_norm": 0.5253325700759888,
      "learning_rate": 0.00010121359223300971,
      "loss": 0.4476,
      "step": 293
    },
    {
      "epoch": 2.698850574712644,
      "grad_norm": 0.42015260457992554,
      "learning_rate": 0.00010048543689320388,
      "loss": 0.3619,
      "step": 294
    },
    {
      "epoch": 2.708045977011494,
      "grad_norm": 0.7167945504188538,
      "learning_rate": 9.975728155339806e-05,
      "loss": 0.6277,
      "step": 295
    },
    {
      "epoch": 2.717241379310345,
      "grad_norm": 0.596862256526947,
      "learning_rate": 9.902912621359223e-05,
      "loss": 0.454,
      "step": 296
    },
    {
      "epoch": 2.7264367816091957,
      "grad_norm": 0.391558974981308,
      "learning_rate": 9.83009708737864e-05,
      "loss": 0.3283,
      "step": 297
    },
    {
      "epoch": 2.735632183908046,
      "grad_norm": 0.4755273759365082,
      "learning_rate": 9.757281553398058e-05,
      "loss": 0.4141,
      "step": 298
    },
    {
      "epoch": 2.7448275862068967,
      "grad_norm": 0.37474825978279114,
      "learning_rate": 9.684466019417476e-05,
      "loss": 0.285,
      "step": 299
    },
    {
      "epoch": 2.754022988505747,
      "grad_norm": 0.47797009348869324,
      "learning_rate": 9.611650485436893e-05,
      "loss": 0.3485,
      "step": 300
    },
    {
      "epoch": 2.7632183908045977,
      "grad_norm": 0.4562835395336151,
      "learning_rate": 9.53883495145631e-05,
      "loss": 0.3423,
      "step": 301
    },
    {
      "epoch": 2.772413793103448,
      "grad_norm": 0.6650474667549133,
      "learning_rate": 9.466019417475728e-05,
      "loss": 0.4271,
      "step": 302
    },
    {
      "epoch": 2.781609195402299,
      "grad_norm": 0.4036124050617218,
      "learning_rate": 9.393203883495146e-05,
      "loss": 0.2785,
      "step": 303
    },
    {
      "epoch": 2.7908045977011495,
      "grad_norm": 0.6422847509384155,
      "learning_rate": 9.320388349514561e-05,
      "loss": 0.4546,
      "step": 304
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.4403219521045685,
      "learning_rate": 9.247572815533979e-05,
      "loss": 0.2975,
      "step": 305
    },
    {
      "epoch": 2.8091954022988506,
      "grad_norm": 0.34099337458610535,
      "learning_rate": 9.174757281553397e-05,
      "loss": 0.2092,
      "step": 306
    },
    {
      "epoch": 2.8183908045977013,
      "grad_norm": 0.6565693020820618,
      "learning_rate": 9.101941747572814e-05,
      "loss": 0.589,
      "step": 307
    },
    {
      "epoch": 2.8275862068965516,
      "grad_norm": 0.29657208919525146,
      "learning_rate": 9.029126213592231e-05,
      "loss": 0.2206,
      "step": 308
    },
    {
      "epoch": 2.8367816091954023,
      "grad_norm": 0.35843998193740845,
      "learning_rate": 8.956310679611649e-05,
      "loss": 0.2576,
      "step": 309
    },
    {
      "epoch": 2.845977011494253,
      "grad_norm": 0.39961719512939453,
      "learning_rate": 8.883495145631067e-05,
      "loss": 0.2589,
      "step": 310
    },
    {
      "epoch": 2.8551724137931034,
      "grad_norm": 0.2650497257709503,
      "learning_rate": 8.810679611650485e-05,
      "loss": 0.1499,
      "step": 311
    },
    {
      "epoch": 2.864367816091954,
      "grad_norm": 0.6040882468223572,
      "learning_rate": 8.737864077669901e-05,
      "loss": 0.5003,
      "step": 312
    },
    {
      "epoch": 2.873563218390805,
      "grad_norm": 0.36157143115997314,
      "learning_rate": 8.665048543689319e-05,
      "loss": 0.3019,
      "step": 313
    },
    {
      "epoch": 2.882758620689655,
      "grad_norm": 0.34061622619628906,
      "learning_rate": 8.592233009708737e-05,
      "loss": 0.26,
      "step": 314
    },
    {
      "epoch": 2.891954022988506,
      "grad_norm": 0.6060124635696411,
      "learning_rate": 8.519417475728155e-05,
      "loss": 0.5343,
      "step": 315
    },
    {
      "epoch": 2.901149425287356,
      "grad_norm": 0.536323070526123,
      "learning_rate": 8.446601941747571e-05,
      "loss": 0.3762,
      "step": 316
    },
    {
      "epoch": 2.910344827586207,
      "grad_norm": 0.6887696981430054,
      "learning_rate": 8.373786407766989e-05,
      "loss": 0.613,
      "step": 317
    },
    {
      "epoch": 2.9195402298850572,
      "grad_norm": 0.3895469009876251,
      "learning_rate": 8.300970873786407e-05,
      "loss": 0.2919,
      "step": 318
    },
    {
      "epoch": 2.928735632183908,
      "grad_norm": 0.43969079852104187,
      "learning_rate": 8.228155339805825e-05,
      "loss": 0.2747,
      "step": 319
    },
    {
      "epoch": 2.9379310344827587,
      "grad_norm": 0.6681087017059326,
      "learning_rate": 8.155339805825241e-05,
      "loss": 0.5428,
      "step": 320
    },
    {
      "epoch": 2.947126436781609,
      "grad_norm": 0.5340582132339478,
      "learning_rate": 8.082524271844659e-05,
      "loss": 0.3636,
      "step": 321
    },
    {
      "epoch": 2.9563218390804598,
      "grad_norm": 0.4719969630241394,
      "learning_rate": 8.009708737864077e-05,
      "loss": 0.3629,
      "step": 322
    },
    {
      "epoch": 2.9655172413793105,
      "grad_norm": 0.4452765882015228,
      "learning_rate": 7.936893203883495e-05,
      "loss": 0.3002,
      "step": 323
    },
    {
      "epoch": 2.974712643678161,
      "grad_norm": 0.5446481704711914,
      "learning_rate": 7.864077669902911e-05,
      "loss": 0.426,
      "step": 324
    },
    {
      "epoch": 2.9839080459770115,
      "grad_norm": 0.6985622048377991,
      "learning_rate": 7.791262135922329e-05,
      "loss": 0.467,
      "step": 325
    },
    {
      "epoch": 2.9931034482758623,
      "grad_norm": 0.6391457915306091,
      "learning_rate": 7.718446601941747e-05,
      "loss": 0.428,
      "step": 326
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.7811146378517151,
      "learning_rate": 7.645631067961165e-05,
      "loss": 0.5441,
      "step": 327
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.4894534349441528,
      "eval_runtime": 94.563,
      "eval_samples_per_second": 3.511,
      "eval_steps_per_second": 1.755,
      "step": 327
    }
  ],
  "logging_steps": 1,
  "max_steps": 432,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.99509837713236e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}