{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 240,
"global_step": 958,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010438413361169101,
"grad_norm": 1.4100897312164307,
"learning_rate": 2e-05,
"loss": 1.0427,
"step": 1
},
{
"epoch": 0.0010438413361169101,
"eval_loss": 1.9999854564666748,
"eval_runtime": 81.2923,
"eval_samples_per_second": 19.854,
"eval_steps_per_second": 9.927,
"step": 1
},
{
"epoch": 0.0020876826722338203,
"grad_norm": 2.131267547607422,
"learning_rate": 4e-05,
"loss": 1.2661,
"step": 2
},
{
"epoch": 0.003131524008350731,
"grad_norm": 2.1016671657562256,
"learning_rate": 6e-05,
"loss": 1.0965,
"step": 3
},
{
"epoch": 0.0041753653444676405,
"grad_norm": 2.6869776248931885,
"learning_rate": 8e-05,
"loss": 1.1197,
"step": 4
},
{
"epoch": 0.005219206680584551,
"grad_norm": 2.6327548027038574,
"learning_rate": 0.0001,
"loss": 0.9191,
"step": 5
},
{
"epoch": 0.006263048016701462,
"grad_norm": 1.9485589265823364,
"learning_rate": 0.00012,
"loss": 0.7709,
"step": 6
},
{
"epoch": 0.007306889352818371,
"grad_norm": 1.7759828567504883,
"learning_rate": 0.00014,
"loss": 0.6302,
"step": 7
},
{
"epoch": 0.008350730688935281,
"grad_norm": 1.4292073249816895,
"learning_rate": 0.00016,
"loss": 0.604,
"step": 8
},
{
"epoch": 0.009394572025052192,
"grad_norm": 1.8719030618667603,
"learning_rate": 0.00018,
"loss": 0.5034,
"step": 9
},
{
"epoch": 0.010438413361169102,
"grad_norm": 2.183128833770752,
"learning_rate": 0.0002,
"loss": 0.5065,
"step": 10
},
{
"epoch": 0.011482254697286013,
"grad_norm": 2.187234401702881,
"learning_rate": 0.00019999945089843994,
"loss": 0.4966,
"step": 11
},
{
"epoch": 0.012526096033402923,
"grad_norm": 2.7468483448028564,
"learning_rate": 0.00019999780359979,
"loss": 0.3204,
"step": 12
},
{
"epoch": 0.013569937369519834,
"grad_norm": 2.2195403575897217,
"learning_rate": 0.00019999505812214085,
"loss": 0.2784,
"step": 13
},
{
"epoch": 0.014613778705636743,
"grad_norm": 2.0797836780548096,
"learning_rate": 0.00019999121449564347,
"loss": 0.3631,
"step": 14
},
{
"epoch": 0.015657620041753653,
"grad_norm": 2.080904245376587,
"learning_rate": 0.00019998627276250858,
"loss": 0.2425,
"step": 15
},
{
"epoch": 0.016701461377870562,
"grad_norm": 1.358616590499878,
"learning_rate": 0.00019998023297700658,
"loss": 0.2799,
"step": 16
},
{
"epoch": 0.017745302713987474,
"grad_norm": 1.4436254501342773,
"learning_rate": 0.00019997309520546647,
"loss": 0.3314,
"step": 17
},
{
"epoch": 0.018789144050104383,
"grad_norm": 1.1498231887817383,
"learning_rate": 0.00019996485952627552,
"loss": 0.2713,
"step": 18
},
{
"epoch": 0.019832985386221295,
"grad_norm": 1.3239034414291382,
"learning_rate": 0.00019995552602987827,
"loss": 0.2347,
"step": 19
},
{
"epoch": 0.020876826722338204,
"grad_norm": 1.1880042552947998,
"learning_rate": 0.00019994509481877537,
"loss": 0.3311,
"step": 20
},
{
"epoch": 0.021920668058455117,
"grad_norm": 1.1707823276519775,
"learning_rate": 0.00019993356600752276,
"loss": 0.4601,
"step": 21
},
{
"epoch": 0.022964509394572025,
"grad_norm": 1.6230024099349976,
"learning_rate": 0.00019992093972273018,
"loss": 0.4187,
"step": 22
},
{
"epoch": 0.024008350730688934,
"grad_norm": 1.6155649423599243,
"learning_rate": 0.00019990721610305996,
"loss": 0.4661,
"step": 23
},
{
"epoch": 0.025052192066805846,
"grad_norm": 1.4404484033584595,
"learning_rate": 0.0001998923952992252,
"loss": 0.2662,
"step": 24
},
{
"epoch": 0.026096033402922755,
"grad_norm": 1.0615559816360474,
"learning_rate": 0.00019987647747398852,
"loss": 0.3144,
"step": 25
},
{
"epoch": 0.027139874739039668,
"grad_norm": 2.5796847343444824,
"learning_rate": 0.00019985946280215994,
"loss": 0.9688,
"step": 26
},
{
"epoch": 0.028183716075156576,
"grad_norm": 3.5270519256591797,
"learning_rate": 0.00019984135147059514,
"loss": 1.1566,
"step": 27
},
{
"epoch": 0.029227557411273485,
"grad_norm": 3.404811143875122,
"learning_rate": 0.00019982214367819328,
"loss": 0.9788,
"step": 28
},
{
"epoch": 0.030271398747390398,
"grad_norm": 2.5648179054260254,
"learning_rate": 0.00019980183963589504,
"loss": 0.9095,
"step": 29
},
{
"epoch": 0.031315240083507306,
"grad_norm": 2.203162670135498,
"learning_rate": 0.0001997804395666799,
"loss": 0.6835,
"step": 30
},
{
"epoch": 0.032359081419624215,
"grad_norm": 2.2330455780029297,
"learning_rate": 0.00019975794370556417,
"loss": 0.7176,
"step": 31
},
{
"epoch": 0.033402922755741124,
"grad_norm": 1.7634390592575073,
"learning_rate": 0.00019973435229959813,
"loss": 0.6717,
"step": 32
},
{
"epoch": 0.03444676409185804,
"grad_norm": 1.7378387451171875,
"learning_rate": 0.00019970966560786324,
"loss": 0.6573,
"step": 33
},
{
"epoch": 0.03549060542797495,
"grad_norm": 1.515835165977478,
"learning_rate": 0.0001996838839014696,
"loss": 0.5787,
"step": 34
},
{
"epoch": 0.03653444676409186,
"grad_norm": 1.6372636556625366,
"learning_rate": 0.0001996570074635527,
"loss": 0.5711,
"step": 35
},
{
"epoch": 0.037578288100208766,
"grad_norm": 1.449986457824707,
"learning_rate": 0.00019962903658927037,
"loss": 0.4957,
"step": 36
},
{
"epoch": 0.038622129436325675,
"grad_norm": 1.544659972190857,
"learning_rate": 0.00019959997158579967,
"loss": 0.5339,
"step": 37
},
{
"epoch": 0.03966597077244259,
"grad_norm": 1.497573733329773,
"learning_rate": 0.0001995698127723334,
"loss": 0.3736,
"step": 38
},
{
"epoch": 0.0407098121085595,
"grad_norm": 1.7716635465621948,
"learning_rate": 0.00019953856048007652,
"loss": 0.5047,
"step": 39
},
{
"epoch": 0.04175365344467641,
"grad_norm": 1.439935326576233,
"learning_rate": 0.00019950621505224273,
"loss": 0.3242,
"step": 40
},
{
"epoch": 0.04279749478079332,
"grad_norm": 1.5024681091308594,
"learning_rate": 0.00019947277684405056,
"loss": 0.3943,
"step": 41
},
{
"epoch": 0.04384133611691023,
"grad_norm": 1.411978006362915,
"learning_rate": 0.00019943824622271935,
"loss": 0.2535,
"step": 42
},
{
"epoch": 0.04488517745302714,
"grad_norm": 1.6485614776611328,
"learning_rate": 0.00019940262356746554,
"loss": 0.3503,
"step": 43
},
{
"epoch": 0.04592901878914405,
"grad_norm": 2.0574934482574463,
"learning_rate": 0.0001993659092694982,
"loss": 0.3788,
"step": 44
},
{
"epoch": 0.04697286012526096,
"grad_norm": 1.5055749416351318,
"learning_rate": 0.00019932810373201495,
"loss": 0.285,
"step": 45
},
{
"epoch": 0.04801670146137787,
"grad_norm": 1.3977243900299072,
"learning_rate": 0.00019928920737019733,
"loss": 0.2997,
"step": 46
},
{
"epoch": 0.049060542797494784,
"grad_norm": 1.5702697038650513,
"learning_rate": 0.00019924922061120644,
"loss": 0.2235,
"step": 47
},
{
"epoch": 0.05010438413361169,
"grad_norm": 1.1728659868240356,
"learning_rate": 0.0001992081438941781,
"loss": 0.277,
"step": 48
},
{
"epoch": 0.0511482254697286,
"grad_norm": 1.6709052324295044,
"learning_rate": 0.00019916597767021807,
"loss": 0.2712,
"step": 49
},
{
"epoch": 0.05219206680584551,
"grad_norm": 2.5871024131774902,
"learning_rate": 0.00019912272240239716,
"loss": 0.3964,
"step": 50
},
{
"epoch": 0.05323590814196242,
"grad_norm": 2.782623291015625,
"learning_rate": 0.00019907837856574607,
"loss": 0.7383,
"step": 51
},
{
"epoch": 0.054279749478079335,
"grad_norm": 2.9450387954711914,
"learning_rate": 0.0001990329466472502,
"loss": 0.7801,
"step": 52
},
{
"epoch": 0.055323590814196244,
"grad_norm": 2.3782992362976074,
"learning_rate": 0.00019898642714584428,
"loss": 0.6978,
"step": 53
},
{
"epoch": 0.05636743215031315,
"grad_norm": 2.7798867225646973,
"learning_rate": 0.000198938820572407,
"loss": 0.6635,
"step": 54
},
{
"epoch": 0.05741127348643006,
"grad_norm": 1.2441174983978271,
"learning_rate": 0.00019889012744975508,
"loss": 0.3866,
"step": 55
},
{
"epoch": 0.05845511482254697,
"grad_norm": 1.9317642450332642,
"learning_rate": 0.00019884034831263808,
"loss": 0.5378,
"step": 56
},
{
"epoch": 0.059498956158663886,
"grad_norm": 1.48700749874115,
"learning_rate": 0.00019878948370773193,
"loss": 0.416,
"step": 57
},
{
"epoch": 0.060542797494780795,
"grad_norm": 1.6028392314910889,
"learning_rate": 0.00019873753419363336,
"loss": 0.4354,
"step": 58
},
{
"epoch": 0.061586638830897704,
"grad_norm": 1.2003154754638672,
"learning_rate": 0.00019868450034085352,
"loss": 0.289,
"step": 59
},
{
"epoch": 0.06263048016701461,
"grad_norm": 0.8975642919540405,
"learning_rate": 0.00019863038273181186,
"loss": 0.3655,
"step": 60
},
{
"epoch": 0.06367432150313153,
"grad_norm": 1.0003724098205566,
"learning_rate": 0.00019857518196082964,
"loss": 0.3517,
"step": 61
},
{
"epoch": 0.06471816283924843,
"grad_norm": 0.7653176784515381,
"learning_rate": 0.00019851889863412345,
"loss": 0.2623,
"step": 62
},
{
"epoch": 0.06576200417536535,
"grad_norm": 0.8752651810646057,
"learning_rate": 0.00019846153336979856,
"loss": 0.295,
"step": 63
},
{
"epoch": 0.06680584551148225,
"grad_norm": 0.8633437752723694,
"learning_rate": 0.00019840308679784207,
"loss": 0.198,
"step": 64
},
{
"epoch": 0.06784968684759916,
"grad_norm": 0.9436039924621582,
"learning_rate": 0.00019834355956011606,
"loss": 0.2345,
"step": 65
},
{
"epoch": 0.06889352818371608,
"grad_norm": 0.6003718376159668,
"learning_rate": 0.00019828295231035051,
"loss": 0.1637,
"step": 66
},
{
"epoch": 0.06993736951983298,
"grad_norm": 1.460316777229309,
"learning_rate": 0.00019822126571413616,
"loss": 0.2315,
"step": 67
},
{
"epoch": 0.0709812108559499,
"grad_norm": 0.840602457523346,
"learning_rate": 0.00019815850044891707,
"loss": 0.2587,
"step": 68
},
{
"epoch": 0.0720250521920668,
"grad_norm": 0.9615221619606018,
"learning_rate": 0.0001980946572039834,
"loss": 0.3806,
"step": 69
},
{
"epoch": 0.07306889352818371,
"grad_norm": 1.013310432434082,
"learning_rate": 0.00019802973668046363,
"loss": 0.2961,
"step": 70
},
{
"epoch": 0.07411273486430063,
"grad_norm": 0.9203840494155884,
"learning_rate": 0.00019796373959131698,
"loss": 0.2715,
"step": 71
},
{
"epoch": 0.07515657620041753,
"grad_norm": 0.8724762201309204,
"learning_rate": 0.00019789666666132554,
"loss": 0.2774,
"step": 72
},
{
"epoch": 0.07620041753653445,
"grad_norm": 0.854420006275177,
"learning_rate": 0.00019782851862708634,
"loss": 0.2466,
"step": 73
},
{
"epoch": 0.07724425887265135,
"grad_norm": 0.880692720413208,
"learning_rate": 0.00019775929623700318,
"loss": 0.3145,
"step": 74
},
{
"epoch": 0.07828810020876827,
"grad_norm": 1.4334697723388672,
"learning_rate": 0.00019768900025127851,
"loss": 0.6141,
"step": 75
},
{
"epoch": 0.07933194154488518,
"grad_norm": 1.6129218339920044,
"learning_rate": 0.0001976176314419051,
"loss": 0.6577,
"step": 76
},
{
"epoch": 0.08037578288100208,
"grad_norm": 1.4365684986114502,
"learning_rate": 0.00019754519059265736,
"loss": 0.6501,
"step": 77
},
{
"epoch": 0.081419624217119,
"grad_norm": 1.1411486864089966,
"learning_rate": 0.00019747167849908304,
"loss": 0.5629,
"step": 78
},
{
"epoch": 0.0824634655532359,
"grad_norm": 0.9845925569534302,
"learning_rate": 0.00019739709596849417,
"loss": 0.4142,
"step": 79
},
{
"epoch": 0.08350730688935282,
"grad_norm": 1.0630574226379395,
"learning_rate": 0.00019732144381995846,
"loss": 0.4291,
"step": 80
},
{
"epoch": 0.08455114822546973,
"grad_norm": 1.059112548828125,
"learning_rate": 0.0001972447228842902,
"loss": 0.4116,
"step": 81
},
{
"epoch": 0.08559498956158663,
"grad_norm": 0.99774569272995,
"learning_rate": 0.000197166934004041,
"loss": 0.423,
"step": 82
},
{
"epoch": 0.08663883089770355,
"grad_norm": 1.2038483619689941,
"learning_rate": 0.00019708807803349088,
"loss": 0.4696,
"step": 83
},
{
"epoch": 0.08768267223382047,
"grad_norm": 1.5743309259414673,
"learning_rate": 0.00019700815583863852,
"loss": 0.4688,
"step": 84
},
{
"epoch": 0.08872651356993737,
"grad_norm": 1.0845067501068115,
"learning_rate": 0.00019692716829719194,
"loss": 0.3629,
"step": 85
},
{
"epoch": 0.08977035490605428,
"grad_norm": 1.1626365184783936,
"learning_rate": 0.00019684511629855888,
"loss": 0.4327,
"step": 86
},
{
"epoch": 0.09081419624217119,
"grad_norm": 1.1431448459625244,
"learning_rate": 0.00019676200074383692,
"loss": 0.2941,
"step": 87
},
{
"epoch": 0.0918580375782881,
"grad_norm": 1.0742226839065552,
"learning_rate": 0.00019667782254580374,
"loss": 0.365,
"step": 88
},
{
"epoch": 0.09290187891440502,
"grad_norm": 1.0580729246139526,
"learning_rate": 0.00019659258262890683,
"loss": 0.351,
"step": 89
},
{
"epoch": 0.09394572025052192,
"grad_norm": 1.1420572996139526,
"learning_rate": 0.0001965062819292537,
"loss": 0.3829,
"step": 90
},
{
"epoch": 0.09498956158663883,
"grad_norm": 0.9046992063522339,
"learning_rate": 0.0001964189213946013,
"loss": 0.3192,
"step": 91
},
{
"epoch": 0.09603340292275574,
"grad_norm": 0.94282066822052,
"learning_rate": 0.00019633050198434576,
"loss": 0.251,
"step": 92
},
{
"epoch": 0.09707724425887265,
"grad_norm": 0.9391703009605408,
"learning_rate": 0.0001962410246695118,
"loss": 0.2545,
"step": 93
},
{
"epoch": 0.09812108559498957,
"grad_norm": 1.068588137626648,
"learning_rate": 0.00019615049043274205,
"loss": 0.2889,
"step": 94
},
{
"epoch": 0.09916492693110647,
"grad_norm": 0.8007175922393799,
"learning_rate": 0.00019605890026828634,
"loss": 0.1906,
"step": 95
},
{
"epoch": 0.10020876826722339,
"grad_norm": 1.195952296257019,
"learning_rate": 0.00019596625518199077,
"loss": 0.3045,
"step": 96
},
{
"epoch": 0.10125260960334029,
"grad_norm": 1.1844594478607178,
"learning_rate": 0.00019587255619128648,
"loss": 0.2509,
"step": 97
},
{
"epoch": 0.1022964509394572,
"grad_norm": 1.0775648355484009,
"learning_rate": 0.00019577780432517879,
"loss": 0.1594,
"step": 98
},
{
"epoch": 0.10334029227557412,
"grad_norm": 1.0487576723098755,
"learning_rate": 0.00019568200062423555,
"loss": 0.1619,
"step": 99
},
{
"epoch": 0.10438413361169102,
"grad_norm": 3.752366304397583,
"learning_rate": 0.00019558514614057609,
"loss": 0.4774,
"step": 100
},
{
"epoch": 0.10542797494780794,
"grad_norm": 1.1260536909103394,
"learning_rate": 0.00019548724193785933,
"loss": 0.4538,
"step": 101
},
{
"epoch": 0.10647181628392484,
"grad_norm": 1.2162113189697266,
"learning_rate": 0.0001953882890912723,
"loss": 0.4851,
"step": 102
},
{
"epoch": 0.10751565762004175,
"grad_norm": 1.2823134660720825,
"learning_rate": 0.00019528828868751818,
"loss": 0.5302,
"step": 103
},
{
"epoch": 0.10855949895615867,
"grad_norm": 1.2430353164672852,
"learning_rate": 0.0001951872418248046,
"loss": 0.4463,
"step": 104
},
{
"epoch": 0.10960334029227557,
"grad_norm": 0.8766555786132812,
"learning_rate": 0.00019508514961283138,
"loss": 0.3673,
"step": 105
},
{
"epoch": 0.11064718162839249,
"grad_norm": 1.0745108127593994,
"learning_rate": 0.00019498201317277828,
"loss": 0.4313,
"step": 106
},
{
"epoch": 0.11169102296450939,
"grad_norm": 0.904514729976654,
"learning_rate": 0.00019487783363729294,
"loss": 0.3183,
"step": 107
},
{
"epoch": 0.1127348643006263,
"grad_norm": 1.1046079397201538,
"learning_rate": 0.00019477261215047835,
"loss": 0.389,
"step": 108
},
{
"epoch": 0.11377870563674322,
"grad_norm": 0.9722571969032288,
"learning_rate": 0.00019466634986788005,
"loss": 0.3647,
"step": 109
},
{
"epoch": 0.11482254697286012,
"grad_norm": 0.8949522376060486,
"learning_rate": 0.0001945590479564738,
"loss": 0.3807,
"step": 110
},
{
"epoch": 0.11586638830897704,
"grad_norm": 0.8489611744880676,
"learning_rate": 0.00019445070759465253,
"loss": 0.3148,
"step": 111
},
{
"epoch": 0.11691022964509394,
"grad_norm": 0.8171373009681702,
"learning_rate": 0.00019434132997221345,
"loss": 0.2941,
"step": 112
},
{
"epoch": 0.11795407098121086,
"grad_norm": 0.8785704970359802,
"learning_rate": 0.00019423091629034507,
"loss": 0.3087,
"step": 113
},
{
"epoch": 0.11899791231732777,
"grad_norm": 0.8450028896331787,
"learning_rate": 0.00019411946776161387,
"loss": 0.2428,
"step": 114
},
{
"epoch": 0.12004175365344467,
"grad_norm": 0.7037304639816284,
"learning_rate": 0.00019400698560995103,
"loss": 0.1534,
"step": 115
},
{
"epoch": 0.12108559498956159,
"grad_norm": 0.8595078587532043,
"learning_rate": 0.00019389347107063912,
"loss": 0.1996,
"step": 116
},
{
"epoch": 0.12212943632567849,
"grad_norm": 0.7243440747261047,
"learning_rate": 0.00019377892539029827,
"loss": 0.1407,
"step": 117
},
{
"epoch": 0.12317327766179541,
"grad_norm": 0.7662886381149292,
"learning_rate": 0.0001936633498268728,
"loss": 0.187,
"step": 118
},
{
"epoch": 0.12421711899791232,
"grad_norm": 0.9693275094032288,
"learning_rate": 0.0001935467456496171,
"loss": 0.2622,
"step": 119
},
{
"epoch": 0.12526096033402923,
"grad_norm": 0.827397882938385,
"learning_rate": 0.0001934291141390819,
"loss": 0.2819,
"step": 120
},
{
"epoch": 0.12630480167014613,
"grad_norm": 0.7069506645202637,
"learning_rate": 0.0001933104565871001,
"loss": 0.2907,
"step": 121
},
{
"epoch": 0.12734864300626306,
"grad_norm": 0.7466704845428467,
"learning_rate": 0.00019319077429677268,
"loss": 0.209,
"step": 122
},
{
"epoch": 0.12839248434237996,
"grad_norm": 0.8960225582122803,
"learning_rate": 0.00019307006858245424,
"loss": 0.219,
"step": 123
},
{
"epoch": 0.12943632567849686,
"grad_norm": 0.7968412041664124,
"learning_rate": 0.0001929483407697387,
"loss": 0.292,
"step": 124
},
{
"epoch": 0.1304801670146138,
"grad_norm": 0.7965363264083862,
"learning_rate": 0.00019282559219544477,
"loss": 0.3051,
"step": 125
},
{
"epoch": 0.1315240083507307,
"grad_norm": 1.490718126296997,
"learning_rate": 0.00019270182420760102,
"loss": 0.485,
"step": 126
},
{
"epoch": 0.1325678496868476,
"grad_norm": 1.431773066520691,
"learning_rate": 0.00019257703816543144,
"loss": 0.4818,
"step": 127
},
{
"epoch": 0.1336116910229645,
"grad_norm": 1.4318815469741821,
"learning_rate": 0.00019245123543934017,
"loss": 0.5472,
"step": 128
},
{
"epoch": 0.13465553235908143,
"grad_norm": 1.2134075164794922,
"learning_rate": 0.00019232441741089676,
"loss": 0.4773,
"step": 129
},
{
"epoch": 0.13569937369519833,
"grad_norm": 1.110347032546997,
"learning_rate": 0.00019219658547282067,
"loss": 0.4516,
"step": 130
},
{
"epoch": 0.13674321503131523,
"grad_norm": 1.122799277305603,
"learning_rate": 0.00019206774102896627,
"loss": 0.5222,
"step": 131
},
{
"epoch": 0.13778705636743216,
"grad_norm": 0.9539543390274048,
"learning_rate": 0.00019193788549430724,
"loss": 0.3994,
"step": 132
},
{
"epoch": 0.13883089770354906,
"grad_norm": 0.9659099578857422,
"learning_rate": 0.00019180702029492118,
"loss": 0.3946,
"step": 133
},
{
"epoch": 0.13987473903966596,
"grad_norm": 1.0640462636947632,
"learning_rate": 0.00019167514686797369,
"loss": 0.4409,
"step": 134
},
{
"epoch": 0.1409185803757829,
"grad_norm": 1.1047799587249756,
"learning_rate": 0.00019154226666170295,
"loss": 0.4123,
"step": 135
},
{
"epoch": 0.1419624217118998,
"grad_norm": 0.9255096912384033,
"learning_rate": 0.00019140838113540346,
"loss": 0.2524,
"step": 136
},
{
"epoch": 0.1430062630480167,
"grad_norm": 0.9960671663284302,
"learning_rate": 0.00019127349175941032,
"loss": 0.3268,
"step": 137
},
{
"epoch": 0.1440501043841336,
"grad_norm": 1.0808758735656738,
"learning_rate": 0.0001911376000150828,
"loss": 0.4802,
"step": 138
},
{
"epoch": 0.14509394572025053,
"grad_norm": 0.9004728198051453,
"learning_rate": 0.00019100070739478832,
"loss": 0.3092,
"step": 139
},
{
"epoch": 0.14613778705636743,
"grad_norm": 0.9561731219291687,
"learning_rate": 0.00019086281540188588,
"loss": 0.2572,
"step": 140
},
{
"epoch": 0.14718162839248433,
"grad_norm": 1.1557111740112305,
"learning_rate": 0.00019072392555070965,
"loss": 0.3391,
"step": 141
},
{
"epoch": 0.14822546972860126,
"grad_norm": 1.0732665061950684,
"learning_rate": 0.00019058403936655233,
"loss": 0.2717,
"step": 142
},
{
"epoch": 0.14926931106471816,
"grad_norm": 1.2010210752487183,
"learning_rate": 0.00019044315838564834,
"loss": 0.3122,
"step": 143
},
{
"epoch": 0.15031315240083507,
"grad_norm": 1.0502979755401611,
"learning_rate": 0.000190301284155157,
"loss": 0.2789,
"step": 144
},
{
"epoch": 0.151356993736952,
"grad_norm": 0.835246205329895,
"learning_rate": 0.0001901584182331456,
"loss": 0.1788,
"step": 145
},
{
"epoch": 0.1524008350730689,
"grad_norm": 0.9552949666976929,
"learning_rate": 0.00019001456218857208,
"loss": 0.2785,
"step": 146
},
{
"epoch": 0.1534446764091858,
"grad_norm": 0.6930809617042542,
"learning_rate": 0.00018986971760126805,
"loss": 0.1823,
"step": 147
},
{
"epoch": 0.1544885177453027,
"grad_norm": 0.6930037140846252,
"learning_rate": 0.00018972388606192125,
"loss": 0.1895,
"step": 148
},
{
"epoch": 0.15553235908141963,
"grad_norm": 0.8468357920646667,
"learning_rate": 0.0001895770691720582,
"loss": 0.1717,
"step": 149
},
{
"epoch": 0.15657620041753653,
"grad_norm": 0.9056650996208191,
"learning_rate": 0.0001894292685440266,
"loss": 0.1854,
"step": 150
},
{
"epoch": 0.15762004175365343,
"grad_norm": 1.131759524345398,
"learning_rate": 0.00018928048580097757,
"loss": 0.441,
"step": 151
},
{
"epoch": 0.15866388308977036,
"grad_norm": 1.426763892173767,
"learning_rate": 0.00018913072257684778,
"loss": 0.5142,
"step": 152
},
{
"epoch": 0.15970772442588727,
"grad_norm": 1.4198706150054932,
"learning_rate": 0.00018897998051634166,
"loss": 0.5212,
"step": 153
},
{
"epoch": 0.16075156576200417,
"grad_norm": 1.228197693824768,
"learning_rate": 0.0001888282612749132,
"loss": 0.4639,
"step": 154
},
{
"epoch": 0.1617954070981211,
"grad_norm": 0.7881720662117004,
"learning_rate": 0.0001886755665187479,
"loss": 0.3905,
"step": 155
},
{
"epoch": 0.162839248434238,
"grad_norm": 0.7964795231819153,
"learning_rate": 0.00018852189792474425,
"loss": 0.3802,
"step": 156
},
{
"epoch": 0.1638830897703549,
"grad_norm": 1.1133593320846558,
"learning_rate": 0.00018836725718049562,
"loss": 0.3962,
"step": 157
},
{
"epoch": 0.1649269311064718,
"grad_norm": 1.182551383972168,
"learning_rate": 0.00018821164598427145,
"loss": 0.3305,
"step": 158
},
{
"epoch": 0.16597077244258873,
"grad_norm": 0.8984887003898621,
"learning_rate": 0.0001880550660449988,
"loss": 0.2847,
"step": 159
},
{
"epoch": 0.16701461377870563,
"grad_norm": 0.8994187116622925,
"learning_rate": 0.00018789751908224338,
"loss": 0.346,
"step": 160
},
{
"epoch": 0.16805845511482254,
"grad_norm": 0.7379996180534363,
"learning_rate": 0.0001877390068261909,
"loss": 0.3183,
"step": 161
},
{
"epoch": 0.16910229645093947,
"grad_norm": 0.719083845615387,
"learning_rate": 0.00018757953101762787,
"loss": 0.2136,
"step": 162
},
{
"epoch": 0.17014613778705637,
"grad_norm": 1.2782928943634033,
"learning_rate": 0.00018741909340792262,
"loss": 0.1879,
"step": 163
},
{
"epoch": 0.17118997912317327,
"grad_norm": 0.6968748569488525,
"learning_rate": 0.000187257695759006,
"loss": 0.1594,
"step": 164
},
{
"epoch": 0.1722338204592902,
"grad_norm": 0.8432148098945618,
"learning_rate": 0.00018709533984335192,
"loss": 0.1767,
"step": 165
},
{
"epoch": 0.1732776617954071,
"grad_norm": 0.6877685785293579,
"learning_rate": 0.00018693202744395827,
"loss": 0.1787,
"step": 166
},
{
"epoch": 0.174321503131524,
"grad_norm": 0.9011303782463074,
"learning_rate": 0.0001867677603543268,
"loss": 0.1692,
"step": 167
},
{
"epoch": 0.17536534446764093,
"grad_norm": 0.8391153216362,
"learning_rate": 0.00018660254037844388,
"loss": 0.2701,
"step": 168
},
{
"epoch": 0.17640918580375783,
"grad_norm": 0.7215454578399658,
"learning_rate": 0.0001864363693307604,
"loss": 0.2727,
"step": 169
},
{
"epoch": 0.17745302713987474,
"grad_norm": 0.9198095202445984,
"learning_rate": 0.000186269249036172,
"loss": 0.2709,
"step": 170
},
{
"epoch": 0.17849686847599164,
"grad_norm": 0.6714828014373779,
"learning_rate": 0.00018610118132999896,
"loss": 0.2343,
"step": 171
},
{
"epoch": 0.17954070981210857,
"grad_norm": 0.7863060235977173,
"learning_rate": 0.00018593216805796612,
"loss": 0.3066,
"step": 172
},
{
"epoch": 0.18058455114822547,
"grad_norm": 0.8642680644989014,
"learning_rate": 0.00018576221107618243,
"loss": 0.3599,
"step": 173
},
{
"epoch": 0.18162839248434237,
"grad_norm": 1.3609846830368042,
"learning_rate": 0.00018559131225112085,
"loss": 0.435,
"step": 174
},
{
"epoch": 0.1826722338204593,
"grad_norm": 1.3618521690368652,
"learning_rate": 0.00018541947345959754,
"loss": 0.5144,
"step": 175
},
{
"epoch": 0.1837160751565762,
"grad_norm": 1.10430908203125,
"learning_rate": 0.00018524669658875152,
"loss": 0.465,
"step": 176
},
{
"epoch": 0.1847599164926931,
"grad_norm": 0.8763125538825989,
"learning_rate": 0.00018507298353602375,
"loss": 0.4001,
"step": 177
},
{
"epoch": 0.18580375782881003,
"grad_norm": 0.979162335395813,
"learning_rate": 0.00018489833620913642,
"loss": 0.4281,
"step": 178
},
{
"epoch": 0.18684759916492694,
"grad_norm": 0.9191274642944336,
"learning_rate": 0.00018472275652607186,
"loss": 0.4362,
"step": 179
},
{
"epoch": 0.18789144050104384,
"grad_norm": 1.0854690074920654,
"learning_rate": 0.0001845462464150517,
"loss": 0.4294,
"step": 180
},
{
"epoch": 0.18893528183716074,
"grad_norm": 1.0451358556747437,
"learning_rate": 0.00018436880781451544,
"loss": 0.4065,
"step": 181
},
{
"epoch": 0.18997912317327767,
"grad_norm": 1.0197910070419312,
"learning_rate": 0.00018419044267309939,
"loss": 0.3585,
"step": 182
},
{
"epoch": 0.19102296450939457,
"grad_norm": 0.9235432744026184,
"learning_rate": 0.000184011152949615,
"loss": 0.4618,
"step": 183
},
{
"epoch": 0.19206680584551147,
"grad_norm": 0.9856287837028503,
"learning_rate": 0.00018383094061302766,
"loss": 0.3951,
"step": 184
},
{
"epoch": 0.1931106471816284,
"grad_norm": 0.851546585559845,
"learning_rate": 0.0001836498076424349,
"loss": 0.3907,
"step": 185
},
{
"epoch": 0.1941544885177453,
"grad_norm": 0.936446487903595,
"learning_rate": 0.00018346775602704464,
"loss": 0.3234,
"step": 186
},
{
"epoch": 0.1951983298538622,
"grad_norm": 0.8742521405220032,
"learning_rate": 0.00018328478776615334,
"loss": 0.3065,
"step": 187
},
{
"epoch": 0.19624217118997914,
"grad_norm": 0.6365717053413391,
"learning_rate": 0.00018310090486912424,
"loss": 0.1979,
"step": 188
},
{
"epoch": 0.19728601252609604,
"grad_norm": 0.7701941132545471,
"learning_rate": 0.000182916109355365,
"loss": 0.2322,
"step": 189
},
{
"epoch": 0.19832985386221294,
"grad_norm": 0.8491252064704895,
"learning_rate": 0.00018273040325430574,
"loss": 0.2477,
"step": 190
},
{
"epoch": 0.19937369519832984,
"grad_norm": 0.8160005807876587,
"learning_rate": 0.00018254378860537666,
"loss": 0.2554,
"step": 191
},
{
"epoch": 0.20041753653444677,
"grad_norm": 0.7356383800506592,
"learning_rate": 0.0001823562674579856,
"loss": 0.2209,
"step": 192
},
{
"epoch": 0.20146137787056367,
"grad_norm": 0.734779953956604,
"learning_rate": 0.0001821678418714957,
"loss": 0.2162,
"step": 193
},
{
"epoch": 0.20250521920668058,
"grad_norm": 0.8594037294387817,
"learning_rate": 0.00018197851391520264,
"loss": 0.2831,
"step": 194
},
{
"epoch": 0.2035490605427975,
"grad_norm": 0.6627790331840515,
"learning_rate": 0.00018178828566831192,
"loss": 0.1818,
"step": 195
},
{
"epoch": 0.2045929018789144,
"grad_norm": 0.9464810490608215,
"learning_rate": 0.00018159715921991612,
"loss": 0.2376,
"step": 196
},
{
"epoch": 0.2056367432150313,
"grad_norm": 0.8601820468902588,
"learning_rate": 0.00018140513666897178,
"loss": 0.1824,
"step": 197
},
{
"epoch": 0.20668058455114824,
"grad_norm": 0.7504302263259888,
"learning_rate": 0.00018121222012427665,
"loss": 0.2081,
"step": 198
},
{
"epoch": 0.20772442588726514,
"grad_norm": 0.6832506060600281,
"learning_rate": 0.00018101841170444616,
"loss": 0.1742,
"step": 199
},
{
"epoch": 0.20876826722338204,
"grad_norm": 1.3405510187149048,
"learning_rate": 0.00018082371353789046,
"loss": 0.2519,
"step": 200
},
{
"epoch": 0.20981210855949894,
"grad_norm": 0.8375347852706909,
"learning_rate": 0.00018062812776279086,
"loss": 0.3868,
"step": 201
},
{
"epoch": 0.21085594989561587,
"grad_norm": 0.9397875070571899,
"learning_rate": 0.00018043165652707649,
"loss": 0.4824,
"step": 202
},
{
"epoch": 0.21189979123173278,
"grad_norm": 0.9745060205459595,
"learning_rate": 0.00018023430198840058,
"loss": 0.4726,
"step": 203
},
{
"epoch": 0.21294363256784968,
"grad_norm": 0.9175118207931519,
"learning_rate": 0.00018003606631411678,
"loss": 0.3722,
"step": 204
},
{
"epoch": 0.2139874739039666,
"grad_norm": 0.831420361995697,
"learning_rate": 0.0001798369516812555,
"loss": 0.387,
"step": 205
},
{
"epoch": 0.2150313152400835,
"grad_norm": 0.7558404207229614,
"learning_rate": 0.00017963696027649986,
"loss": 0.3522,
"step": 206
},
{
"epoch": 0.2160751565762004,
"grad_norm": 0.8819952607154846,
"learning_rate": 0.0001794360942961617,
"loss": 0.3665,
"step": 207
},
{
"epoch": 0.21711899791231734,
"grad_norm": 0.9681791663169861,
"learning_rate": 0.00017923435594615744,
"loss": 0.3571,
"step": 208
},
{
"epoch": 0.21816283924843424,
"grad_norm": 1.2287527322769165,
"learning_rate": 0.000179031747441984,
"loss": 0.269,
"step": 209
},
{
"epoch": 0.21920668058455114,
"grad_norm": 0.9387004375457764,
"learning_rate": 0.0001788282710086942,
"loss": 0.2254,
"step": 210
},
{
"epoch": 0.22025052192066805,
"grad_norm": 0.6556700468063354,
"learning_rate": 0.0001786239288808727,
"loss": 0.2438,
"step": 211
},
{
"epoch": 0.22129436325678498,
"grad_norm": 0.8871288895606995,
"learning_rate": 0.00017841872330261101,
"loss": 0.2408,
"step": 212
},
{
"epoch": 0.22233820459290188,
"grad_norm": 0.6930763125419617,
"learning_rate": 0.00017821265652748328,
"loss": 0.1755,
"step": 213
},
{
"epoch": 0.22338204592901878,
"grad_norm": 0.6196439862251282,
"learning_rate": 0.00017800573081852122,
"loss": 0.1714,
"step": 214
},
{
"epoch": 0.2244258872651357,
"grad_norm": 0.4868805408477783,
"learning_rate": 0.00017779794844818944,
"loss": 0.1688,
"step": 215
},
{
"epoch": 0.2254697286012526,
"grad_norm": 0.5908803343772888,
"learning_rate": 0.0001775893116983604,
"loss": 0.1653,
"step": 216
},
{
"epoch": 0.2265135699373695,
"grad_norm": 0.7646514177322388,
"learning_rate": 0.00017737982286028937,
"loss": 0.3009,
"step": 217
},
{
"epoch": 0.22755741127348644,
"grad_norm": 0.7528402209281921,
"learning_rate": 0.00017716948423458938,
"loss": 0.3261,
"step": 218
},
{
"epoch": 0.22860125260960334,
"grad_norm": 0.6268967390060425,
"learning_rate": 0.0001769582981312057,
"loss": 0.2322,
"step": 219
},
{
"epoch": 0.22964509394572025,
"grad_norm": 0.7152937650680542,
"learning_rate": 0.0001767462668693908,
"loss": 0.2756,
"step": 220
},
{
"epoch": 0.23068893528183715,
"grad_norm": 0.7480033040046692,
"learning_rate": 0.00017653339277767853,
"loss": 0.2613,
"step": 221
},
{
"epoch": 0.23173277661795408,
"grad_norm": 0.6832448840141296,
"learning_rate": 0.00017631967819385885,
"loss": 0.1719,
"step": 222
},
{
"epoch": 0.23277661795407098,
"grad_norm": 0.8552375435829163,
"learning_rate": 0.00017610512546495195,
"loss": 0.3867,
"step": 223
},
{
"epoch": 0.23382045929018788,
"grad_norm": 1.193741798400879,
"learning_rate": 0.00017588973694718262,
"loss": 0.4765,
"step": 224
},
{
"epoch": 0.2348643006263048,
"grad_norm": 1.2147541046142578,
"learning_rate": 0.00017567351500595425,
"loss": 0.4603,
"step": 225
},
{
"epoch": 0.2359081419624217,
"grad_norm": 1.028960108757019,
"learning_rate": 0.00017545646201582303,
"loss": 0.4966,
"step": 226
},
{
"epoch": 0.23695198329853862,
"grad_norm": 1.021507978439331,
"learning_rate": 0.00017523858036047157,
"loss": 0.3946,
"step": 227
},
{
"epoch": 0.23799582463465555,
"grad_norm": 0.9800359606742859,
"learning_rate": 0.00017501987243268306,
"loss": 0.4061,
"step": 228
},
{
"epoch": 0.23903966597077245,
"grad_norm": 1.0288289785385132,
"learning_rate": 0.00017480034063431477,
"loss": 0.4,
"step": 229
},
{
"epoch": 0.24008350730688935,
"grad_norm": 0.9607250690460205,
"learning_rate": 0.00017457998737627182,
"loss": 0.3849,
"step": 230
},
{
"epoch": 0.24112734864300625,
"grad_norm": 0.9097132086753845,
"learning_rate": 0.00017435881507848046,
"loss": 0.4232,
"step": 231
},
{
"epoch": 0.24217118997912318,
"grad_norm": 0.9774672985076904,
"learning_rate": 0.00017413682616986185,
"loss": 0.4199,
"step": 232
},
{
"epoch": 0.24321503131524008,
"grad_norm": 0.9056701064109802,
"learning_rate": 0.00017391402308830503,
"loss": 0.3844,
"step": 233
},
{
"epoch": 0.24425887265135698,
"grad_norm": 0.9123631119728088,
"learning_rate": 0.00017369040828064047,
"loss": 0.3614,
"step": 234
},
{
"epoch": 0.2453027139874739,
"grad_norm": 0.8399245142936707,
"learning_rate": 0.00017346598420261294,
"loss": 0.3572,
"step": 235
},
{
"epoch": 0.24634655532359082,
"grad_norm": 1.1125223636627197,
"learning_rate": 0.00017324075331885466,
"loss": 0.3155,
"step": 236
},
{
"epoch": 0.24739039665970772,
"grad_norm": 0.7240562438964844,
"learning_rate": 0.00017301471810285822,
"loss": 0.281,
"step": 237
},
{
"epoch": 0.24843423799582465,
"grad_norm": 0.6814681887626648,
"learning_rate": 0.00017278788103694943,
"loss": 0.2605,
"step": 238
},
{
"epoch": 0.24947807933194155,
"grad_norm": 0.7790616154670715,
"learning_rate": 0.00017256024461226001,
"loss": 0.3214,
"step": 239
},
{
"epoch": 0.25052192066805845,
"grad_norm": 0.7958056926727295,
"learning_rate": 0.00017233181132870028,
"loss": 0.2668,
"step": 240
},
{
"epoch": 0.25052192066805845,
"eval_loss": 0.28679800033569336,
"eval_runtime": 81.356,
"eval_samples_per_second": 19.839,
"eval_steps_per_second": 9.919,
"step": 240
},
{
"epoch": 0.25156576200417535,
"grad_norm": 0.7587823271751404,
"learning_rate": 0.0001721025836949317,
"loss": 0.2911,
"step": 241
},
{
"epoch": 0.25260960334029225,
"grad_norm": 0.8206672668457031,
"learning_rate": 0.00017187256422833929,
"loss": 0.2602,
"step": 242
},
{
"epoch": 0.2536534446764092,
"grad_norm": 0.7913001179695129,
"learning_rate": 0.000171641755455004,
"loss": 0.242,
"step": 243
},
{
"epoch": 0.2546972860125261,
"grad_norm": 0.6242060661315918,
"learning_rate": 0.000171410159909675,
"loss": 0.2098,
"step": 244
},
{
"epoch": 0.255741127348643,
"grad_norm": 0.9254711270332336,
"learning_rate": 0.00017117778013574174,
"loss": 0.2028,
"step": 245
},
{
"epoch": 0.2567849686847599,
"grad_norm": 0.804876446723938,
"learning_rate": 0.00017094461868520622,
"loss": 0.215,
"step": 246
},
{
"epoch": 0.2578288100208768,
"grad_norm": 0.786629319190979,
"learning_rate": 0.00017071067811865476,
"loss": 0.2075,
"step": 247
},
{
"epoch": 0.2588726513569937,
"grad_norm": 0.9948318004608154,
"learning_rate": 0.0001704759610052299,
"loss": 0.243,
"step": 248
},
{
"epoch": 0.2599164926931106,
"grad_norm": 1.256179928779602,
"learning_rate": 0.00017024046992260237,
"loss": 0.2296,
"step": 249
},
{
"epoch": 0.2609603340292276,
"grad_norm": 0.9201668500900269,
"learning_rate": 0.00017000420745694254,
"loss": 0.1739,
"step": 250
},
{
"epoch": 0.2620041753653445,
"grad_norm": 0.8208956122398376,
"learning_rate": 0.0001697671762028922,
"loss": 0.4128,
"step": 251
},
{
"epoch": 0.2630480167014614,
"grad_norm": 0.8502789735794067,
"learning_rate": 0.00016952937876353597,
"loss": 0.4016,
"step": 252
},
{
"epoch": 0.2640918580375783,
"grad_norm": 1.1290831565856934,
"learning_rate": 0.00016929081775037276,
"loss": 0.442,
"step": 253
},
{
"epoch": 0.2651356993736952,
"grad_norm": 0.9651896953582764,
"learning_rate": 0.00016905149578328702,
"loss": 0.4346,
"step": 254
},
{
"epoch": 0.2661795407098121,
"grad_norm": 0.9524455070495605,
"learning_rate": 0.00016881141549052013,
"loss": 0.3441,
"step": 255
},
{
"epoch": 0.267223382045929,
"grad_norm": 0.8862738013267517,
"learning_rate": 0.00016857057950864132,
"loss": 0.3336,
"step": 256
},
{
"epoch": 0.26826722338204595,
"grad_norm": 0.8053174614906311,
"learning_rate": 0.00016832899048251884,
"loss": 0.3265,
"step": 257
},
{
"epoch": 0.26931106471816285,
"grad_norm": 0.7121575474739075,
"learning_rate": 0.00016808665106529094,
"loss": 0.3,
"step": 258
},
{
"epoch": 0.27035490605427975,
"grad_norm": 0.8516458868980408,
"learning_rate": 0.00016784356391833665,
"loss": 0.2941,
"step": 259
},
{
"epoch": 0.27139874739039666,
"grad_norm": 0.9382047653198242,
"learning_rate": 0.0001675997317112466,
"loss": 0.3625,
"step": 260
},
{
"epoch": 0.27244258872651356,
"grad_norm": 1.0630011558532715,
"learning_rate": 0.00016735515712179368,
"loss": 0.2875,
"step": 261
},
{
"epoch": 0.27348643006263046,
"grad_norm": 0.8051387071609497,
"learning_rate": 0.0001671098428359037,
"loss": 0.2498,
"step": 262
},
{
"epoch": 0.2745302713987474,
"grad_norm": 0.898259162902832,
"learning_rate": 0.00016686379154762574,
"loss": 0.2293,
"step": 263
},
{
"epoch": 0.2755741127348643,
"grad_norm": 0.6109540462493896,
"learning_rate": 0.00016661700595910285,
"loss": 0.2029,
"step": 264
},
{
"epoch": 0.2766179540709812,
"grad_norm": 0.5780929327011108,
"learning_rate": 0.0001663694887805419,
"loss": 0.1858,
"step": 265
},
{
"epoch": 0.2776617954070981,
"grad_norm": 0.5590531826019287,
"learning_rate": 0.0001661212427301844,
"loss": 0.1911,
"step": 266
},
{
"epoch": 0.278705636743215,
"grad_norm": 0.382966548204422,
"learning_rate": 0.00016587227053427612,
"loss": 0.114,
"step": 267
},
{
"epoch": 0.2797494780793319,
"grad_norm": 0.7528713941574097,
"learning_rate": 0.00016562257492703757,
"loss": 0.2135,
"step": 268
},
{
"epoch": 0.2807933194154488,
"grad_norm": 0.5933663845062256,
"learning_rate": 0.00016537215865063365,
"loss": 0.1993,
"step": 269
},
{
"epoch": 0.2818371607515658,
"grad_norm": 0.7138686776161194,
"learning_rate": 0.00016512102445514375,
"loss": 0.2874,
"step": 270
},
{
"epoch": 0.2828810020876827,
"grad_norm": 0.6350995898246765,
"learning_rate": 0.0001648691750985314,
"loss": 0.2225,
"step": 271
},
{
"epoch": 0.2839248434237996,
"grad_norm": 0.7503290772438049,
"learning_rate": 0.00016461661334661414,
"loss": 0.2687,
"step": 272
},
{
"epoch": 0.2849686847599165,
"grad_norm": 0.633063018321991,
"learning_rate": 0.00016436334197303295,
"loss": 0.1867,
"step": 273
},
{
"epoch": 0.2860125260960334,
"grad_norm": 0.750601053237915,
"learning_rate": 0.000164109363759222,
"loss": 0.1644,
"step": 274
},
{
"epoch": 0.2870563674321503,
"grad_norm": 0.9629106521606445,
"learning_rate": 0.000163854681494378,
"loss": 0.463,
"step": 275
},
{
"epoch": 0.2881002087682672,
"grad_norm": 1.2181180715560913,
"learning_rate": 0.00016359929797542944,
"loss": 0.497,
"step": 276
},
{
"epoch": 0.28914405010438415,
"grad_norm": 1.0051687955856323,
"learning_rate": 0.00016334321600700613,
"loss": 0.3734,
"step": 277
},
{
"epoch": 0.29018789144050106,
"grad_norm": 0.8457571268081665,
"learning_rate": 0.00016308643840140828,
"loss": 0.3634,
"step": 278
},
{
"epoch": 0.29123173277661796,
"grad_norm": 0.8827991485595703,
"learning_rate": 0.00016282896797857548,
"loss": 0.4218,
"step": 279
},
{
"epoch": 0.29227557411273486,
"grad_norm": 0.767482578754425,
"learning_rate": 0.000162570807566056,
"loss": 0.3099,
"step": 280
},
{
"epoch": 0.29331941544885176,
"grad_norm": 0.9973590970039368,
"learning_rate": 0.00016231195999897558,
"loss": 0.3751,
"step": 281
},
{
"epoch": 0.29436325678496866,
"grad_norm": 1.094586730003357,
"learning_rate": 0.00016205242812000617,
"loss": 0.3329,
"step": 282
},
{
"epoch": 0.2954070981210856,
"grad_norm": 1.1051408052444458,
"learning_rate": 0.0001617922147793351,
"loss": 0.3888,
"step": 283
},
{
"epoch": 0.2964509394572025,
"grad_norm": 1.1265891790390015,
"learning_rate": 0.00016153132283463326,
"loss": 0.3881,
"step": 284
},
{
"epoch": 0.2974947807933194,
"grad_norm": 0.9808762669563293,
"learning_rate": 0.00016126975515102422,
"loss": 0.3608,
"step": 285
},
{
"epoch": 0.2985386221294363,
"grad_norm": 0.9096065759658813,
"learning_rate": 0.00016100751460105243,
"loss": 0.3036,
"step": 286
},
{
"epoch": 0.29958246346555323,
"grad_norm": 0.7918301224708557,
"learning_rate": 0.0001607446040646518,
"loss": 0.265,
"step": 287
},
{
"epoch": 0.30062630480167013,
"grad_norm": 0.8781275153160095,
"learning_rate": 0.00016048102642911397,
"loss": 0.3192,
"step": 288
},
{
"epoch": 0.30167014613778703,
"grad_norm": 0.7994233965873718,
"learning_rate": 0.00016021678458905684,
"loss": 0.2619,
"step": 289
},
{
"epoch": 0.302713987473904,
"grad_norm": 0.7719682455062866,
"learning_rate": 0.0001599518814463925,
"loss": 0.2373,
"step": 290
},
{
"epoch": 0.3037578288100209,
"grad_norm": 0.7163065075874329,
"learning_rate": 0.00015968631991029555,
"loss": 0.2461,
"step": 291
},
{
"epoch": 0.3048016701461378,
"grad_norm": 0.6648116707801819,
"learning_rate": 0.00015942010289717105,
"loss": 0.2033,
"step": 292
},
{
"epoch": 0.3058455114822547,
"grad_norm": 0.7089317440986633,
"learning_rate": 0.00015915323333062255,
"loss": 0.2308,
"step": 293
},
{
"epoch": 0.3068893528183716,
"grad_norm": 0.6273900270462036,
"learning_rate": 0.00015888571414141996,
"loss": 0.1762,
"step": 294
},
{
"epoch": 0.3079331941544885,
"grad_norm": 0.570709228515625,
"learning_rate": 0.00015861754826746734,
"loss": 0.1612,
"step": 295
},
{
"epoch": 0.3089770354906054,
"grad_norm": 0.7131444811820984,
"learning_rate": 0.00015834873865377077,
"loss": 0.1748,
"step": 296
},
{
"epoch": 0.31002087682672236,
"grad_norm": 0.857348620891571,
"learning_rate": 0.00015807928825240566,
"loss": 0.2076,
"step": 297
},
{
"epoch": 0.31106471816283926,
"grad_norm": 0.6636834740638733,
"learning_rate": 0.00015780920002248484,
"loss": 0.1577,
"step": 298
},
{
"epoch": 0.31210855949895616,
"grad_norm": 0.8303975462913513,
"learning_rate": 0.00015753847693012566,
"loss": 0.1722,
"step": 299
},
{
"epoch": 0.31315240083507306,
"grad_norm": 1.009426474571228,
"learning_rate": 0.00015726712194841756,
"loss": 0.181,
"step": 300
},
{
"epoch": 0.31419624217118997,
"grad_norm": 0.873030960559845,
"learning_rate": 0.0001569951380573894,
"loss": 0.4061,
"step": 301
},
{
"epoch": 0.31524008350730687,
"grad_norm": 0.8714029788970947,
"learning_rate": 0.0001567225282439768,
"loss": 0.4541,
"step": 302
},
{
"epoch": 0.3162839248434238,
"grad_norm": 1.2070977687835693,
"learning_rate": 0.0001564492955019892,
"loss": 0.444,
"step": 303
},
{
"epoch": 0.3173277661795407,
"grad_norm": 1.1216182708740234,
"learning_rate": 0.0001561754428320771,
"loss": 0.378,
"step": 304
},
{
"epoch": 0.31837160751565763,
"grad_norm": 1.0232008695602417,
"learning_rate": 0.00015590097324169909,
"loss": 0.3044,
"step": 305
},
{
"epoch": 0.31941544885177453,
"grad_norm": 0.8639291524887085,
"learning_rate": 0.00015562588974508872,
"loss": 0.3222,
"step": 306
},
{
"epoch": 0.32045929018789143,
"grad_norm": 0.7577306032180786,
"learning_rate": 0.00015535019536322157,
"loss": 0.3346,
"step": 307
},
{
"epoch": 0.32150313152400833,
"grad_norm": 0.7879360914230347,
"learning_rate": 0.00015507389312378197,
"loss": 0.2889,
"step": 308
},
{
"epoch": 0.32254697286012524,
"grad_norm": 0.8477923274040222,
"learning_rate": 0.0001547969860611297,
"loss": 0.2948,
"step": 309
},
{
"epoch": 0.3235908141962422,
"grad_norm": 0.8615967631340027,
"learning_rate": 0.00015451947721626676,
"loss": 0.3135,
"step": 310
},
{
"epoch": 0.3246346555323591,
"grad_norm": 1.0251150131225586,
"learning_rate": 0.00015424136963680397,
"loss": 0.2807,
"step": 311
},
{
"epoch": 0.325678496868476,
"grad_norm": 0.8408343195915222,
"learning_rate": 0.00015396266637692743,
"loss": 0.2879,
"step": 312
},
{
"epoch": 0.3267223382045929,
"grad_norm": 0.7773711681365967,
"learning_rate": 0.00015368337049736502,
"loss": 0.2457,
"step": 313
},
{
"epoch": 0.3277661795407098,
"grad_norm": 0.797435462474823,
"learning_rate": 0.00015340348506535283,
"loss": 0.259,
"step": 314
},
{
"epoch": 0.3288100208768267,
"grad_norm": 0.8681854605674744,
"learning_rate": 0.00015312301315460137,
"loss": 0.2163,
"step": 315
},
{
"epoch": 0.3298538622129436,
"grad_norm": 0.8305045962333679,
"learning_rate": 0.00015284195784526195,
"loss": 0.1331,
"step": 316
},
{
"epoch": 0.33089770354906056,
"grad_norm": 0.4855158030986786,
"learning_rate": 0.00015256032222389277,
"loss": 0.153,
"step": 317
},
{
"epoch": 0.33194154488517746,
"grad_norm": 0.4702201783657074,
"learning_rate": 0.00015227810938342492,
"loss": 0.1527,
"step": 318
},
{
"epoch": 0.33298538622129437,
"grad_norm": 0.6600713133811951,
"learning_rate": 0.0001519953224231287,
"loss": 0.2761,
"step": 319
},
{
"epoch": 0.33402922755741127,
"grad_norm": 0.5896007418632507,
"learning_rate": 0.00015171196444857933,
"loss": 0.1978,
"step": 320
},
{
"epoch": 0.33507306889352817,
"grad_norm": 0.7142235040664673,
"learning_rate": 0.0001514280385716229,
"loss": 0.2759,
"step": 321
},
{
"epoch": 0.33611691022964507,
"grad_norm": 0.6415271759033203,
"learning_rate": 0.00015114354791034225,
"loss": 0.265,
"step": 322
},
{
"epoch": 0.33716075156576203,
"grad_norm": 0.6545302867889404,
"learning_rate": 0.00015085849558902264,
"loss": 0.2221,
"step": 323
},
{
"epoch": 0.33820459290187893,
"grad_norm": 0.7755191922187805,
"learning_rate": 0.00015057288473811772,
"loss": 0.2827,
"step": 324
},
{
"epoch": 0.33924843423799583,
"grad_norm": 0.960443913936615,
"learning_rate": 0.00015028671849421464,
"loss": 0.4552,
"step": 325
},
{
"epoch": 0.34029227557411273,
"grad_norm": 1.016601324081421,
"learning_rate": 0.00015000000000000001,
"loss": 0.4142,
"step": 326
},
{
"epoch": 0.34133611691022964,
"grad_norm": 1.2163599729537964,
"learning_rate": 0.00014971273240422535,
"loss": 0.5009,
"step": 327
},
{
"epoch": 0.34237995824634654,
"grad_norm": 1.181193470954895,
"learning_rate": 0.0001494249188616723,
"loss": 0.5031,
"step": 328
},
{
"epoch": 0.34342379958246344,
"grad_norm": 0.8927332758903503,
"learning_rate": 0.0001491365625331182,
"loss": 0.3835,
"step": 329
},
{
"epoch": 0.3444676409185804,
"grad_norm": 0.9175090789794922,
"learning_rate": 0.00014884766658530125,
"loss": 0.4822,
"step": 330
},
{
"epoch": 0.3455114822546973,
"grad_norm": 0.8441628813743591,
"learning_rate": 0.00014855823419088576,
"loss": 0.3455,
"step": 331
},
{
"epoch": 0.3465553235908142,
"grad_norm": 0.7784302234649658,
"learning_rate": 0.00014826826852842726,
"loss": 0.3359,
"step": 332
},
{
"epoch": 0.3475991649269311,
"grad_norm": 1.0156643390655518,
"learning_rate": 0.00014797777278233778,
"loss": 0.4488,
"step": 333
},
{
"epoch": 0.348643006263048,
"grad_norm": 0.9029203057289124,
"learning_rate": 0.00014768675014285062,
"loss": 0.3175,
"step": 334
},
{
"epoch": 0.3496868475991649,
"grad_norm": 0.8379835486412048,
"learning_rate": 0.0001473952038059855,
"loss": 0.352,
"step": 335
},
{
"epoch": 0.35073068893528186,
"grad_norm": 0.807041585445404,
"learning_rate": 0.00014710313697351341,
"loss": 0.3151,
"step": 336
},
{
"epoch": 0.35177453027139877,
"grad_norm": 0.7760060429573059,
"learning_rate": 0.0001468105528529214,
"loss": 0.3397,
"step": 337
},
{
"epoch": 0.35281837160751567,
"grad_norm": 0.7737391591072083,
"learning_rate": 0.00014651745465737737,
"loss": 0.3054,
"step": 338
},
{
"epoch": 0.35386221294363257,
"grad_norm": 0.7160333395004272,
"learning_rate": 0.00014622384560569493,
"loss": 0.2271,
"step": 339
},
{
"epoch": 0.35490605427974947,
"grad_norm": 0.7895155549049377,
"learning_rate": 0.00014592972892229778,
"loss": 0.2717,
"step": 340
},
{
"epoch": 0.3559498956158664,
"grad_norm": 0.6400478482246399,
"learning_rate": 0.00014563510783718457,
"loss": 0.2212,
"step": 341
},
{
"epoch": 0.3569937369519833,
"grad_norm": 0.7159572243690491,
"learning_rate": 0.0001453399855858932,
"loss": 0.2192,
"step": 342
},
{
"epoch": 0.35803757828810023,
"grad_norm": 0.8888924717903137,
"learning_rate": 0.00014504436540946548,
"loss": 0.269,
"step": 343
},
{
"epoch": 0.35908141962421714,
"grad_norm": 0.8276658058166504,
"learning_rate": 0.00014474825055441136,
"loss": 0.2466,
"step": 344
},
{
"epoch": 0.36012526096033404,
"grad_norm": 0.690984308719635,
"learning_rate": 0.00014445164427267344,
"loss": 0.1649,
"step": 345
},
{
"epoch": 0.36116910229645094,
"grad_norm": 0.7650169730186462,
"learning_rate": 0.0001441545498215912,
"loss": 0.2323,
"step": 346
},
{
"epoch": 0.36221294363256784,
"grad_norm": 0.8575247526168823,
"learning_rate": 0.00014385697046386512,
"loss": 0.1856,
"step": 347
},
{
"epoch": 0.36325678496868474,
"grad_norm": 0.9500517249107361,
"learning_rate": 0.00014355890946752102,
"loss": 0.2059,
"step": 348
},
{
"epoch": 0.36430062630480164,
"grad_norm": 0.7111496925354004,
"learning_rate": 0.00014326037010587404,
"loss": 0.144,
"step": 349
},
{
"epoch": 0.3653444676409186,
"grad_norm": 0.9617753624916077,
"learning_rate": 0.0001429613556574928,
"loss": 0.2208,
"step": 350
},
{
"epoch": 0.3663883089770355,
"grad_norm": 0.7692716717720032,
"learning_rate": 0.00014266186940616328,
"loss": 0.4445,
"step": 351
},
{
"epoch": 0.3674321503131524,
"grad_norm": 0.742679238319397,
"learning_rate": 0.00014236191464085286,
"loss": 0.3707,
"step": 352
},
{
"epoch": 0.3684759916492693,
"grad_norm": 0.928932785987854,
"learning_rate": 0.00014206149465567403,
"loss": 0.4158,
"step": 353
},
{
"epoch": 0.3695198329853862,
"grad_norm": 1.0475050210952759,
"learning_rate": 0.00014176061274984858,
"loss": 0.4644,
"step": 354
},
{
"epoch": 0.3705636743215031,
"grad_norm": 0.9185068607330322,
"learning_rate": 0.0001414592722276709,
"loss": 0.4066,
"step": 355
},
{
"epoch": 0.37160751565762007,
"grad_norm": 0.8016102910041809,
"learning_rate": 0.00014115747639847204,
"loss": 0.3827,
"step": 356
},
{
"epoch": 0.37265135699373697,
"grad_norm": 0.689323365688324,
"learning_rate": 0.0001408552285765832,
"loss": 0.3004,
"step": 357
},
{
"epoch": 0.3736951983298539,
"grad_norm": 0.721184253692627,
"learning_rate": 0.00014055253208129938,
"loss": 0.311,
"step": 358
},
{
"epoch": 0.3747390396659708,
"grad_norm": 0.8813337683677673,
"learning_rate": 0.00014024939023684298,
"loss": 0.3493,
"step": 359
},
{
"epoch": 0.3757828810020877,
"grad_norm": 0.8543792366981506,
"learning_rate": 0.00013994580637232716,
"loss": 0.2636,
"step": 360
},
{
"epoch": 0.3768267223382046,
"grad_norm": 0.8040288686752319,
"learning_rate": 0.00013964178382171942,
"loss": 0.2661,
"step": 361
},
{
"epoch": 0.3778705636743215,
"grad_norm": 0.7446701526641846,
"learning_rate": 0.00013933732592380483,
"loss": 0.2464,
"step": 362
},
{
"epoch": 0.37891440501043844,
"grad_norm": 0.7692604064941406,
"learning_rate": 0.0001390324360221496,
"loss": 0.2179,
"step": 363
},
{
"epoch": 0.37995824634655534,
"grad_norm": 0.6927527785301208,
"learning_rate": 0.00013872711746506413,
"loss": 0.1913,
"step": 364
},
{
"epoch": 0.38100208768267224,
"grad_norm": 0.750212550163269,
"learning_rate": 0.00013842137360556628,
"loss": 0.1723,
"step": 365
},
{
"epoch": 0.38204592901878914,
"grad_norm": 0.7114387154579163,
"learning_rate": 0.0001381152078013447,
"loss": 0.1899,
"step": 366
},
{
"epoch": 0.38308977035490605,
"grad_norm": 0.671464741230011,
"learning_rate": 0.00013780862341472182,
"loss": 0.2059,
"step": 367
},
{
"epoch": 0.38413361169102295,
"grad_norm": 0.6177671551704407,
"learning_rate": 0.00013750162381261693,
"loss": 0.1888,
"step": 368
},
{
"epoch": 0.38517745302713985,
"grad_norm": 0.7235206365585327,
"learning_rate": 0.0001371942123665092,
"loss": 0.2612,
"step": 369
},
{
"epoch": 0.3862212943632568,
"grad_norm": 0.5639315843582153,
"learning_rate": 0.00013688639245240078,
"loss": 0.2214,
"step": 370
},
{
"epoch": 0.3872651356993737,
"grad_norm": 0.5648091435432434,
"learning_rate": 0.00013657816745077955,
"loss": 0.2087,
"step": 371
},
{
"epoch": 0.3883089770354906,
"grad_norm": 0.6048844456672668,
"learning_rate": 0.0001362695407465821,
"loss": 0.2312,
"step": 372
},
{
"epoch": 0.3893528183716075,
"grad_norm": 0.48515263199806213,
"learning_rate": 0.0001359605157291565,
"loss": 0.1624,
"step": 373
},
{
"epoch": 0.3903966597077244,
"grad_norm": 0.6982168555259705,
"learning_rate": 0.0001356510957922251,
"loss": 0.2632,
"step": 374
},
{
"epoch": 0.3914405010438413,
"grad_norm": 0.9829317331314087,
"learning_rate": 0.0001353412843338474,
"loss": 0.3995,
"step": 375
},
{
"epoch": 0.3924843423799583,
"grad_norm": 1.1811579465866089,
"learning_rate": 0.00013503108475638244,
"loss": 0.4381,
"step": 376
},
{
"epoch": 0.3935281837160752,
"grad_norm": 1.0763869285583496,
"learning_rate": 0.00013472050046645166,
"loss": 0.4366,
"step": 377
},
{
"epoch": 0.3945720250521921,
"grad_norm": 0.8847516179084778,
"learning_rate": 0.00013440953487490144,
"loss": 0.3637,
"step": 378
},
{
"epoch": 0.395615866388309,
"grad_norm": 0.8159587979316711,
"learning_rate": 0.0001340981913967656,
"loss": 0.3771,
"step": 379
},
{
"epoch": 0.3966597077244259,
"grad_norm": 0.8111384510993958,
"learning_rate": 0.00013378647345122795,
"loss": 0.3473,
"step": 380
},
{
"epoch": 0.3977035490605428,
"grad_norm": 0.8355801105499268,
"learning_rate": 0.00013347438446158466,
"loss": 0.381,
"step": 381
},
{
"epoch": 0.3987473903966597,
"grad_norm": 0.7971011400222778,
"learning_rate": 0.0001331619278552068,
"loss": 0.3019,
"step": 382
},
{
"epoch": 0.39979123173277664,
"grad_norm": 0.8738229274749756,
"learning_rate": 0.00013284910706350247,
"loss": 0.3766,
"step": 383
},
{
"epoch": 0.40083507306889354,
"grad_norm": 0.8685609698295593,
"learning_rate": 0.0001325359255218795,
"loss": 0.3371,
"step": 384
},
{
"epoch": 0.40187891440501045,
"grad_norm": 0.8119929432868958,
"learning_rate": 0.00013222238666970728,
"loss": 0.3047,
"step": 385
},
{
"epoch": 0.40292275574112735,
"grad_norm": 0.831425130367279,
"learning_rate": 0.00013190849395027928,
"loss": 0.3241,
"step": 386
},
{
"epoch": 0.40396659707724425,
"grad_norm": 0.8819262981414795,
"learning_rate": 0.0001315942508107751,
"loss": 0.2986,
"step": 387
},
{
"epoch": 0.40501043841336115,
"grad_norm": 0.8911377787590027,
"learning_rate": 0.00013127966070222274,
"loss": 0.3614,
"step": 388
},
{
"epoch": 0.40605427974947805,
"grad_norm": 0.7478219866752625,
"learning_rate": 0.00013096472707946056,
"loss": 0.2861,
"step": 389
},
{
"epoch": 0.407098121085595,
"grad_norm": 0.8377799391746521,
"learning_rate": 0.00013064945340109948,
"loss": 0.2854,
"step": 390
},
{
"epoch": 0.4081419624217119,
"grad_norm": 0.7409958839416504,
"learning_rate": 0.00013033384312948488,
"loss": 0.2589,
"step": 391
},
{
"epoch": 0.4091858037578288,
"grad_norm": 0.6093199849128723,
"learning_rate": 0.00013001789973065853,
"loss": 0.2124,
"step": 392
},
{
"epoch": 0.4102296450939457,
"grad_norm": 0.5660393834114075,
"learning_rate": 0.00012970162667432075,
"loss": 0.1718,
"step": 393
},
{
"epoch": 0.4112734864300626,
"grad_norm": 0.7063373923301697,
"learning_rate": 0.00012938502743379212,
"loss": 0.2245,
"step": 394
},
{
"epoch": 0.4123173277661795,
"grad_norm": 0.8215602040290833,
"learning_rate": 0.00012906810548597532,
"loss": 0.2442,
"step": 395
},
{
"epoch": 0.4133611691022965,
"grad_norm": 0.6678823828697205,
"learning_rate": 0.00012875086431131716,
"loss": 0.1891,
"step": 396
},
{
"epoch": 0.4144050104384134,
"grad_norm": 0.6394354701042175,
"learning_rate": 0.00012843330739377,
"loss": 0.2203,
"step": 397
},
{
"epoch": 0.4154488517745303,
"grad_norm": 0.7983390688896179,
"learning_rate": 0.00012811543822075397,
"loss": 0.2113,
"step": 398
},
{
"epoch": 0.4164926931106472,
"grad_norm": 0.611224889755249,
"learning_rate": 0.0001277972602831181,
"loss": 0.1616,
"step": 399
},
{
"epoch": 0.4175365344467641,
"grad_norm": 0.9979553818702698,
"learning_rate": 0.00012747877707510252,
"loss": 0.1953,
"step": 400
},
{
"epoch": 0.418580375782881,
"grad_norm": 0.5957475304603577,
"learning_rate": 0.00012715999209429973,
"loss": 0.3356,
"step": 401
},
{
"epoch": 0.4196242171189979,
"grad_norm": 0.6433320641517639,
"learning_rate": 0.00012684090884161636,
"loss": 0.3177,
"step": 402
},
{
"epoch": 0.42066805845511485,
"grad_norm": 0.8637453317642212,
"learning_rate": 0.00012652153082123456,
"loss": 0.4008,
"step": 403
},
{
"epoch": 0.42171189979123175,
"grad_norm": 0.6944557428359985,
"learning_rate": 0.00012620186154057382,
"loss": 0.3198,
"step": 404
},
{
"epoch": 0.42275574112734865,
"grad_norm": 0.8894799947738647,
"learning_rate": 0.00012588190451025207,
"loss": 0.3917,
"step": 405
},
{
"epoch": 0.42379958246346555,
"grad_norm": 0.7825080156326294,
"learning_rate": 0.0001255616632440475,
"loss": 0.3679,
"step": 406
},
{
"epoch": 0.42484342379958245,
"grad_norm": 0.6816397309303284,
"learning_rate": 0.00012524114125885957,
"loss": 0.2855,
"step": 407
},
{
"epoch": 0.42588726513569936,
"grad_norm": 0.7824950218200684,
"learning_rate": 0.0001249203420746708,
"loss": 0.305,
"step": 408
},
{
"epoch": 0.42693110647181626,
"grad_norm": 0.7584050297737122,
"learning_rate": 0.0001245992692145078,
"loss": 0.3123,
"step": 409
},
{
"epoch": 0.4279749478079332,
"grad_norm": 0.6384229063987732,
"learning_rate": 0.00012427792620440278,
"loss": 0.233,
"step": 410
},
{
"epoch": 0.4290187891440501,
"grad_norm": 0.7239426374435425,
"learning_rate": 0.00012395631657335468,
"loss": 0.216,
"step": 411
},
{
"epoch": 0.430062630480167,
"grad_norm": 0.6760838627815247,
"learning_rate": 0.0001236344438532905,
"loss": 0.2092,
"step": 412
},
{
"epoch": 0.4311064718162839,
"grad_norm": 0.5736236572265625,
"learning_rate": 0.00012331231157902648,
"loss": 0.1725,
"step": 413
},
{
"epoch": 0.4321503131524008,
"grad_norm": 0.5566690564155579,
"learning_rate": 0.00012298992328822937,
"loss": 0.1645,
"step": 414
},
{
"epoch": 0.4331941544885177,
"grad_norm": 0.5182932019233704,
"learning_rate": 0.00012266728252137733,
"loss": 0.1484,
"step": 415
},
{
"epoch": 0.4342379958246347,
"grad_norm": 0.6892213821411133,
"learning_rate": 0.00012234439282172142,
"loss": 0.1422,
"step": 416
},
{
"epoch": 0.4352818371607516,
"grad_norm": 0.46982264518737793,
"learning_rate": 0.0001220212577352464,
"loss": 0.1292,
"step": 417
},
{
"epoch": 0.4363256784968685,
"grad_norm": 0.5547206401824951,
"learning_rate": 0.0001216978808106318,
"loss": 0.1813,
"step": 418
},
{
"epoch": 0.4373695198329854,
"grad_norm": 0.6126968264579773,
"learning_rate": 0.00012137426559921316,
"loss": 0.1804,
"step": 419
},
{
"epoch": 0.4384133611691023,
"grad_norm": 0.6041930317878723,
"learning_rate": 0.0001210504156549428,
"loss": 0.2519,
"step": 420
},
{
"epoch": 0.4394572025052192,
"grad_norm": 0.6091843843460083,
"learning_rate": 0.00012072633453435091,
"loss": 0.2423,
"step": 421
},
{
"epoch": 0.4405010438413361,
"grad_norm": 0.5664049983024597,
"learning_rate": 0.00012040202579650648,
"loss": 0.2357,
"step": 422
},
{
"epoch": 0.44154488517745305,
"grad_norm": 0.4431195855140686,
"learning_rate": 0.00012007749300297817,
"loss": 0.1704,
"step": 423
},
{
"epoch": 0.44258872651356995,
"grad_norm": 0.5842788815498352,
"learning_rate": 0.00011975273971779528,
"loss": 0.1979,
"step": 424
},
{
"epoch": 0.44363256784968685,
"grad_norm": 0.8385937809944153,
"learning_rate": 0.00011942776950740848,
"loss": 0.4167,
"step": 425
},
{
"epoch": 0.44467640918580376,
"grad_norm": 0.8559632301330566,
"learning_rate": 0.00011910258594065078,
"loss": 0.4068,
"step": 426
},
{
"epoch": 0.44572025052192066,
"grad_norm": 0.8821555972099304,
"learning_rate": 0.00011877719258869826,
"loss": 0.3839,
"step": 427
},
{
"epoch": 0.44676409185803756,
"grad_norm": 0.8119932413101196,
"learning_rate": 0.00011845159302503086,
"loss": 0.3336,
"step": 428
},
{
"epoch": 0.44780793319415446,
"grad_norm": 0.8578853011131287,
"learning_rate": 0.00011812579082539317,
"loss": 0.4184,
"step": 429
},
{
"epoch": 0.4488517745302714,
"grad_norm": 0.7293173670768738,
"learning_rate": 0.00011779978956775506,
"loss": 0.3733,
"step": 430
},
{
"epoch": 0.4498956158663883,
"grad_norm": 0.8195939064025879,
"learning_rate": 0.00011747359283227251,
"loss": 0.3646,
"step": 431
},
{
"epoch": 0.4509394572025052,
"grad_norm": 0.7359249591827393,
"learning_rate": 0.00011714720420124831,
"loss": 0.3201,
"step": 432
},
{
"epoch": 0.4519832985386221,
"grad_norm": 0.8084685206413269,
"learning_rate": 0.00011682062725909258,
"loss": 0.3537,
"step": 433
},
{
"epoch": 0.453027139874739,
"grad_norm": 0.7745420336723328,
"learning_rate": 0.00011649386559228341,
"loss": 0.2933,
"step": 434
},
{
"epoch": 0.45407098121085593,
"grad_norm": 0.81271892786026,
"learning_rate": 0.00011616692278932772,
"loss": 0.2821,
"step": 435
},
{
"epoch": 0.4551148225469729,
"grad_norm": 0.7679653167724609,
"learning_rate": 0.0001158398024407215,
"loss": 0.2973,
"step": 436
},
{
"epoch": 0.4561586638830898,
"grad_norm": 0.826492190361023,
"learning_rate": 0.00011551250813891066,
"loss": 0.311,
"step": 437
},
{
"epoch": 0.4572025052192067,
"grad_norm": 0.766703724861145,
"learning_rate": 0.00011518504347825145,
"loss": 0.2606,
"step": 438
},
{
"epoch": 0.4582463465553236,
"grad_norm": 0.7974645495414734,
"learning_rate": 0.00011485741205497094,
"loss": 0.2843,
"step": 439
},
{
"epoch": 0.4592901878914405,
"grad_norm": 0.6922892928123474,
"learning_rate": 0.0001145296174671277,
"loss": 0.2178,
"step": 440
},
{
"epoch": 0.4603340292275574,
"grad_norm": 0.6793475151062012,
"learning_rate": 0.00011420166331457207,
"loss": 0.2221,
"step": 441
},
{
"epoch": 0.4613778705636743,
"grad_norm": 0.9168251752853394,
"learning_rate": 0.00011387355319890685,
"loss": 0.2686,
"step": 442
},
{
"epoch": 0.46242171189979125,
"grad_norm": 0.7042071223258972,
"learning_rate": 0.00011354529072344748,
"loss": 0.2703,
"step": 443
},
{
"epoch": 0.46346555323590816,
"grad_norm": 0.6087216734886169,
"learning_rate": 0.00011321687949318276,
"loss": 0.2095,
"step": 444
},
{
"epoch": 0.46450939457202506,
"grad_norm": 0.5309197306632996,
"learning_rate": 0.00011288832311473508,
"loss": 0.188,
"step": 445
},
{
"epoch": 0.46555323590814196,
"grad_norm": 0.5840495824813843,
"learning_rate": 0.00011255962519632081,
"loss": 0.1569,
"step": 446
},
{
"epoch": 0.46659707724425886,
"grad_norm": 0.541627049446106,
"learning_rate": 0.00011223078934771079,
"loss": 0.1778,
"step": 447
},
{
"epoch": 0.46764091858037576,
"grad_norm": 0.4946768581867218,
"learning_rate": 0.00011190181918019049,
"loss": 0.1729,
"step": 448
},
{
"epoch": 0.46868475991649267,
"grad_norm": 0.4716700613498688,
"learning_rate": 0.00011157271830652062,
"loss": 0.1518,
"step": 449
},
{
"epoch": 0.4697286012526096,
"grad_norm": 0.8532614707946777,
"learning_rate": 0.00011124349034089723,
"loss": 0.1282,
"step": 450
},
{
"epoch": 0.4707724425887265,
"grad_norm": 0.7531268000602722,
"learning_rate": 0.00011091413889891211,
"loss": 0.3468,
"step": 451
},
{
"epoch": 0.4718162839248434,
"grad_norm": 0.7047679424285889,
"learning_rate": 0.00011058466759751302,
"loss": 0.3667,
"step": 452
},
{
"epoch": 0.47286012526096033,
"grad_norm": 0.7256395220756531,
"learning_rate": 0.00011025508005496417,
"loss": 0.3224,
"step": 453
},
{
"epoch": 0.47390396659707723,
"grad_norm": 0.7575612664222717,
"learning_rate": 0.00010992537989080618,
"loss": 0.3498,
"step": 454
},
{
"epoch": 0.47494780793319413,
"grad_norm": 0.7202345132827759,
"learning_rate": 0.00010959557072581652,
"loss": 0.3282,
"step": 455
},
{
"epoch": 0.4759916492693111,
"grad_norm": 0.6914469599723816,
"learning_rate": 0.00010926565618196978,
"loss": 0.2925,
"step": 456
},
{
"epoch": 0.477035490605428,
"grad_norm": 0.7601653337478638,
"learning_rate": 0.00010893563988239772,
"loss": 0.3728,
"step": 457
},
{
"epoch": 0.4780793319415449,
"grad_norm": 0.756959080696106,
"learning_rate": 0.0001086055254513497,
"loss": 0.3186,
"step": 458
},
{
"epoch": 0.4791231732776618,
"grad_norm": 0.6831420063972473,
"learning_rate": 0.00010827531651415266,
"loss": 0.2786,
"step": 459
},
{
"epoch": 0.4801670146137787,
"grad_norm": 0.6854783296585083,
"learning_rate": 0.00010794501669717145,
"loss": 0.2334,
"step": 460
},
{
"epoch": 0.4812108559498956,
"grad_norm": 0.667158305644989,
"learning_rate": 0.00010761462962776897,
"loss": 0.1972,
"step": 461
},
{
"epoch": 0.4822546972860125,
"grad_norm": 0.6852616667747498,
"learning_rate": 0.00010728415893426635,
"loss": 0.2318,
"step": 462
},
{
"epoch": 0.48329853862212946,
"grad_norm": 0.6210921406745911,
"learning_rate": 0.00010695360824590303,
"loss": 0.2239,
"step": 463
},
{
"epoch": 0.48434237995824636,
"grad_norm": 0.5915560722351074,
"learning_rate": 0.00010662298119279701,
"loss": 0.1759,
"step": 464
},
{
"epoch": 0.48538622129436326,
"grad_norm": 0.5121853351593018,
"learning_rate": 0.00010629228140590486,
"loss": 0.1742,
"step": 465
},
{
"epoch": 0.48643006263048016,
"grad_norm": 0.6679707169532776,
"learning_rate": 0.00010596151251698199,
"loss": 0.1989,
"step": 466
},
{
"epoch": 0.48747390396659707,
"grad_norm": 0.5263128280639648,
"learning_rate": 0.00010563067815854266,
"loss": 0.1344,
"step": 467
},
{
"epoch": 0.48851774530271397,
"grad_norm": 0.6040886640548706,
"learning_rate": 0.00010529978196382011,
"loss": 0.1759,
"step": 468
},
{
"epoch": 0.48956158663883087,
"grad_norm": 0.4196336269378662,
"learning_rate": 0.00010496882756672666,
"loss": 0.1153,
"step": 469
},
{
"epoch": 0.4906054279749478,
"grad_norm": 0.6484777927398682,
"learning_rate": 0.00010463781860181385,
"loss": 0.2986,
"step": 470
},
{
"epoch": 0.49164926931106473,
"grad_norm": 0.651329755783081,
"learning_rate": 0.00010430675870423246,
"loss": 0.2466,
"step": 471
},
{
"epoch": 0.49269311064718163,
"grad_norm": 0.5095123648643494,
"learning_rate": 0.0001039756515096926,
"loss": 0.199,
"step": 472
},
{
"epoch": 0.49373695198329853,
"grad_norm": 0.5218927264213562,
"learning_rate": 0.00010364450065442377,
"loss": 0.1569,
"step": 473
},
{
"epoch": 0.49478079331941544,
"grad_norm": 0.5320420861244202,
"learning_rate": 0.00010331330977513509,
"loss": 0.1954,
"step": 474
},
{
"epoch": 0.49582463465553234,
"grad_norm": 0.7471246719360352,
"learning_rate": 0.00010298208250897503,
"loss": 0.3619,
"step": 475
},
{
"epoch": 0.4968684759916493,
"grad_norm": 0.7712672352790833,
"learning_rate": 0.00010265082249349187,
"loss": 0.2926,
"step": 476
},
{
"epoch": 0.4979123173277662,
"grad_norm": 0.8972386717796326,
"learning_rate": 0.00010231953336659334,
"loss": 0.3387,
"step": 477
},
{
"epoch": 0.4989561586638831,
"grad_norm": 0.827797532081604,
"learning_rate": 0.00010198821876650701,
"loss": 0.3206,
"step": 478
},
{
"epoch": 0.5,
"grad_norm": 0.8829711675643921,
"learning_rate": 0.00010165688233174017,
"loss": 0.3861,
"step": 479
},
{
"epoch": 0.5010438413361169,
"grad_norm": 0.8328503370285034,
"learning_rate": 0.00010132552770103987,
"loss": 0.4158,
"step": 480
},
{
"epoch": 0.5010438413361169,
"eval_loss": 0.2552998960018158,
"eval_runtime": 81.2486,
"eval_samples_per_second": 19.865,
"eval_steps_per_second": 9.932,
"step": 480
},
{
"epoch": 0.5020876826722338,
"grad_norm": 0.7070721387863159,
"learning_rate": 0.00010099415851335299,
"loss": 0.3174,
"step": 481
},
{
"epoch": 0.5031315240083507,
"grad_norm": 0.7391024827957153,
"learning_rate": 0.00010066277840778626,
"loss": 0.3442,
"step": 482
},
{
"epoch": 0.5041753653444676,
"grad_norm": 0.7629324793815613,
"learning_rate": 0.00010033139102356642,
"loss": 0.3439,
"step": 483
},
{
"epoch": 0.5052192066805845,
"grad_norm": 0.7324389219284058,
"learning_rate": 0.0001,
"loss": 0.3063,
"step": 484
},
{
"epoch": 0.5062630480167014,
"grad_norm": 0.6402798295021057,
"learning_rate": 9.966860897643359e-05,
"loss": 0.2383,
"step": 485
},
{
"epoch": 0.5073068893528184,
"grad_norm": 0.7618774771690369,
"learning_rate": 9.933722159221376e-05,
"loss": 0.3004,
"step": 486
},
{
"epoch": 0.5083507306889353,
"grad_norm": 0.8296042680740356,
"learning_rate": 9.900584148664704e-05,
"loss": 0.3208,
"step": 487
},
{
"epoch": 0.5093945720250522,
"grad_norm": 0.7663673162460327,
"learning_rate": 9.867447229896018e-05,
"loss": 0.3204,
"step": 488
},
{
"epoch": 0.5104384133611691,
"grad_norm": 0.7188003063201904,
"learning_rate": 9.834311766825985e-05,
"loss": 0.2645,
"step": 489
},
{
"epoch": 0.511482254697286,
"grad_norm": 0.6017361879348755,
"learning_rate": 9.801178123349298e-05,
"loss": 0.2076,
"step": 490
},
{
"epoch": 0.5125260960334029,
"grad_norm": 0.6702793836593628,
"learning_rate": 9.768046663340669e-05,
"loss": 0.2207,
"step": 491
},
{
"epoch": 0.5135699373695198,
"grad_norm": 0.6283150911331177,
"learning_rate": 9.734917750650816e-05,
"loss": 0.2246,
"step": 492
},
{
"epoch": 0.5146137787056367,
"grad_norm": 0.6348150968551636,
"learning_rate": 9.701791749102495e-05,
"loss": 0.1896,
"step": 493
},
{
"epoch": 0.5156576200417536,
"grad_norm": 0.6684585213661194,
"learning_rate": 9.668669022486494e-05,
"loss": 0.2409,
"step": 494
},
{
"epoch": 0.5167014613778705,
"grad_norm": 0.678677499294281,
"learning_rate": 9.635549934557625e-05,
"loss": 0.216,
"step": 495
},
{
"epoch": 0.5177453027139874,
"grad_norm": 0.6523580551147461,
"learning_rate": 9.602434849030745e-05,
"loss": 0.1894,
"step": 496
},
{
"epoch": 0.5187891440501043,
"grad_norm": 0.5122499465942383,
"learning_rate": 9.569324129576757e-05,
"loss": 0.1579,
"step": 497
},
{
"epoch": 0.5198329853862212,
"grad_norm": 0.5820009112358093,
"learning_rate": 9.536218139818614e-05,
"loss": 0.1766,
"step": 498
},
{
"epoch": 0.5208768267223383,
"grad_norm": 0.5032172203063965,
"learning_rate": 9.503117243327337e-05,
"loss": 0.1519,
"step": 499
},
{
"epoch": 0.5219206680584552,
"grad_norm": 0.7425169944763184,
"learning_rate": 9.47002180361799e-05,
"loss": 0.1333,
"step": 500
},
{
"epoch": 0.5229645093945721,
"grad_norm": 0.7141383290290833,
"learning_rate": 9.436932184145737e-05,
"loss": 0.4269,
"step": 501
},
{
"epoch": 0.524008350730689,
"grad_norm": 0.662886917591095,
"learning_rate": 9.403848748301802e-05,
"loss": 0.2939,
"step": 502
},
{
"epoch": 0.5250521920668059,
"grad_norm": 0.6695585250854492,
"learning_rate": 9.370771859409513e-05,
"loss": 0.3167,
"step": 503
},
{
"epoch": 0.5260960334029228,
"grad_norm": 0.7355087399482727,
"learning_rate": 9.337701880720303e-05,
"loss": 0.3414,
"step": 504
},
{
"epoch": 0.5271398747390397,
"grad_norm": 0.6738423705101013,
"learning_rate": 9.304639175409698e-05,
"loss": 0.3176,
"step": 505
},
{
"epoch": 0.5281837160751566,
"grad_norm": 0.7200036644935608,
"learning_rate": 9.271584106573364e-05,
"loss": 0.3276,
"step": 506
},
{
"epoch": 0.5292275574112735,
"grad_norm": 0.6562217473983765,
"learning_rate": 9.238537037223104e-05,
"loss": 0.3353,
"step": 507
},
{
"epoch": 0.5302713987473904,
"grad_norm": 0.6065823435783386,
"learning_rate": 9.205498330282856e-05,
"loss": 0.3062,
"step": 508
},
{
"epoch": 0.5313152400835073,
"grad_norm": 0.6631349921226501,
"learning_rate": 9.172468348584739e-05,
"loss": 0.2497,
"step": 509
},
{
"epoch": 0.5323590814196242,
"grad_norm": 0.613549530506134,
"learning_rate": 9.139447454865033e-05,
"loss": 0.2625,
"step": 510
},
{
"epoch": 0.5334029227557411,
"grad_norm": 0.6613410711288452,
"learning_rate": 9.106436011760229e-05,
"loss": 0.244,
"step": 511
},
{
"epoch": 0.534446764091858,
"grad_norm": 0.5914390087127686,
"learning_rate": 9.073434381803024e-05,
"loss": 0.2109,
"step": 512
},
{
"epoch": 0.535490605427975,
"grad_norm": 0.5324224829673767,
"learning_rate": 9.04044292741835e-05,
"loss": 0.1926,
"step": 513
},
{
"epoch": 0.5365344467640919,
"grad_norm": 0.652651846408844,
"learning_rate": 9.007462010919386e-05,
"loss": 0.2254,
"step": 514
},
{
"epoch": 0.5375782881002088,
"grad_norm": 0.6112195253372192,
"learning_rate": 8.974491994503584e-05,
"loss": 0.205,
"step": 515
},
{
"epoch": 0.5386221294363257,
"grad_norm": 0.5404685735702515,
"learning_rate": 8.941533240248699e-05,
"loss": 0.1928,
"step": 516
},
{
"epoch": 0.5396659707724426,
"grad_norm": 0.47451335191726685,
"learning_rate": 8.908586110108794e-05,
"loss": 0.1307,
"step": 517
},
{
"epoch": 0.5407098121085595,
"grad_norm": 0.8480343818664551,
"learning_rate": 8.875650965910279e-05,
"loss": 0.1533,
"step": 518
},
{
"epoch": 0.5417536534446764,
"grad_norm": 0.6759589314460754,
"learning_rate": 8.842728169347939e-05,
"loss": 0.1468,
"step": 519
},
{
"epoch": 0.5427974947807933,
"grad_norm": 0.5591132640838623,
"learning_rate": 8.809818081980953e-05,
"loss": 0.2246,
"step": 520
},
{
"epoch": 0.5438413361169102,
"grad_norm": 0.6184394955635071,
"learning_rate": 8.776921065228924e-05,
"loss": 0.2189,
"step": 521
},
{
"epoch": 0.5448851774530271,
"grad_norm": 0.5175319910049438,
"learning_rate": 8.744037480367921e-05,
"loss": 0.1886,
"step": 522
},
{
"epoch": 0.545929018789144,
"grad_norm": 0.645250678062439,
"learning_rate": 8.711167688526493e-05,
"loss": 0.2297,
"step": 523
},
{
"epoch": 0.5469728601252609,
"grad_norm": 0.6044825315475464,
"learning_rate": 8.678312050681724e-05,
"loss": 0.2029,
"step": 524
},
{
"epoch": 0.5480167014613778,
"grad_norm": 0.5178519487380981,
"learning_rate": 8.645470927655255e-05,
"loss": 0.1447,
"step": 525
},
{
"epoch": 0.5490605427974948,
"grad_norm": 0.8626076579093933,
"learning_rate": 8.612644680109319e-05,
"loss": 0.4495,
"step": 526
},
{
"epoch": 0.5501043841336117,
"grad_norm": 0.8363009691238403,
"learning_rate": 8.579833668542796e-05,
"loss": 0.3709,
"step": 527
},
{
"epoch": 0.5511482254697286,
"grad_norm": 0.872733473777771,
"learning_rate": 8.547038253287233e-05,
"loss": 0.3226,
"step": 528
},
{
"epoch": 0.5521920668058455,
"grad_norm": 0.865210235118866,
"learning_rate": 8.514258794502905e-05,
"loss": 0.3579,
"step": 529
},
{
"epoch": 0.5532359081419624,
"grad_norm": 0.7914073467254639,
"learning_rate": 8.481495652174859e-05,
"loss": 0.3216,
"step": 530
},
{
"epoch": 0.5542797494780793,
"grad_norm": 0.8779425024986267,
"learning_rate": 8.448749186108935e-05,
"loss": 0.3532,
"step": 531
},
{
"epoch": 0.5553235908141962,
"grad_norm": 0.8584082722663879,
"learning_rate": 8.416019755927851e-05,
"loss": 0.4009,
"step": 532
},
{
"epoch": 0.5563674321503131,
"grad_norm": 0.8470184206962585,
"learning_rate": 8.383307721067231e-05,
"loss": 0.4198,
"step": 533
},
{
"epoch": 0.55741127348643,
"grad_norm": 0.7338582277297974,
"learning_rate": 8.35061344077166e-05,
"loss": 0.2725,
"step": 534
},
{
"epoch": 0.558455114822547,
"grad_norm": 0.7652982473373413,
"learning_rate": 8.317937274090747e-05,
"loss": 0.3007,
"step": 535
},
{
"epoch": 0.5594989561586639,
"grad_norm": 0.7415357232093811,
"learning_rate": 8.28527957987517e-05,
"loss": 0.3274,
"step": 536
},
{
"epoch": 0.5605427974947808,
"grad_norm": 0.6662179231643677,
"learning_rate": 8.252640716772749e-05,
"loss": 0.2606,
"step": 537
},
{
"epoch": 0.5615866388308977,
"grad_norm": 0.6139498353004456,
"learning_rate": 8.2200210432245e-05,
"loss": 0.2382,
"step": 538
},
{
"epoch": 0.5626304801670147,
"grad_norm": 0.7014831900596619,
"learning_rate": 8.187420917460686e-05,
"loss": 0.2542,
"step": 539
},
{
"epoch": 0.5636743215031316,
"grad_norm": 0.7138461470603943,
"learning_rate": 8.154840697496917e-05,
"loss": 0.2392,
"step": 540
},
{
"epoch": 0.5647181628392485,
"grad_norm": 0.7507902979850769,
"learning_rate": 8.122280741130176e-05,
"loss": 0.2554,
"step": 541
},
{
"epoch": 0.5657620041753654,
"grad_norm": 0.8535422086715698,
"learning_rate": 8.089741405934922e-05,
"loss": 0.2433,
"step": 542
},
{
"epoch": 0.5668058455114823,
"grad_norm": 0.660111129283905,
"learning_rate": 8.057223049259155e-05,
"loss": 0.1921,
"step": 543
},
{
"epoch": 0.5678496868475992,
"grad_norm": 0.49220120906829834,
"learning_rate": 8.024726028220474e-05,
"loss": 0.1793,
"step": 544
},
{
"epoch": 0.5688935281837161,
"grad_norm": 0.5934033393859863,
"learning_rate": 7.992250699702182e-05,
"loss": 0.1686,
"step": 545
},
{
"epoch": 0.569937369519833,
"grad_norm": 0.6598916053771973,
"learning_rate": 7.959797420349355e-05,
"loss": 0.1981,
"step": 546
},
{
"epoch": 0.5709812108559499,
"grad_norm": 0.6087566018104553,
"learning_rate": 7.927366546564911e-05,
"loss": 0.1845,
"step": 547
},
{
"epoch": 0.5720250521920668,
"grad_norm": 0.4998890459537506,
"learning_rate": 7.894958434505725e-05,
"loss": 0.1524,
"step": 548
},
{
"epoch": 0.5730688935281837,
"grad_norm": 0.5460024476051331,
"learning_rate": 7.862573440078686e-05,
"loss": 0.1808,
"step": 549
},
{
"epoch": 0.5741127348643006,
"grad_norm": 0.7462297677993774,
"learning_rate": 7.83021191893682e-05,
"loss": 0.1723,
"step": 550
},
{
"epoch": 0.5751565762004175,
"grad_norm": 0.5173273086547852,
"learning_rate": 7.797874226475361e-05,
"loss": 0.3054,
"step": 551
},
{
"epoch": 0.5762004175365344,
"grad_norm": 0.6547046303749084,
"learning_rate": 7.765560717827858e-05,
"loss": 0.3101,
"step": 552
},
{
"epoch": 0.5772442588726514,
"grad_norm": 0.676986575126648,
"learning_rate": 7.733271747862265e-05,
"loss": 0.3376,
"step": 553
},
{
"epoch": 0.5782881002087683,
"grad_norm": 0.7368578910827637,
"learning_rate": 7.701007671177067e-05,
"loss": 0.3517,
"step": 554
},
{
"epoch": 0.5793319415448852,
"grad_norm": 0.7136873006820679,
"learning_rate": 7.668768842097353e-05,
"loss": 0.3077,
"step": 555
},
{
"epoch": 0.5803757828810021,
"grad_norm": 0.7121712565422058,
"learning_rate": 7.636555614670953e-05,
"loss": 0.3271,
"step": 556
},
{
"epoch": 0.581419624217119,
"grad_norm": 0.7123695611953735,
"learning_rate": 7.604368342664533e-05,
"loss": 0.3356,
"step": 557
},
{
"epoch": 0.5824634655532359,
"grad_norm": 0.7206712961196899,
"learning_rate": 7.572207379559721e-05,
"loss": 0.2915,
"step": 558
},
{
"epoch": 0.5835073068893528,
"grad_norm": 0.6520224809646606,
"learning_rate": 7.540073078549221e-05,
"loss": 0.2657,
"step": 559
},
{
"epoch": 0.5845511482254697,
"grad_norm": 0.5960420370101929,
"learning_rate": 7.507965792532921e-05,
"loss": 0.1952,
"step": 560
},
{
"epoch": 0.5855949895615866,
"grad_norm": 0.5834378004074097,
"learning_rate": 7.475885874114047e-05,
"loss": 0.1878,
"step": 561
},
{
"epoch": 0.5866388308977035,
"grad_norm": 0.6201406121253967,
"learning_rate": 7.443833675595255e-05,
"loss": 0.1927,
"step": 562
},
{
"epoch": 0.5876826722338204,
"grad_norm": 0.5904473662376404,
"learning_rate": 7.411809548974792e-05,
"loss": 0.1804,
"step": 563
},
{
"epoch": 0.5887265135699373,
"grad_norm": 0.5292779803276062,
"learning_rate": 7.379813845942623e-05,
"loss": 0.1379,
"step": 564
},
{
"epoch": 0.5897703549060542,
"grad_norm": 0.618929922580719,
"learning_rate": 7.347846917876544e-05,
"loss": 0.1926,
"step": 565
},
{
"epoch": 0.5908141962421712,
"grad_norm": 0.5607888698577881,
"learning_rate": 7.315909115838367e-05,
"loss": 0.1845,
"step": 566
},
{
"epoch": 0.5918580375782881,
"grad_norm": 0.56803297996521,
"learning_rate": 7.284000790570029e-05,
"loss": 0.1762,
"step": 567
},
{
"epoch": 0.592901878914405,
"grad_norm": 0.5143932700157166,
"learning_rate": 7.252122292489747e-05,
"loss": 0.1514,
"step": 568
},
{
"epoch": 0.593945720250522,
"grad_norm": 0.6080281734466553,
"learning_rate": 7.220273971688192e-05,
"loss": 0.2516,
"step": 569
},
{
"epoch": 0.5949895615866388,
"grad_norm": 0.6721866130828857,
"learning_rate": 7.188456177924605e-05,
"loss": 0.2707,
"step": 570
},
{
"epoch": 0.5960334029227558,
"grad_norm": 0.5116624236106873,
"learning_rate": 7.156669260622996e-05,
"loss": 0.2083,
"step": 571
},
{
"epoch": 0.5970772442588727,
"grad_norm": 0.5874140858650208,
"learning_rate": 7.124913568868287e-05,
"loss": 0.1598,
"step": 572
},
{
"epoch": 0.5981210855949896,
"grad_norm": 0.506820559501648,
"learning_rate": 7.093189451402469e-05,
"loss": 0.1572,
"step": 573
},
{
"epoch": 0.5991649269311065,
"grad_norm": 0.7033873796463013,
"learning_rate": 7.061497256620793e-05,
"loss": 0.2867,
"step": 574
},
{
"epoch": 0.6002087682672234,
"grad_norm": 0.7932460904121399,
"learning_rate": 7.029837332567927e-05,
"loss": 0.2998,
"step": 575
},
{
"epoch": 0.6012526096033403,
"grad_norm": 0.8427619934082031,
"learning_rate": 6.998210026934148e-05,
"loss": 0.426,
"step": 576
},
{
"epoch": 0.6022964509394572,
"grad_norm": 0.7818666100502014,
"learning_rate": 6.966615687051516e-05,
"loss": 0.3559,
"step": 577
},
{
"epoch": 0.6033402922755741,
"grad_norm": 0.821897566318512,
"learning_rate": 6.935054659890052e-05,
"loss": 0.3928,
"step": 578
},
{
"epoch": 0.6043841336116911,
"grad_norm": 0.7375624179840088,
"learning_rate": 6.903527292053942e-05,
"loss": 0.3203,
"step": 579
},
{
"epoch": 0.605427974947808,
"grad_norm": 0.7323412299156189,
"learning_rate": 6.87203392977773e-05,
"loss": 0.3014,
"step": 580
},
{
"epoch": 0.6064718162839249,
"grad_norm": 0.8238475322723389,
"learning_rate": 6.840574918922493e-05,
"loss": 0.3447,
"step": 581
},
{
"epoch": 0.6075156576200418,
"grad_norm": 0.7970190644264221,
"learning_rate": 6.809150604972079e-05,
"loss": 0.3556,
"step": 582
},
{
"epoch": 0.6085594989561587,
"grad_norm": 0.718948483467102,
"learning_rate": 6.777761333029275e-05,
"loss": 0.318,
"step": 583
},
{
"epoch": 0.6096033402922756,
"grad_norm": 0.8113385438919067,
"learning_rate": 6.746407447812049e-05,
"loss": 0.2928,
"step": 584
},
{
"epoch": 0.6106471816283925,
"grad_norm": 0.732028603553772,
"learning_rate": 6.715089293649752e-05,
"loss": 0.2688,
"step": 585
},
{
"epoch": 0.6116910229645094,
"grad_norm": 0.693304181098938,
"learning_rate": 6.683807214479323e-05,
"loss": 0.2421,
"step": 586
},
{
"epoch": 0.6127348643006263,
"grad_norm": 0.8302125334739685,
"learning_rate": 6.652561553841537e-05,
"loss": 0.2625,
"step": 587
},
{
"epoch": 0.6137787056367432,
"grad_norm": 0.6446481943130493,
"learning_rate": 6.621352654877207e-05,
"loss": 0.2261,
"step": 588
},
{
"epoch": 0.6148225469728601,
"grad_norm": 0.7838292717933655,
"learning_rate": 6.59018086032344e-05,
"loss": 0.2619,
"step": 589
},
{
"epoch": 0.615866388308977,
"grad_norm": 0.6317050457000732,
"learning_rate": 6.55904651250986e-05,
"loss": 0.1996,
"step": 590
},
{
"epoch": 0.6169102296450939,
"grad_norm": 0.6110920310020447,
"learning_rate": 6.527949953354835e-05,
"loss": 0.2078,
"step": 591
},
{
"epoch": 0.6179540709812108,
"grad_norm": 0.5425273180007935,
"learning_rate": 6.496891524361757e-05,
"loss": 0.1852,
"step": 592
},
{
"epoch": 0.6189979123173278,
"grad_norm": 0.7897228002548218,
"learning_rate": 6.465871566615263e-05,
"loss": 0.2217,
"step": 593
},
{
"epoch": 0.6200417536534447,
"grad_norm": 0.6448274254798889,
"learning_rate": 6.434890420777491e-05,
"loss": 0.1918,
"step": 594
},
{
"epoch": 0.6210855949895616,
"grad_norm": 0.690799355506897,
"learning_rate": 6.403948427084356e-05,
"loss": 0.2129,
"step": 595
},
{
"epoch": 0.6221294363256785,
"grad_norm": 0.6128472685813904,
"learning_rate": 6.373045925341794e-05,
"loss": 0.196,
"step": 596
},
{
"epoch": 0.6231732776617954,
"grad_norm": 0.4894169270992279,
"learning_rate": 6.342183254922046e-05,
"loss": 0.1341,
"step": 597
},
{
"epoch": 0.6242171189979123,
"grad_norm": 0.5475450754165649,
"learning_rate": 6.311360754759923e-05,
"loss": 0.1655,
"step": 598
},
{
"epoch": 0.6252609603340292,
"grad_norm": 0.5742066502571106,
"learning_rate": 6.280578763349078e-05,
"loss": 0.2158,
"step": 599
},
{
"epoch": 0.6263048016701461,
"grad_norm": 0.6302378177642822,
"learning_rate": 6.249837618738311e-05,
"loss": 0.1211,
"step": 600
},
{
"epoch": 0.627348643006263,
"grad_norm": 0.6498920321464539,
"learning_rate": 6.219137658527818e-05,
"loss": 0.3415,
"step": 601
},
{
"epoch": 0.6283924843423799,
"grad_norm": 0.5803071856498718,
"learning_rate": 6.188479219865529e-05,
"loss": 0.3446,
"step": 602
},
{
"epoch": 0.6294363256784968,
"grad_norm": 0.7242146134376526,
"learning_rate": 6.157862639443374e-05,
"loss": 0.3406,
"step": 603
},
{
"epoch": 0.6304801670146137,
"grad_norm": 0.741543710231781,
"learning_rate": 6.127288253493591e-05,
"loss": 0.2851,
"step": 604
},
{
"epoch": 0.6315240083507306,
"grad_norm": 0.6710807681083679,
"learning_rate": 6.09675639778504e-05,
"loss": 0.2975,
"step": 605
},
{
"epoch": 0.6325678496868476,
"grad_norm": 0.6401992440223694,
"learning_rate": 6.0662674076195194e-05,
"loss": 0.2934,
"step": 606
},
{
"epoch": 0.6336116910229646,
"grad_norm": 0.7866775393486023,
"learning_rate": 6.03582161782806e-05,
"loss": 0.3303,
"step": 607
},
{
"epoch": 0.6346555323590815,
"grad_norm": 0.6878888607025146,
"learning_rate": 6.005419362767286e-05,
"loss": 0.2885,
"step": 608
},
{
"epoch": 0.6356993736951984,
"grad_norm": 0.667226254940033,
"learning_rate": 5.975060976315703e-05,
"loss": 0.2663,
"step": 609
},
{
"epoch": 0.6367432150313153,
"grad_norm": 0.5909189581871033,
"learning_rate": 5.9447467918700614e-05,
"loss": 0.2023,
"step": 610
},
{
"epoch": 0.6377870563674322,
"grad_norm": 0.6986932158470154,
"learning_rate": 5.9144771423416826e-05,
"loss": 0.2354,
"step": 611
},
{
"epoch": 0.6388308977035491,
"grad_norm": 0.5562401413917542,
"learning_rate": 5.8842523601528e-05,
"loss": 0.1928,
"step": 612
},
{
"epoch": 0.639874739039666,
"grad_norm": 0.5309166312217712,
"learning_rate": 5.854072777232914e-05,
"loss": 0.1611,
"step": 613
},
{
"epoch": 0.6409185803757829,
"grad_norm": 0.4029114842414856,
"learning_rate": 5.823938725015148e-05,
"loss": 0.14,
"step": 614
},
{
"epoch": 0.6419624217118998,
"grad_norm": 0.4246949851512909,
"learning_rate": 5.793850534432599e-05,
"loss": 0.1194,
"step": 615
},
{
"epoch": 0.6430062630480167,
"grad_norm": 0.44221794605255127,
"learning_rate": 5.763808535914723e-05,
"loss": 0.121,
"step": 616
},
{
"epoch": 0.6440501043841336,
"grad_norm": 0.40379002690315247,
"learning_rate": 5.7338130593836755e-05,
"loss": 0.1215,
"step": 617
},
{
"epoch": 0.6450939457202505,
"grad_norm": 0.6201443076133728,
"learning_rate": 5.7038644342507205e-05,
"loss": 0.1686,
"step": 618
},
{
"epoch": 0.6461377870563675,
"grad_norm": 0.7134044766426086,
"learning_rate": 5.673962989412599e-05,
"loss": 0.3048,
"step": 619
},
{
"epoch": 0.6471816283924844,
"grad_norm": 0.5085525512695312,
"learning_rate": 5.644109053247901e-05,
"loss": 0.2137,
"step": 620
},
{
"epoch": 0.6482254697286013,
"grad_norm": 0.5813112854957581,
"learning_rate": 5.614302953613489e-05,
"loss": 0.2164,
"step": 621
},
{
"epoch": 0.6492693110647182,
"grad_norm": 0.5314549803733826,
"learning_rate": 5.584545017840885e-05,
"loss": 0.1781,
"step": 622
},
{
"epoch": 0.6503131524008351,
"grad_norm": 0.4466283619403839,
"learning_rate": 5.5548355727326574e-05,
"loss": 0.1564,
"step": 623
},
{
"epoch": 0.651356993736952,
"grad_norm": 0.7003150582313538,
"learning_rate": 5.525174944558866e-05,
"loss": 0.3604,
"step": 624
},
{
"epoch": 0.6524008350730689,
"grad_norm": 0.7904312610626221,
"learning_rate": 5.4955634590534545e-05,
"loss": 0.3483,
"step": 625
},
{
"epoch": 0.6534446764091858,
"grad_norm": 0.7673099637031555,
"learning_rate": 5.466001441410682e-05,
"loss": 0.3912,
"step": 626
},
{
"epoch": 0.6544885177453027,
"grad_norm": 0.823867678642273,
"learning_rate": 5.4364892162815436e-05,
"loss": 0.3618,
"step": 627
},
{
"epoch": 0.6555323590814196,
"grad_norm": 0.6855948567390442,
"learning_rate": 5.407027107770219e-05,
"loss": 0.2816,
"step": 628
},
{
"epoch": 0.6565762004175365,
"grad_norm": 0.7723731994628906,
"learning_rate": 5.377615439430508e-05,
"loss": 0.3292,
"step": 629
},
{
"epoch": 0.6576200417536534,
"grad_norm": 0.7081869840621948,
"learning_rate": 5.348254534262262e-05,
"loss": 0.3232,
"step": 630
},
{
"epoch": 0.6586638830897703,
"grad_norm": 0.7101826071739197,
"learning_rate": 5.318944714707861e-05,
"loss": 0.3557,
"step": 631
},
{
"epoch": 0.6597077244258872,
"grad_norm": 0.7043560147285461,
"learning_rate": 5.289686302648661e-05,
"loss": 0.3251,
"step": 632
},
{
"epoch": 0.6607515657620042,
"grad_norm": 0.8865169286727905,
"learning_rate": 5.2604796194014507e-05,
"loss": 0.3514,
"step": 633
},
{
"epoch": 0.6617954070981211,
"grad_norm": 0.8106626868247986,
"learning_rate": 5.2313249857149414e-05,
"loss": 0.3226,
"step": 634
},
{
"epoch": 0.662839248434238,
"grad_norm": 0.7511535286903381,
"learning_rate": 5.202222721766226e-05,
"loss": 0.3186,
"step": 635
},
{
"epoch": 0.6638830897703549,
"grad_norm": 0.7110910415649414,
"learning_rate": 5.1731731471572755e-05,
"loss": 0.29,
"step": 636
},
{
"epoch": 0.6649269311064718,
"grad_norm": 0.7598642110824585,
"learning_rate": 5.144176580911431e-05,
"loss": 0.2552,
"step": 637
},
{
"epoch": 0.6659707724425887,
"grad_norm": 0.5399108529090881,
"learning_rate": 5.115233341469877e-05,
"loss": 0.2105,
"step": 638
},
{
"epoch": 0.6670146137787056,
"grad_norm": 0.6335327625274658,
"learning_rate": 5.0863437466881836e-05,
"loss": 0.2272,
"step": 639
},
{
"epoch": 0.6680584551148225,
"grad_norm": 0.6470877528190613,
"learning_rate": 5.0575081138327715e-05,
"loss": 0.2329,
"step": 640
},
{
"epoch": 0.6691022964509394,
"grad_norm": 0.5501940250396729,
"learning_rate": 5.028726759577467e-05,
"loss": 0.2057,
"step": 641
},
{
"epoch": 0.6701461377870563,
"grad_norm": 0.48475509881973267,
"learning_rate": 5.000000000000002e-05,
"loss": 0.1643,
"step": 642
},
{
"epoch": 0.6711899791231732,
"grad_norm": 0.8148300051689148,
"learning_rate": 4.97132815057854e-05,
"loss": 0.2454,
"step": 643
},
{
"epoch": 0.6722338204592901,
"grad_norm": 0.47534969449043274,
"learning_rate": 4.942711526188229e-05,
"loss": 0.1713,
"step": 644
},
{
"epoch": 0.673277661795407,
"grad_norm": 0.5151733160018921,
"learning_rate": 4.914150441097736e-05,
"loss": 0.1701,
"step": 645
},
{
"epoch": 0.6743215031315241,
"grad_norm": 0.5037069916725159,
"learning_rate": 4.885645208965779e-05,
"loss": 0.1814,
"step": 646
},
{
"epoch": 0.675365344467641,
"grad_norm": 0.4882695972919464,
"learning_rate": 4.857196142837716e-05,
"loss": 0.1685,
"step": 647
},
{
"epoch": 0.6764091858037579,
"grad_norm": 0.614020586013794,
"learning_rate": 4.8288035551420697e-05,
"loss": 0.1878,
"step": 648
},
{
"epoch": 0.6774530271398748,
"grad_norm": 0.44983476400375366,
"learning_rate": 4.80046775768713e-05,
"loss": 0.1455,
"step": 649
},
{
"epoch": 0.6784968684759917,
"grad_norm": 0.7257928252220154,
"learning_rate": 4.7721890616575103e-05,
"loss": 0.1417,
"step": 650
},
{
"epoch": 0.6795407098121086,
"grad_norm": 0.5455628633499146,
"learning_rate": 4.743967777610727e-05,
"loss": 0.298,
"step": 651
},
{
"epoch": 0.6805845511482255,
"grad_norm": 0.6038815379142761,
"learning_rate": 4.715804215473809e-05,
"loss": 0.322,
"step": 652
},
{
"epoch": 0.6816283924843424,
"grad_norm": 0.6264936327934265,
"learning_rate": 4.687698684539866e-05,
"loss": 0.3185,
"step": 653
},
{
"epoch": 0.6826722338204593,
"grad_norm": 0.6083415150642395,
"learning_rate": 4.659651493464721e-05,
"loss": 0.2614,
"step": 654
},
{
"epoch": 0.6837160751565762,
"grad_norm": 0.616062343120575,
"learning_rate": 4.6316629502635025e-05,
"loss": 0.2663,
"step": 655
},
{
"epoch": 0.6847599164926931,
"grad_norm": 0.6474526524543762,
"learning_rate": 4.603733362307261e-05,
"loss": 0.3216,
"step": 656
},
{
"epoch": 0.68580375782881,
"grad_norm": 0.6374465227127075,
"learning_rate": 4.575863036319604e-05,
"loss": 0.2995,
"step": 657
},
{
"epoch": 0.6868475991649269,
"grad_norm": 0.627227246761322,
"learning_rate": 4.548052278373327e-05,
"loss": 0.2399,
"step": 658
},
{
"epoch": 0.6878914405010439,
"grad_norm": 0.6535525321960449,
"learning_rate": 4.520301393887032e-05,
"loss": 0.2309,
"step": 659
},
{
"epoch": 0.6889352818371608,
"grad_norm": 0.5703950524330139,
"learning_rate": 4.492610687621804e-05,
"loss": 0.2128,
"step": 660
},
{
"epoch": 0.6899791231732777,
"grad_norm": 0.5039522647857666,
"learning_rate": 4.4649804636778456e-05,
"loss": 0.1812,
"step": 661
},
{
"epoch": 0.6910229645093946,
"grad_norm": 0.6118776798248291,
"learning_rate": 4.4374110254911306e-05,
"loss": 0.225,
"step": 662
},
{
"epoch": 0.6920668058455115,
"grad_norm": 0.4194796085357666,
"learning_rate": 4.4099026758300944e-05,
"loss": 0.1415,
"step": 663
},
{
"epoch": 0.6931106471816284,
"grad_norm": 0.5777245163917542,
"learning_rate": 4.382455716792291e-05,
"loss": 0.2032,
"step": 664
},
{
"epoch": 0.6941544885177453,
"grad_norm": 0.41527435183525085,
"learning_rate": 4.355070449801083e-05,
"loss": 0.1205,
"step": 665
},
{
"epoch": 0.6951983298538622,
"grad_norm": 0.502178430557251,
"learning_rate": 4.32774717560232e-05,
"loss": 0.1514,
"step": 666
},
{
"epoch": 0.6962421711899791,
"grad_norm": 0.5909983515739441,
"learning_rate": 4.300486194261057e-05,
"loss": 0.176,
"step": 667
},
{
"epoch": 0.697286012526096,
"grad_norm": 0.48378539085388184,
"learning_rate": 4.273287805158245e-05,
"loss": 0.1627,
"step": 668
},
{
"epoch": 0.6983298538622129,
"grad_norm": 0.6273384094238281,
"learning_rate": 4.2461523069874346e-05,
"loss": 0.2523,
"step": 669
},
{
"epoch": 0.6993736951983298,
"grad_norm": 0.5280055403709412,
"learning_rate": 4.219079997751515e-05,
"loss": 0.2035,
"step": 670
},
{
"epoch": 0.7004175365344467,
"grad_norm": 0.539364755153656,
"learning_rate": 4.192071174759435e-05,
"loss": 0.1896,
"step": 671
},
{
"epoch": 0.7014613778705637,
"grad_norm": 0.5403777956962585,
"learning_rate": 4.165126134622926e-05,
"loss": 0.1624,
"step": 672
},
{
"epoch": 0.7025052192066806,
"grad_norm": 0.448177695274353,
"learning_rate": 4.1382451732532665e-05,
"loss": 0.1308,
"step": 673
},
{
"epoch": 0.7035490605427975,
"grad_norm": 0.6774344444274902,
"learning_rate": 4.1114285858580045e-05,
"loss": 0.3161,
"step": 674
},
{
"epoch": 0.7045929018789144,
"grad_norm": 0.7080472707748413,
"learning_rate": 4.0846766669377446e-05,
"loss": 0.3357,
"step": 675
},
{
"epoch": 0.7056367432150313,
"grad_norm": 0.6589325666427612,
"learning_rate": 4.0579897102828966e-05,
"loss": 0.2815,
"step": 676
},
{
"epoch": 0.7066805845511482,
"grad_norm": 0.7149707078933716,
"learning_rate": 4.0313680089704454e-05,
"loss": 0.3612,
"step": 677
},
{
"epoch": 0.7077244258872651,
"grad_norm": 0.6252415776252747,
"learning_rate": 4.004811855360748e-05,
"loss": 0.2726,
"step": 678
},
{
"epoch": 0.708768267223382,
"grad_norm": 0.7726844549179077,
"learning_rate": 3.9783215410943174e-05,
"loss": 0.3229,
"step": 679
},
{
"epoch": 0.7098121085594989,
"grad_norm": 0.7369757890701294,
"learning_rate": 3.951897357088602e-05,
"loss": 0.3436,
"step": 680
},
{
"epoch": 0.7108559498956158,
"grad_norm": 0.788517951965332,
"learning_rate": 3.925539593534824e-05,
"loss": 0.312,
"step": 681
},
{
"epoch": 0.7118997912317327,
"grad_norm": 0.7983633875846863,
"learning_rate": 3.899248539894757e-05,
"loss": 0.361,
"step": 682
},
{
"epoch": 0.7129436325678496,
"grad_norm": 0.7084015607833862,
"learning_rate": 3.873024484897576e-05,
"loss": 0.2836,
"step": 683
},
{
"epoch": 0.7139874739039666,
"grad_norm": 0.7584156394004822,
"learning_rate": 3.8468677165366754e-05,
"loss": 0.2955,
"step": 684
},
{
"epoch": 0.7150313152400835,
"grad_norm": 0.7465482354164124,
"learning_rate": 3.820778522066494e-05,
"loss": 0.2564,
"step": 685
},
{
"epoch": 0.7160751565762005,
"grad_norm": 0.6189156770706177,
"learning_rate": 3.794757187999386e-05,
"loss": 0.221,
"step": 686
},
{
"epoch": 0.7171189979123174,
"grad_norm": 0.6960480809211731,
"learning_rate": 3.7688040001024475e-05,
"loss": 0.2522,
"step": 687
},
{
"epoch": 0.7181628392484343,
"grad_norm": 0.572296679019928,
"learning_rate": 3.7429192433944014e-05,
"loss": 0.1997,
"step": 688
},
{
"epoch": 0.7192066805845512,
"grad_norm": 0.6478104591369629,
"learning_rate": 3.717103202142457e-05,
"loss": 0.2126,
"step": 689
},
{
"epoch": 0.7202505219206681,
"grad_norm": 0.7389695644378662,
"learning_rate": 3.691356159859177e-05,
"loss": 0.2333,
"step": 690
},
{
"epoch": 0.721294363256785,
"grad_norm": 0.5489468574523926,
"learning_rate": 3.665678399299388e-05,
"loss": 0.1701,
"step": 691
},
{
"epoch": 0.7223382045929019,
"grad_norm": 0.7258986830711365,
"learning_rate": 3.64007020245706e-05,
"loss": 0.2457,
"step": 692
},
{
"epoch": 0.7233820459290188,
"grad_norm": 0.6785321235656738,
"learning_rate": 3.614531850562203e-05,
"loss": 0.1936,
"step": 693
},
{
"epoch": 0.7244258872651357,
"grad_norm": 0.612426221370697,
"learning_rate": 3.589063624077802e-05,
"loss": 0.2403,
"step": 694
},
{
"epoch": 0.7254697286012526,
"grad_norm": 0.5947864651679993,
"learning_rate": 3.563665802696707e-05,
"loss": 0.1743,
"step": 695
},
{
"epoch": 0.7265135699373695,
"grad_norm": 0.6579543352127075,
"learning_rate": 3.538338665338589e-05,
"loss": 0.1928,
"step": 696
},
{
"epoch": 0.7275574112734864,
"grad_norm": 0.5460782647132874,
"learning_rate": 3.513082490146864e-05,
"loss": 0.1655,
"step": 697
},
{
"epoch": 0.7286012526096033,
"grad_norm": 0.7640422582626343,
"learning_rate": 3.487897554485628e-05,
"loss": 0.1659,
"step": 698
},
{
"epoch": 0.7296450939457203,
"grad_norm": 0.7361250519752502,
"learning_rate": 3.462784134936636e-05,
"loss": 0.1992,
"step": 699
},
{
"epoch": 0.7306889352818372,
"grad_norm": 1.2653623819351196,
"learning_rate": 3.4377425072962465e-05,
"loss": 0.1184,
"step": 700
},
{
"epoch": 0.7317327766179541,
"grad_norm": 0.6173591613769531,
"learning_rate": 3.412772946572389e-05,
"loss": 0.4121,
"step": 701
},
{
"epoch": 0.732776617954071,
"grad_norm": 0.5525224208831787,
"learning_rate": 3.387875726981563e-05,
"loss": 0.2601,
"step": 702
},
{
"epoch": 0.7338204592901879,
"grad_norm": 0.6985558867454529,
"learning_rate": 3.363051121945809e-05,
"loss": 0.3448,
"step": 703
},
{
"epoch": 0.7348643006263048,
"grad_norm": 0.580680251121521,
"learning_rate": 3.3382994040897196e-05,
"loss": 0.2642,
"step": 704
},
{
"epoch": 0.7359081419624217,
"grad_norm": 0.5876568555831909,
"learning_rate": 3.3136208452374254e-05,
"loss": 0.271,
"step": 705
},
{
"epoch": 0.7369519832985386,
"grad_norm": 0.6181269884109497,
"learning_rate": 3.289015716409631e-05,
"loss": 0.2522,
"step": 706
},
{
"epoch": 0.7379958246346555,
"grad_norm": 0.6312392354011536,
"learning_rate": 3.264484287820634e-05,
"loss": 0.2735,
"step": 707
},
{
"epoch": 0.7390396659707724,
"grad_norm": 0.6163091063499451,
"learning_rate": 3.2400268288753425e-05,
"loss": 0.2415,
"step": 708
},
{
"epoch": 0.7400835073068893,
"grad_norm": 0.6303150057792664,
"learning_rate": 3.2156436081663356e-05,
"loss": 0.2495,
"step": 709
},
{
"epoch": 0.7411273486430062,
"grad_norm": 0.6544148325920105,
"learning_rate": 3.191334893470907e-05,
"loss": 0.2445,
"step": 710
},
{
"epoch": 0.7421711899791231,
"grad_norm": 0.468227744102478,
"learning_rate": 3.167100951748115e-05,
"loss": 0.1481,
"step": 711
},
{
"epoch": 0.7432150313152401,
"grad_norm": 0.5340932607650757,
"learning_rate": 3.14294204913587e-05,
"loss": 0.1431,
"step": 712
},
{
"epoch": 0.744258872651357,
"grad_norm": 0.5851957201957703,
"learning_rate": 3.1188584509479866e-05,
"loss": 0.1737,
"step": 713
},
{
"epoch": 0.7453027139874739,
"grad_norm": 0.5840248465538025,
"learning_rate": 3.094850421671295e-05,
"loss": 0.1752,
"step": 714
},
{
"epoch": 0.7463465553235908,
"grad_norm": 0.5659369826316833,
"learning_rate": 3.0709182249627255e-05,
"loss": 0.1967,
"step": 715
},
{
"epoch": 0.7473903966597077,
"grad_norm": 0.457015722990036,
"learning_rate": 3.0470621236464036e-05,
"loss": 0.1544,
"step": 716
},
{
"epoch": 0.7484342379958246,
"grad_norm": 0.5182324647903442,
"learning_rate": 3.023282379710779e-05,
"loss": 0.1414,
"step": 717
},
{
"epoch": 0.7494780793319415,
"grad_norm": 0.5334721207618713,
"learning_rate": 2.9995792543057478e-05,
"loss": 0.1615,
"step": 718
},
{
"epoch": 0.7505219206680585,
"grad_norm": 0.6061464548110962,
"learning_rate": 2.9759530077397636e-05,
"loss": 0.2452,
"step": 719
},
{
"epoch": 0.7515657620041754,
"grad_norm": 0.5774762630462646,
"learning_rate": 2.9524038994770107e-05,
"loss": 0.2234,
"step": 720
},
{
"epoch": 0.7515657620041754,
"eval_loss": 0.21803000569343567,
"eval_runtime": 81.2533,
"eval_samples_per_second": 19.864,
"eval_steps_per_second": 9.932,
"step": 720
},
{
"epoch": 0.7526096033402923,
"grad_norm": 0.43510791659355164,
"learning_rate": 2.9289321881345254e-05,
"loss": 0.1719,
"step": 721
},
{
"epoch": 0.7536534446764092,
"grad_norm": 0.4775535762310028,
"learning_rate": 2.905538131479376e-05,
"loss": 0.1874,
"step": 722
},
{
"epoch": 0.7546972860125261,
"grad_norm": 0.5398250818252563,
"learning_rate": 2.8822219864258272e-05,
"loss": 0.1667,
"step": 723
},
{
"epoch": 0.755741127348643,
"grad_norm": 0.4013484716415405,
"learning_rate": 2.8589840090325027e-05,
"loss": 0.1252,
"step": 724
},
{
"epoch": 0.7567849686847599,
"grad_norm": 0.8314927816390991,
"learning_rate": 2.8358244544996038e-05,
"loss": 0.3924,
"step": 725
},
{
"epoch": 0.7578288100208769,
"grad_norm": 0.8482892513275146,
"learning_rate": 2.8127435771660747e-05,
"loss": 0.3675,
"step": 726
},
{
"epoch": 0.7588726513569938,
"grad_norm": 0.6500058770179749,
"learning_rate": 2.7897416305068323e-05,
"loss": 0.3016,
"step": 727
},
{
"epoch": 0.7599164926931107,
"grad_norm": 0.6976576447486877,
"learning_rate": 2.7668188671299755e-05,
"loss": 0.3133,
"step": 728
},
{
"epoch": 0.7609603340292276,
"grad_norm": 0.7735400795936584,
"learning_rate": 2.743975538774002e-05,
"loss": 0.3557,
"step": 729
},
{
"epoch": 0.7620041753653445,
"grad_norm": 0.7905307412147522,
"learning_rate": 2.7212118963050592e-05,
"loss": 0.3316,
"step": 730
},
{
"epoch": 0.7630480167014614,
"grad_norm": 0.7098974585533142,
"learning_rate": 2.6985281897141812e-05,
"loss": 0.2877,
"step": 731
},
{
"epoch": 0.7640918580375783,
"grad_norm": 0.6374404430389404,
"learning_rate": 2.675924668114537e-05,
"loss": 0.2589,
"step": 732
},
{
"epoch": 0.7651356993736952,
"grad_norm": 0.7561594247817993,
"learning_rate": 2.65340157973871e-05,
"loss": 0.2953,
"step": 733
},
{
"epoch": 0.7661795407098121,
"grad_norm": 0.7232580780982971,
"learning_rate": 2.630959171935956e-05,
"loss": 0.3002,
"step": 734
},
{
"epoch": 0.767223382045929,
"grad_norm": 0.7140358686447144,
"learning_rate": 2.6085976911694987e-05,
"loss": 0.2545,
"step": 735
},
{
"epoch": 0.7682672233820459,
"grad_norm": 0.749450147151947,
"learning_rate": 2.586317383013821e-05,
"loss": 0.25,
"step": 736
},
{
"epoch": 0.7693110647181628,
"grad_norm": 0.847427248954773,
"learning_rate": 2.564118492151957e-05,
"loss": 0.2887,
"step": 737
},
{
"epoch": 0.7703549060542797,
"grad_norm": 0.5772815346717834,
"learning_rate": 2.5420012623728208e-05,
"loss": 0.2101,
"step": 738
},
{
"epoch": 0.7713987473903967,
"grad_norm": 0.6374946236610413,
"learning_rate": 2.5199659365685235e-05,
"loss": 0.2193,
"step": 739
},
{
"epoch": 0.7724425887265136,
"grad_norm": 0.642776370048523,
"learning_rate": 2.4980127567316948e-05,
"loss": 0.2168,
"step": 740
},
{
"epoch": 0.7734864300626305,
"grad_norm": 0.6233210563659668,
"learning_rate": 2.4761419639528437e-05,
"loss": 0.1986,
"step": 741
},
{
"epoch": 0.7745302713987474,
"grad_norm": 0.47856077551841736,
"learning_rate": 2.4543537984176978e-05,
"loss": 0.1681,
"step": 742
},
{
"epoch": 0.7755741127348643,
"grad_norm": 0.5664119720458984,
"learning_rate": 2.4326484994045752e-05,
"loss": 0.1847,
"step": 743
},
{
"epoch": 0.7766179540709812,
"grad_norm": 0.5382654666900635,
"learning_rate": 2.4110263052817394e-05,
"loss": 0.1765,
"step": 744
},
{
"epoch": 0.7776617954070981,
"grad_norm": 0.46047693490982056,
"learning_rate": 2.3894874535048063e-05,
"loss": 0.1487,
"step": 745
},
{
"epoch": 0.778705636743215,
"grad_norm": 0.5149843096733093,
"learning_rate": 2.368032180614118e-05,
"loss": 0.1817,
"step": 746
},
{
"epoch": 0.7797494780793319,
"grad_norm": 0.4655948579311371,
"learning_rate": 2.346660722232148e-05,
"loss": 0.1548,
"step": 747
},
{
"epoch": 0.7807933194154488,
"grad_norm": 0.5313979387283325,
"learning_rate": 2.325373313060919e-05,
"loss": 0.1504,
"step": 748
},
{
"epoch": 0.7818371607515657,
"grad_norm": 0.5336787700653076,
"learning_rate": 2.3041701868794287e-05,
"loss": 0.1306,
"step": 749
},
{
"epoch": 0.7828810020876826,
"grad_norm": 0.7393002510070801,
"learning_rate": 2.2830515765410622e-05,
"loss": 0.1137,
"step": 750
},
{
"epoch": 0.7839248434237995,
"grad_norm": 0.5598475933074951,
"learning_rate": 2.262017713971063e-05,
"loss": 0.3354,
"step": 751
},
{
"epoch": 0.7849686847599165,
"grad_norm": 0.5945207476615906,
"learning_rate": 2.2410688301639616e-05,
"loss": 0.2804,
"step": 752
},
{
"epoch": 0.7860125260960334,
"grad_norm": 0.4905988574028015,
"learning_rate": 2.2202051551810565e-05,
"loss": 0.2229,
"step": 753
},
{
"epoch": 0.7870563674321504,
"grad_norm": 0.607475996017456,
"learning_rate": 2.19942691814788e-05,
"loss": 0.289,
"step": 754
},
{
"epoch": 0.7881002087682673,
"grad_norm": 0.6597141027450562,
"learning_rate": 2.178734347251673e-05,
"loss": 0.2965,
"step": 755
},
{
"epoch": 0.7891440501043842,
"grad_norm": 0.6289554238319397,
"learning_rate": 2.1581276697388975e-05,
"loss": 0.2582,
"step": 756
},
{
"epoch": 0.7901878914405011,
"grad_norm": 0.6147776246070862,
"learning_rate": 2.1376071119127338e-05,
"loss": 0.2848,
"step": 757
},
{
"epoch": 0.791231732776618,
"grad_norm": 0.5135255455970764,
"learning_rate": 2.1171728991305795e-05,
"loss": 0.2306,
"step": 758
},
{
"epoch": 0.7922755741127349,
"grad_norm": 0.5475291013717651,
"learning_rate": 2.0968252558016055e-05,
"loss": 0.2394,
"step": 759
},
{
"epoch": 0.7933194154488518,
"grad_norm": 0.6322019696235657,
"learning_rate": 2.076564405384258e-05,
"loss": 0.2066,
"step": 760
},
{
"epoch": 0.7943632567849687,
"grad_norm": 0.5838301181793213,
"learning_rate": 2.0563905703838316e-05,
"loss": 0.2321,
"step": 761
},
{
"epoch": 0.7954070981210856,
"grad_norm": 0.47901853919029236,
"learning_rate": 2.0363039723500156e-05,
"loss": 0.1445,
"step": 762
},
{
"epoch": 0.7964509394572025,
"grad_norm": 0.6136653423309326,
"learning_rate": 2.0163048318744493e-05,
"loss": 0.2168,
"step": 763
},
{
"epoch": 0.7974947807933194,
"grad_norm": 0.49966001510620117,
"learning_rate": 1.9963933685883253e-05,
"loss": 0.1414,
"step": 764
},
{
"epoch": 0.7985386221294363,
"grad_norm": 0.5253435373306274,
"learning_rate": 1.9765698011599466e-05,
"loss": 0.1513,
"step": 765
},
{
"epoch": 0.7995824634655533,
"grad_norm": 0.3400777578353882,
"learning_rate": 1.9568343472923524e-05,
"loss": 0.1112,
"step": 766
},
{
"epoch": 0.8006263048016702,
"grad_norm": 0.5851226449012756,
"learning_rate": 1.9371872237209165e-05,
"loss": 0.1619,
"step": 767
},
{
"epoch": 0.8016701461377871,
"grad_norm": 0.30553382635116577,
"learning_rate": 1.917628646210957e-05,
"loss": 0.0872,
"step": 768
},
{
"epoch": 0.802713987473904,
"grad_norm": 0.5676819682121277,
"learning_rate": 1.8981588295553853e-05,
"loss": 0.2251,
"step": 769
},
{
"epoch": 0.8037578288100209,
"grad_norm": 0.5648460984230042,
"learning_rate": 1.878777987572339e-05,
"loss": 0.2181,
"step": 770
},
{
"epoch": 0.8048016701461378,
"grad_norm": 0.607913076877594,
"learning_rate": 1.8594863331028224e-05,
"loss": 0.2216,
"step": 771
},
{
"epoch": 0.8058455114822547,
"grad_norm": 0.49821627140045166,
"learning_rate": 1.840284078008393e-05,
"loss": 0.1845,
"step": 772
},
{
"epoch": 0.8068893528183716,
"grad_norm": 0.42891865968704224,
"learning_rate": 1.821171433168809e-05,
"loss": 0.1302,
"step": 773
},
{
"epoch": 0.8079331941544885,
"grad_norm": 0.8176518678665161,
"learning_rate": 1.8021486084797368e-05,
"loss": 0.3043,
"step": 774
},
{
"epoch": 0.8089770354906054,
"grad_norm": 0.6164413690567017,
"learning_rate": 1.7832158128504328e-05,
"loss": 0.2472,
"step": 775
},
{
"epoch": 0.8100208768267223,
"grad_norm": 0.7422143220901489,
"learning_rate": 1.7643732542014434e-05,
"loss": 0.3395,
"step": 776
},
{
"epoch": 0.8110647181628392,
"grad_norm": 0.7129305601119995,
"learning_rate": 1.7456211394623378e-05,
"loss": 0.2934,
"step": 777
},
{
"epoch": 0.8121085594989561,
"grad_norm": 0.7638242840766907,
"learning_rate": 1.7269596745694295e-05,
"loss": 0.3399,
"step": 778
},
{
"epoch": 0.8131524008350731,
"grad_norm": 0.624290943145752,
"learning_rate": 1.7083890644635014e-05,
"loss": 0.2525,
"step": 779
},
{
"epoch": 0.81419624217119,
"grad_norm": 0.6761390566825867,
"learning_rate": 1.6899095130875774e-05,
"loss": 0.2753,
"step": 780
},
{
"epoch": 0.8152400835073069,
"grad_norm": 0.8017570972442627,
"learning_rate": 1.6715212233846655e-05,
"loss": 0.3149,
"step": 781
},
{
"epoch": 0.8162839248434238,
"grad_norm": 0.6984527111053467,
"learning_rate": 1.6532243972955398e-05,
"loss": 0.3103,
"step": 782
},
{
"epoch": 0.8173277661795407,
"grad_norm": 0.6139991283416748,
"learning_rate": 1.635019235756511e-05,
"loss": 0.2274,
"step": 783
},
{
"epoch": 0.8183716075156576,
"grad_norm": 0.7596770524978638,
"learning_rate": 1.616905938697234e-05,
"loss": 0.2612,
"step": 784
},
{
"epoch": 0.8194154488517745,
"grad_norm": 0.685114324092865,
"learning_rate": 1.5988847050385037e-05,
"loss": 0.2542,
"step": 785
},
{
"epoch": 0.8204592901878914,
"grad_norm": 0.6803342700004578,
"learning_rate": 1.5809557326900647e-05,
"loss": 0.2267,
"step": 786
},
{
"epoch": 0.8215031315240083,
"grad_norm": 0.6194560527801514,
"learning_rate": 1.5631192185484554e-05,
"loss": 0.2029,
"step": 787
},
{
"epoch": 0.8225469728601252,
"grad_norm": 0.8063942193984985,
"learning_rate": 1.5453753584948328e-05,
"loss": 0.2735,
"step": 788
},
{
"epoch": 0.8235908141962421,
"grad_norm": 0.6718615293502808,
"learning_rate": 1.527724347392815e-05,
"loss": 0.2278,
"step": 789
},
{
"epoch": 0.824634655532359,
"grad_norm": 0.6082893013954163,
"learning_rate": 1.5101663790863596e-05,
"loss": 0.1969,
"step": 790
},
{
"epoch": 0.8256784968684759,
"grad_norm": 0.7025957703590393,
"learning_rate": 1.4927016463976263e-05,
"loss": 0.2071,
"step": 791
},
{
"epoch": 0.826722338204593,
"grad_norm": 0.5405511260032654,
"learning_rate": 1.4753303411248475e-05,
"loss": 0.191,
"step": 792
},
{
"epoch": 0.8277661795407099,
"grad_norm": 0.6413715481758118,
"learning_rate": 1.4580526540402461e-05,
"loss": 0.185,
"step": 793
},
{
"epoch": 0.8288100208768268,
"grad_norm": 0.6367815136909485,
"learning_rate": 1.4408687748879156e-05,
"loss": 0.221,
"step": 794
},
{
"epoch": 0.8298538622129437,
"grad_norm": 0.4684351086616516,
"learning_rate": 1.4237788923817553e-05,
"loss": 0.1415,
"step": 795
},
{
"epoch": 0.8308977035490606,
"grad_norm": 0.5200543403625488,
"learning_rate": 1.4067831942033904e-05,
"loss": 0.1648,
"step": 796
},
{
"epoch": 0.8319415448851775,
"grad_norm": 0.44216129183769226,
"learning_rate": 1.3898818670001034e-05,
"loss": 0.1608,
"step": 797
},
{
"epoch": 0.8329853862212944,
"grad_norm": 0.5650377869606018,
"learning_rate": 1.3730750963828032e-05,
"loss": 0.1585,
"step": 798
},
{
"epoch": 0.8340292275574113,
"grad_norm": 0.5171220898628235,
"learning_rate": 1.3563630669239624e-05,
"loss": 0.1575,
"step": 799
},
{
"epoch": 0.8350730688935282,
"grad_norm": 0.5441738367080688,
"learning_rate": 1.339745962155613e-05,
"loss": 0.0949,
"step": 800
},
{
"epoch": 0.8361169102296451,
"grad_norm": 0.6055110096931458,
"learning_rate": 1.3232239645673217e-05,
"loss": 0.4181,
"step": 801
},
{
"epoch": 0.837160751565762,
"grad_norm": 0.6455709338188171,
"learning_rate": 1.3067972556041752e-05,
"loss": 0.3383,
"step": 802
},
{
"epoch": 0.8382045929018789,
"grad_norm": 0.5849418640136719,
"learning_rate": 1.2904660156648074e-05,
"loss": 0.2803,
"step": 803
},
{
"epoch": 0.8392484342379958,
"grad_norm": 0.538429856300354,
"learning_rate": 1.2742304240994053e-05,
"loss": 0.2402,
"step": 804
},
{
"epoch": 0.8402922755741128,
"grad_norm": 0.6266717314720154,
"learning_rate": 1.2580906592077402e-05,
"loss": 0.3009,
"step": 805
},
{
"epoch": 0.8413361169102297,
"grad_norm": 0.5892521142959595,
"learning_rate": 1.2420468982372158e-05,
"loss": 0.249,
"step": 806
},
{
"epoch": 0.8423799582463466,
"grad_norm": 0.5853463411331177,
"learning_rate": 1.226099317380912e-05,
"loss": 0.2465,
"step": 807
},
{
"epoch": 0.8434237995824635,
"grad_norm": 0.640910267829895,
"learning_rate": 1.210248091775663e-05,
"loss": 0.1872,
"step": 808
},
{
"epoch": 0.8444676409185804,
"grad_norm": 0.6560840606689453,
"learning_rate": 1.1944933955001225e-05,
"loss": 0.2542,
"step": 809
},
{
"epoch": 0.8455114822546973,
"grad_norm": 0.5866312980651855,
"learning_rate": 1.1788354015728543e-05,
"loss": 0.1911,
"step": 810
},
{
"epoch": 0.8465553235908142,
"grad_norm": 0.6191656589508057,
"learning_rate": 1.1632742819504405e-05,
"loss": 0.2215,
"step": 811
},
{
"epoch": 0.8475991649269311,
"grad_norm": 0.5251643657684326,
"learning_rate": 1.147810207525577e-05,
"loss": 0.1583,
"step": 812
},
{
"epoch": 0.848643006263048,
"grad_norm": 0.4927600622177124,
"learning_rate": 1.132443348125214e-05,
"loss": 0.1512,
"step": 813
},
{
"epoch": 0.8496868475991649,
"grad_norm": 0.4139147698879242,
"learning_rate": 1.1171738725086833e-05,
"loss": 0.1172,
"step": 814
},
{
"epoch": 0.8507306889352818,
"grad_norm": 0.5602164268493652,
"learning_rate": 1.1020019483658384e-05,
"loss": 0.1821,
"step": 815
},
{
"epoch": 0.8517745302713987,
"grad_norm": 0.4220430850982666,
"learning_rate": 1.0869277423152246e-05,
"loss": 0.1263,
"step": 816
},
{
"epoch": 0.8528183716075156,
"grad_norm": 0.38943833112716675,
"learning_rate": 1.0719514199022473e-05,
"loss": 0.1141,
"step": 817
},
{
"epoch": 0.8538622129436325,
"grad_norm": 0.5523675084114075,
"learning_rate": 1.0570731455973414e-05,
"loss": 0.1578,
"step": 818
},
{
"epoch": 0.8549060542797495,
"grad_norm": 0.6071298718452454,
"learning_rate": 1.04229308279418e-05,
"loss": 0.2323,
"step": 819
},
{
"epoch": 0.8559498956158664,
"grad_norm": 0.5393807291984558,
"learning_rate": 1.0276113938078769e-05,
"loss": 0.1967,
"step": 820
},
{
"epoch": 0.8569937369519833,
"grad_norm": 0.6537972688674927,
"learning_rate": 1.0130282398731982e-05,
"loss": 0.1953,
"step": 821
},
{
"epoch": 0.8580375782881002,
"grad_norm": 0.4417877197265625,
"learning_rate": 9.985437811427933e-06,
"loss": 0.1453,
"step": 822
},
{
"epoch": 0.8590814196242171,
"grad_norm": 0.6434723734855652,
"learning_rate": 9.841581766854401e-06,
"loss": 0.1683,
"step": 823
},
{
"epoch": 0.860125260960334,
"grad_norm": 0.7254697680473328,
"learning_rate": 9.698715844842988e-06,
"loss": 0.3499,
"step": 824
},
{
"epoch": 0.8611691022964509,
"grad_norm": 0.6909394264221191,
"learning_rate": 9.556841614351664e-06,
"loss": 0.2757,
"step": 825
},
{
"epoch": 0.8622129436325678,
"grad_norm": 0.7137805223464966,
"learning_rate": 9.415960633447674e-06,
"loss": 0.3122,
"step": 826
},
{
"epoch": 0.8632567849686847,
"grad_norm": 0.7450171709060669,
"learning_rate": 9.276074449290361e-06,
"loss": 0.2928,
"step": 827
},
{
"epoch": 0.8643006263048016,
"grad_norm": 0.6502891778945923,
"learning_rate": 9.137184598114134e-06,
"loss": 0.2517,
"step": 828
},
{
"epoch": 0.8653444676409185,
"grad_norm": 0.6984295845031738,
"learning_rate": 8.999292605211695e-06,
"loss": 0.3062,
"step": 829
},
{
"epoch": 0.8663883089770354,
"grad_norm": 0.7169867753982544,
"learning_rate": 8.862399984917213e-06,
"loss": 0.2748,
"step": 830
},
{
"epoch": 0.8674321503131524,
"grad_norm": 0.7259141802787781,
"learning_rate": 8.726508240589692e-06,
"loss": 0.3033,
"step": 831
},
{
"epoch": 0.8684759916492694,
"grad_norm": 0.7949566841125488,
"learning_rate": 8.59161886459654e-06,
"loss": 0.3149,
"step": 832
},
{
"epoch": 0.8695198329853863,
"grad_norm": 0.7562083005905151,
"learning_rate": 8.457733338297069e-06,
"loss": 0.3192,
"step": 833
},
{
"epoch": 0.8705636743215032,
"grad_norm": 0.6112555265426636,
"learning_rate": 8.3248531320263e-06,
"loss": 0.2175,
"step": 834
},
{
"epoch": 0.8716075156576201,
"grad_norm": 0.7050125598907471,
"learning_rate": 8.192979705078852e-06,
"loss": 0.243,
"step": 835
},
{
"epoch": 0.872651356993737,
"grad_norm": 0.6470485925674438,
"learning_rate": 8.062114505692742e-06,
"loss": 0.2384,
"step": 836
},
{
"epoch": 0.8736951983298539,
"grad_norm": 0.7082952260971069,
"learning_rate": 7.932258971033746e-06,
"loss": 0.2795,
"step": 837
},
{
"epoch": 0.8747390396659708,
"grad_norm": 0.843268096446991,
"learning_rate": 7.803414527179343e-06,
"loss": 0.2991,
"step": 838
},
{
"epoch": 0.8757828810020877,
"grad_norm": 0.6356431245803833,
"learning_rate": 7.675582589103247e-06,
"loss": 0.1963,
"step": 839
},
{
"epoch": 0.8768267223382046,
"grad_norm": 0.569520890712738,
"learning_rate": 7.548764560659816e-06,
"loss": 0.1703,
"step": 840
},
{
"epoch": 0.8778705636743215,
"grad_norm": 0.6984921097755432,
"learning_rate": 7.422961834568565e-06,
"loss": 0.2231,
"step": 841
},
{
"epoch": 0.8789144050104384,
"grad_norm": 0.6111634969711304,
"learning_rate": 7.2981757923989755e-06,
"loss": 0.1825,
"step": 842
},
{
"epoch": 0.8799582463465553,
"grad_norm": 0.666388213634491,
"learning_rate": 7.174407804555261e-06,
"loss": 0.1775,
"step": 843
},
{
"epoch": 0.8810020876826722,
"grad_norm": 0.7088585495948792,
"learning_rate": 7.051659230261298e-06,
"loss": 0.1873,
"step": 844
},
{
"epoch": 0.8820459290187892,
"grad_norm": 0.6127867102622986,
"learning_rate": 6.929931417545788e-06,
"loss": 0.1732,
"step": 845
},
{
"epoch": 0.8830897703549061,
"grad_norm": 0.637973964214325,
"learning_rate": 6.809225703227351e-06,
"loss": 0.1856,
"step": 846
},
{
"epoch": 0.884133611691023,
"grad_norm": 0.5888153910636902,
"learning_rate": 6.689543412899913e-06,
"loss": 0.1872,
"step": 847
},
{
"epoch": 0.8851774530271399,
"grad_norm": 0.6536146402359009,
"learning_rate": 6.57088586091813e-06,
"loss": 0.2259,
"step": 848
},
{
"epoch": 0.8862212943632568,
"grad_norm": 0.4231550693511963,
"learning_rate": 6.45325435038292e-06,
"loss": 0.1388,
"step": 849
},
{
"epoch": 0.8872651356993737,
"grad_norm": 0.8168404698371887,
"learning_rate": 6.336650173127223e-06,
"loss": 0.1743,
"step": 850
},
{
"epoch": 0.8883089770354906,
"grad_norm": 0.5078144073486328,
"learning_rate": 6.221074609701738e-06,
"loss": 0.2793,
"step": 851
},
{
"epoch": 0.8893528183716075,
"grad_norm": 0.7190085053443909,
"learning_rate": 6.106528929360911e-06,
"loss": 0.4109,
"step": 852
},
{
"epoch": 0.8903966597077244,
"grad_norm": 0.5939377546310425,
"learning_rate": 5.99301439004899e-06,
"loss": 0.2797,
"step": 853
},
{
"epoch": 0.8914405010438413,
"grad_norm": 0.7328153848648071,
"learning_rate": 5.880532238386161e-06,
"loss": 0.3652,
"step": 854
},
{
"epoch": 0.8924843423799582,
"grad_norm": 0.6011344194412231,
"learning_rate": 5.769083709654932e-06,
"loss": 0.2699,
"step": 855
},
{
"epoch": 0.8935281837160751,
"grad_norm": 0.7004411220550537,
"learning_rate": 5.658670027786561e-06,
"loss": 0.3191,
"step": 856
},
{
"epoch": 0.894572025052192,
"grad_norm": 0.5985621809959412,
"learning_rate": 5.549292405347495e-06,
"loss": 0.2593,
"step": 857
},
{
"epoch": 0.8956158663883089,
"grad_norm": 0.6498935222625732,
"learning_rate": 5.440952043526215e-06,
"loss": 0.3027,
"step": 858
},
{
"epoch": 0.8966597077244259,
"grad_norm": 0.6431671380996704,
"learning_rate": 5.3336501321199714e-06,
"loss": 0.237,
"step": 859
},
{
"epoch": 0.8977035490605428,
"grad_norm": 0.6603933572769165,
"learning_rate": 5.22738784952167e-06,
"loss": 0.2533,
"step": 860
},
{
"epoch": 0.8987473903966597,
"grad_norm": 0.5379349589347839,
"learning_rate": 5.1221663627070485e-06,
"loss": 0.1883,
"step": 861
},
{
"epoch": 0.8997912317327766,
"grad_norm": 0.4264977276325226,
"learning_rate": 5.017986827221733e-06,
"loss": 0.1174,
"step": 862
},
{
"epoch": 0.9008350730688935,
"grad_norm": 0.5029094815254211,
"learning_rate": 4.914850387168657e-06,
"loss": 0.1564,
"step": 863
},
{
"epoch": 0.9018789144050104,
"grad_norm": 0.5162425637245178,
"learning_rate": 4.812758175195397e-06,
"loss": 0.1547,
"step": 864
},
{
"epoch": 0.9029227557411273,
"grad_norm": 0.6308012008666992,
"learning_rate": 4.711711312481815e-06,
"loss": 0.1844,
"step": 865
},
{
"epoch": 0.9039665970772442,
"grad_norm": 0.46069368720054626,
"learning_rate": 4.61171090872774e-06,
"loss": 0.1363,
"step": 866
},
{
"epoch": 0.9050104384133612,
"grad_norm": 0.40711909532546997,
"learning_rate": 4.512758062140687e-06,
"loss": 0.1252,
"step": 867
},
{
"epoch": 0.906054279749478,
"grad_norm": 0.5450437664985657,
"learning_rate": 4.4148538594239174e-06,
"loss": 0.1885,
"step": 868
},
{
"epoch": 0.907098121085595,
"grad_norm": 0.5693588852882385,
"learning_rate": 4.317999375764459e-06,
"loss": 0.2161,
"step": 869
},
{
"epoch": 0.9081419624217119,
"grad_norm": 0.45915868878364563,
"learning_rate": 4.2221956748212384e-06,
"loss": 0.1642,
"step": 870
},
{
"epoch": 0.9091858037578288,
"grad_norm": 0.3877635896205902,
"learning_rate": 4.127443808713527e-06,
"loss": 0.1424,
"step": 871
},
{
"epoch": 0.9102296450939458,
"grad_norm": 0.5209342241287231,
"learning_rate": 4.033744818009244e-06,
"loss": 0.1703,
"step": 872
},
{
"epoch": 0.9112734864300627,
"grad_norm": 0.4651091396808624,
"learning_rate": 3.941099731713637e-06,
"loss": 0.1584,
"step": 873
},
{
"epoch": 0.9123173277661796,
"grad_norm": 0.8967810869216919,
"learning_rate": 3.849509567257959e-06,
"loss": 0.3558,
"step": 874
},
{
"epoch": 0.9133611691022965,
"grad_norm": 0.8381048440933228,
"learning_rate": 3.7589753304882124e-06,
"loss": 0.3886,
"step": 875
},
{
"epoch": 0.9144050104384134,
"grad_norm": 0.6149895787239075,
"learning_rate": 3.669498015654249e-06,
"loss": 0.31,
"step": 876
},
{
"epoch": 0.9154488517745303,
"grad_norm": 0.7612007856369019,
"learning_rate": 3.5810786053987023e-06,
"loss": 0.359,
"step": 877
},
{
"epoch": 0.9164926931106472,
"grad_norm": 0.6727755069732666,
"learning_rate": 3.493718070746299e-06,
"loss": 0.2748,
"step": 878
},
{
"epoch": 0.9175365344467641,
"grad_norm": 0.734786868095398,
"learning_rate": 3.40741737109318e-06,
"loss": 0.3066,
"step": 879
},
{
"epoch": 0.918580375782881,
"grad_norm": 0.6576768159866333,
"learning_rate": 3.3221774541962845e-06,
"loss": 0.2677,
"step": 880
},
{
"epoch": 0.9196242171189979,
"grad_norm": 0.6196028590202332,
"learning_rate": 3.2379992561630712e-06,
"loss": 0.2484,
"step": 881
},
{
"epoch": 0.9206680584551148,
"grad_norm": 0.7527311444282532,
"learning_rate": 3.1548837014411357e-06,
"loss": 0.2787,
"step": 882
},
{
"epoch": 0.9217118997912317,
"grad_norm": 0.7063425779342651,
"learning_rate": 3.0728317028080657e-06,
"loss": 0.303,
"step": 883
},
{
"epoch": 0.9227557411273486,
"grad_norm": 0.6437200307846069,
"learning_rate": 2.9918441613615123e-06,
"loss": 0.2501,
"step": 884
},
{
"epoch": 0.9237995824634656,
"grad_norm": 0.8236239552497864,
"learning_rate": 2.9119219665091344e-06,
"loss": 0.3234,
"step": 885
},
{
"epoch": 0.9248434237995825,
"grad_norm": 0.686543881893158,
"learning_rate": 2.8330659959589946e-06,
"loss": 0.2407,
"step": 886
},
{
"epoch": 0.9258872651356994,
"grad_norm": 0.7093439698219299,
"learning_rate": 2.755277115709842e-06,
"loss": 0.246,
"step": 887
},
{
"epoch": 0.9269311064718163,
"grad_norm": 0.7115840315818787,
"learning_rate": 2.678556180041547e-06,
"loss": 0.2551,
"step": 888
},
{
"epoch": 0.9279749478079332,
"grad_norm": 0.6106806993484497,
"learning_rate": 2.6029040315058485e-06,
"loss": 0.2137,
"step": 889
},
{
"epoch": 0.9290187891440501,
"grad_norm": 0.5461225509643555,
"learning_rate": 2.5283215009169857e-06,
"loss": 0.1847,
"step": 890
},
{
"epoch": 0.930062630480167,
"grad_norm": 0.6097748875617981,
"learning_rate": 2.4548094073426398e-06,
"loss": 0.209,
"step": 891
},
{
"epoch": 0.9311064718162839,
"grad_norm": 0.6483787298202515,
"learning_rate": 2.3823685580949273e-06,
"loss": 0.2244,
"step": 892
},
{
"epoch": 0.9321503131524008,
"grad_norm": 0.5993359088897705,
"learning_rate": 2.3109997487214983e-06,
"loss": 0.189,
"step": 893
},
{
"epoch": 0.9331941544885177,
"grad_norm": 0.8005963563919067,
"learning_rate": 2.240703762996843e-06,
"loss": 0.2385,
"step": 894
},
{
"epoch": 0.9342379958246346,
"grad_norm": 0.4756294786930084,
"learning_rate": 2.1714813729136975e-06,
"loss": 0.1531,
"step": 895
},
{
"epoch": 0.9352818371607515,
"grad_norm": 0.577684760093689,
"learning_rate": 2.1033333386744846e-06,
"loss": 0.1722,
"step": 896
},
{
"epoch": 0.9363256784968684,
"grad_norm": 0.5431109666824341,
"learning_rate": 2.036260408683033e-06,
"loss": 0.1796,
"step": 897
},
{
"epoch": 0.9373695198329853,
"grad_norm": 0.5910576581954956,
"learning_rate": 1.9702633195363917e-06,
"loss": 0.1689,
"step": 898
},
{
"epoch": 0.9384133611691023,
"grad_norm": 0.6282055377960205,
"learning_rate": 1.9053427960166182e-06,
"loss": 0.1681,
"step": 899
},
{
"epoch": 0.9394572025052192,
"grad_norm": 0.46189793944358826,
"learning_rate": 1.8414995510829368e-06,
"loss": 0.0832,
"step": 900
},
{
"epoch": 0.9405010438413361,
"grad_norm": 0.7074732780456543,
"learning_rate": 1.778734285863859e-06,
"loss": 0.3342,
"step": 901
},
{
"epoch": 0.941544885177453,
"grad_norm": 0.6299365162849426,
"learning_rate": 1.717047689649487e-06,
"loss": 0.3618,
"step": 902
},
{
"epoch": 0.94258872651357,
"grad_norm": 0.6098471879959106,
"learning_rate": 1.6564404398839439e-06,
"loss": 0.3235,
"step": 903
},
{
"epoch": 0.9436325678496869,
"grad_norm": 0.5733586549758911,
"learning_rate": 1.5969132021579347e-06,
"loss": 0.2646,
"step": 904
},
{
"epoch": 0.9446764091858038,
"grad_norm": 0.6624009609222412,
"learning_rate": 1.5384666302014406e-06,
"loss": 0.3138,
"step": 905
},
{
"epoch": 0.9457202505219207,
"grad_norm": 0.7231829762458801,
"learning_rate": 1.481101365876547e-06,
"loss": 0.3016,
"step": 906
},
{
"epoch": 0.9467640918580376,
"grad_norm": 0.634404718875885,
"learning_rate": 1.4248180391703614e-06,
"loss": 0.2247,
"step": 907
},
{
"epoch": 0.9478079331941545,
"grad_norm": 0.664415180683136,
"learning_rate": 1.3696172681881503e-06,
"loss": 0.3176,
"step": 908
},
{
"epoch": 0.9488517745302714,
"grad_norm": 0.5892297625541687,
"learning_rate": 1.3154996591464908e-06,
"loss": 0.2327,
"step": 909
},
{
"epoch": 0.9498956158663883,
"grad_norm": 0.5772663354873657,
"learning_rate": 1.2624658063666639e-06,
"loss": 0.2104,
"step": 910
},
{
"epoch": 0.9509394572025052,
"grad_norm": 0.5238860845565796,
"learning_rate": 1.2105162922680824e-06,
"loss": 0.1807,
"step": 911
},
{
"epoch": 0.9519832985386222,
"grad_norm": 0.5960121750831604,
"learning_rate": 1.15965168736194e-06,
"loss": 0.2065,
"step": 912
},
{
"epoch": 0.9530271398747391,
"grad_norm": 0.47030001878738403,
"learning_rate": 1.109872550244917e-06,
"loss": 0.1519,
"step": 913
},
{
"epoch": 0.954070981210856,
"grad_norm": 0.5340988636016846,
"learning_rate": 1.0611794275930399e-06,
"loss": 0.1603,
"step": 914
},
{
"epoch": 0.9551148225469729,
"grad_norm": 0.3915995955467224,
"learning_rate": 1.01357285415572e-06,
"loss": 0.1179,
"step": 915
},
{
"epoch": 0.9561586638830898,
"grad_norm": 0.3927573561668396,
"learning_rate": 9.670533527498137e-07,
"loss": 0.1151,
"step": 916
},
{
"epoch": 0.9572025052192067,
"grad_norm": 0.3486252725124359,
"learning_rate": 9.216214342539386e-07,
"loss": 0.0942,
"step": 917
},
{
"epoch": 0.9582463465553236,
"grad_norm": 0.3841801583766937,
"learning_rate": 8.772775976028546e-07,
"loss": 0.1185,
"step": 918
},
{
"epoch": 0.9592901878914405,
"grad_norm": 0.49043968319892883,
"learning_rate": 8.340223297819471e-07,
"loss": 0.1571,
"step": 919
},
{
"epoch": 0.9603340292275574,
"grad_norm": 0.5569294691085815,
"learning_rate": 7.918561058219198e-07,
"loss": 0.1835,
"step": 920
},
{
"epoch": 0.9613778705636743,
"grad_norm": 0.39632856845855713,
"learning_rate": 7.507793887935654e-07,
"loss": 0.1649,
"step": 921
},
{
"epoch": 0.9624217118997912,
"grad_norm": 0.46687746047973633,
"learning_rate": 7.10792629802659e-07,
"loss": 0.1609,
"step": 922
},
{
"epoch": 0.9634655532359081,
"grad_norm": 0.3641255795955658,
"learning_rate": 6.718962679850505e-07,
"loss": 0.1387,
"step": 923
},
{
"epoch": 0.964509394572025,
"grad_norm": 0.5133196115493774,
"learning_rate": 6.340907305017907e-07,
"loss": 0.1752,
"step": 924
},
{
"epoch": 0.965553235908142,
"grad_norm": 0.7964520454406738,
"learning_rate": 5.973764325344688e-07,
"loss": 0.3131,
"step": 925
},
{
"epoch": 0.9665970772442589,
"grad_norm": 0.7928893566131592,
"learning_rate": 5.617537772806602e-07,
"loss": 0.3533,
"step": 926
},
{
"epoch": 0.9676409185803758,
"grad_norm": 0.7127247452735901,
"learning_rate": 5.272231559494634e-07,
"loss": 0.3272,
"step": 927
},
{
"epoch": 0.9686847599164927,
"grad_norm": 0.6839384436607361,
"learning_rate": 4.937849477572587e-07,
"loss": 0.3335,
"step": 928
},
{
"epoch": 0.9697286012526096,
"grad_norm": 0.6431897878646851,
"learning_rate": 4.614395199234678e-07,
"loss": 0.279,
"step": 929
},
{
"epoch": 0.9707724425887265,
"grad_norm": 0.6917023658752441,
"learning_rate": 4.3018722766661193e-07,
"loss": 0.3009,
"step": 930
},
{
"epoch": 0.9718162839248434,
"grad_norm": 0.6915965676307678,
"learning_rate": 4.0002841420032634e-07,
"loss": 0.2842,
"step": 931
},
{
"epoch": 0.9728601252609603,
"grad_norm": 0.6639567017555237,
"learning_rate": 3.7096341072964113e-07,
"loss": 0.2613,
"step": 932
},
{
"epoch": 0.9739039665970772,
"grad_norm": 0.5585394501686096,
"learning_rate": 3.4299253644732855e-07,
"loss": 0.2097,
"step": 933
},
{
"epoch": 0.9749478079331941,
"grad_norm": 0.7092443704605103,
"learning_rate": 3.161160985304168e-07,
"loss": 0.2731,
"step": 934
},
{
"epoch": 0.975991649269311,
"grad_norm": 0.8628230094909668,
"learning_rate": 2.903343921367707e-07,
"loss": 0.3337,
"step": 935
},
{
"epoch": 0.9770354906054279,
"grad_norm": 0.7503064870834351,
"learning_rate": 2.6564770040190535e-07,
"loss": 0.2528,
"step": 936
},
{
"epoch": 0.9780793319415448,
"grad_norm": 0.6571477651596069,
"learning_rate": 2.420562944358329e-07,
"loss": 0.2097,
"step": 937
},
{
"epoch": 0.9791231732776617,
"grad_norm": 0.7250248193740845,
"learning_rate": 2.1956043332010955e-07,
"loss": 0.2418,
"step": 938
},
{
"epoch": 0.9801670146137788,
"grad_norm": 0.5199112892150879,
"learning_rate": 1.9816036410499338e-07,
"loss": 0.1988,
"step": 939
},
{
"epoch": 0.9812108559498957,
"grad_norm": 0.6716915369033813,
"learning_rate": 1.7785632180670198e-07,
"loss": 0.2049,
"step": 940
},
{
"epoch": 0.9822546972860126,
"grad_norm": 0.6581385731697083,
"learning_rate": 1.5864852940485898e-07,
"loss": 0.2065,
"step": 941
},
{
"epoch": 0.9832985386221295,
"grad_norm": 0.5942964553833008,
"learning_rate": 1.405371978400516e-07,
"loss": 0.188,
"step": 942
},
{
"epoch": 0.9843423799582464,
"grad_norm": 0.6144523620605469,
"learning_rate": 1.2352252601147697e-07,
"loss": 0.1848,
"step": 943
},
{
"epoch": 0.9853862212943633,
"grad_norm": 0.6265504956245422,
"learning_rate": 1.0760470077479934e-07,
"loss": 0.2032,
"step": 944
},
{
"epoch": 0.9864300626304802,
"grad_norm": 0.5609498023986816,
"learning_rate": 9.278389694006296e-08,
"loss": 0.1657,
"step": 945
},
{
"epoch": 0.9874739039665971,
"grad_norm": 0.5039533972740173,
"learning_rate": 7.906027726981568e-08,
"loss": 0.158,
"step": 946
},
{
"epoch": 0.988517745302714,
"grad_norm": 0.5719618201255798,
"learning_rate": 6.643399247725502e-08,
"loss": 0.151,
"step": 947
},
{
"epoch": 0.9895615866388309,
"grad_norm": 0.5313311815261841,
"learning_rate": 5.490518122465149e-08,
"loss": 0.1563,
"step": 948
},
{
"epoch": 0.9906054279749478,
"grad_norm": 0.48120567202568054,
"learning_rate": 4.447397012177224e-08,
"loss": 0.1376,
"step": 949
},
{
"epoch": 0.9916492693110647,
"grad_norm": 0.44713348150253296,
"learning_rate": 3.5140473724482034e-08,
"loss": 0.1198,
"step": 950
},
{
"epoch": 0.9926931106471816,
"grad_norm": 0.6056554913520813,
"learning_rate": 2.6904794533544332e-08,
"loss": 0.2989,
"step": 951
},
{
"epoch": 0.9937369519832986,
"grad_norm": 0.6082270741462708,
"learning_rate": 1.976702299344435e-08,
"loss": 0.2356,
"step": 952
},
{
"epoch": 0.9947807933194155,
"grad_norm": 0.46945539116859436,
"learning_rate": 1.3727237491412137e-08,
"loss": 0.1697,
"step": 953
},
{
"epoch": 0.9958246346555324,
"grad_norm": 0.5799762010574341,
"learning_rate": 8.785504356556563e-09,
"loss": 0.2196,
"step": 954
},
{
"epoch": 0.9968684759916493,
"grad_norm": 0.6873183846473694,
"learning_rate": 4.941877859143684e-09,
"loss": 0.2819,
"step": 955
},
{
"epoch": 0.9979123173277662,
"grad_norm": 0.6708718538284302,
"learning_rate": 2.1964002100083312e-09,
"loss": 0.2342,
"step": 956
},
{
"epoch": 0.9989561586638831,
"grad_norm": 0.6889147758483887,
"learning_rate": 5.491015600656013e-10,
"loss": 0.2048,
"step": 957
},
{
"epoch": 1.0,
"grad_norm": 0.7045766115188599,
"learning_rate": 0.0,
"loss": 0.1947,
"step": 958
}
],
"logging_steps": 1,
"max_steps": 958,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 240,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.355154213385011e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}