{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 864,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011574074074074073,
"grad_norm": 0.22213494777679443,
"learning_rate": 1e-05,
"loss": 1.5076,
"step": 1
},
{
"epoch": 0.0023148148148148147,
"grad_norm": 0.22815562784671783,
"learning_rate": 9.988425925925927e-06,
"loss": 1.4784,
"step": 2
},
{
"epoch": 0.003472222222222222,
"grad_norm": 0.22811558842658997,
"learning_rate": 9.976851851851853e-06,
"loss": 1.4667,
"step": 3
},
{
"epoch": 0.004629629629629629,
"grad_norm": 0.2362716645002365,
"learning_rate": 9.965277777777778e-06,
"loss": 1.5482,
"step": 4
},
{
"epoch": 0.005787037037037037,
"grad_norm": 0.2334546148777008,
"learning_rate": 9.953703703703704e-06,
"loss": 1.5051,
"step": 5
},
{
"epoch": 0.006944444444444444,
"grad_norm": 0.224368616938591,
"learning_rate": 9.942129629629629e-06,
"loss": 1.4836,
"step": 6
},
{
"epoch": 0.008101851851851851,
"grad_norm": 0.24275268614292145,
"learning_rate": 9.930555555555557e-06,
"loss": 1.5993,
"step": 7
},
{
"epoch": 0.009259259259259259,
"grad_norm": 0.24044257402420044,
"learning_rate": 9.918981481481482e-06,
"loss": 1.4207,
"step": 8
},
{
"epoch": 0.010416666666666666,
"grad_norm": 0.2276177853345871,
"learning_rate": 9.907407407407408e-06,
"loss": 1.479,
"step": 9
},
{
"epoch": 0.011574074074074073,
"grad_norm": 0.23464104533195496,
"learning_rate": 9.895833333333334e-06,
"loss": 1.5192,
"step": 10
},
{
"epoch": 0.01273148148148148,
"grad_norm": 0.2245517075061798,
"learning_rate": 9.88425925925926e-06,
"loss": 1.4462,
"step": 11
},
{
"epoch": 0.013888888888888888,
"grad_norm": 0.4287811517715454,
"learning_rate": 9.872685185185185e-06,
"loss": 1.5524,
"step": 12
},
{
"epoch": 0.015046296296296295,
"grad_norm": 0.23612529039382935,
"learning_rate": 9.861111111111112e-06,
"loss": 1.4822,
"step": 13
},
{
"epoch": 0.016203703703703703,
"grad_norm": 0.19577904045581818,
"learning_rate": 9.849537037037038e-06,
"loss": 1.3985,
"step": 14
},
{
"epoch": 0.017361111111111112,
"grad_norm": 0.19019098579883575,
"learning_rate": 9.837962962962964e-06,
"loss": 1.3753,
"step": 15
},
{
"epoch": 0.018518518518518517,
"grad_norm": 0.21135130524635315,
"learning_rate": 9.826388888888889e-06,
"loss": 1.5148,
"step": 16
},
{
"epoch": 0.019675925925925927,
"grad_norm": 0.19131916761398315,
"learning_rate": 9.814814814814815e-06,
"loss": 1.5473,
"step": 17
},
{
"epoch": 0.020833333333333332,
"grad_norm": 0.1860252171754837,
"learning_rate": 9.803240740740742e-06,
"loss": 1.5053,
"step": 18
},
{
"epoch": 0.02199074074074074,
"grad_norm": 0.17170019447803497,
"learning_rate": 9.791666666666666e-06,
"loss": 1.5025,
"step": 19
},
{
"epoch": 0.023148148148148147,
"grad_norm": 0.16492699086666107,
"learning_rate": 9.780092592592594e-06,
"loss": 1.3922,
"step": 20
},
{
"epoch": 0.024305555555555556,
"grad_norm": 0.15029262006282806,
"learning_rate": 9.768518518518519e-06,
"loss": 1.44,
"step": 21
},
{
"epoch": 0.02546296296296296,
"grad_norm": 0.13995124399662018,
"learning_rate": 9.756944444444445e-06,
"loss": 1.4524,
"step": 22
},
{
"epoch": 0.02662037037037037,
"grad_norm": 0.14475427567958832,
"learning_rate": 9.745370370370372e-06,
"loss": 1.4576,
"step": 23
},
{
"epoch": 0.027777777777777776,
"grad_norm": 0.15014007687568665,
"learning_rate": 9.733796296296298e-06,
"loss": 1.4862,
"step": 24
},
{
"epoch": 0.028935185185185185,
"grad_norm": 0.13555286824703217,
"learning_rate": 9.722222222222223e-06,
"loss": 1.4582,
"step": 25
},
{
"epoch": 0.03009259259259259,
"grad_norm": 0.15213678777217865,
"learning_rate": 9.710648148148149e-06,
"loss": 1.4877,
"step": 26
},
{
"epoch": 0.03125,
"grad_norm": 0.13051781058311462,
"learning_rate": 9.699074074074075e-06,
"loss": 1.3628,
"step": 27
},
{
"epoch": 0.032407407407407406,
"grad_norm": 0.15080274641513824,
"learning_rate": 9.6875e-06,
"loss": 1.4834,
"step": 28
},
{
"epoch": 0.03356481481481482,
"grad_norm": 0.1410171389579773,
"learning_rate": 9.675925925925926e-06,
"loss": 1.5207,
"step": 29
},
{
"epoch": 0.034722222222222224,
"grad_norm": 0.14558039605617523,
"learning_rate": 9.664351851851853e-06,
"loss": 1.4201,
"step": 30
},
{
"epoch": 0.03587962962962963,
"grad_norm": 0.1254415363073349,
"learning_rate": 9.652777777777779e-06,
"loss": 1.3993,
"step": 31
},
{
"epoch": 0.037037037037037035,
"grad_norm": 0.14481398463249207,
"learning_rate": 9.641203703703704e-06,
"loss": 1.3278,
"step": 32
},
{
"epoch": 0.03819444444444445,
"grad_norm": 0.1267780363559723,
"learning_rate": 9.62962962962963e-06,
"loss": 1.3886,
"step": 33
},
{
"epoch": 0.03935185185185185,
"grad_norm": 0.14567571878433228,
"learning_rate": 9.618055555555556e-06,
"loss": 1.3612,
"step": 34
},
{
"epoch": 0.04050925925925926,
"grad_norm": 0.14164520800113678,
"learning_rate": 9.606481481481483e-06,
"loss": 1.339,
"step": 35
},
{
"epoch": 0.041666666666666664,
"grad_norm": 0.12852604687213898,
"learning_rate": 9.594907407407407e-06,
"loss": 1.4468,
"step": 36
},
{
"epoch": 0.04282407407407408,
"grad_norm": 0.14108137786388397,
"learning_rate": 9.583333333333335e-06,
"loss": 1.3848,
"step": 37
},
{
"epoch": 0.04398148148148148,
"grad_norm": 0.1143556609749794,
"learning_rate": 9.57175925925926e-06,
"loss": 1.344,
"step": 38
},
{
"epoch": 0.04513888888888889,
"grad_norm": 0.11572553217411041,
"learning_rate": 9.560185185185186e-06,
"loss": 1.331,
"step": 39
},
{
"epoch": 0.046296296296296294,
"grad_norm": 0.1106305718421936,
"learning_rate": 9.548611111111113e-06,
"loss": 1.4159,
"step": 40
},
{
"epoch": 0.047453703703703706,
"grad_norm": 0.11018543690443039,
"learning_rate": 9.537037037037037e-06,
"loss": 1.3817,
"step": 41
},
{
"epoch": 0.04861111111111111,
"grad_norm": 0.11123362183570862,
"learning_rate": 9.525462962962964e-06,
"loss": 1.3859,
"step": 42
},
{
"epoch": 0.04976851851851852,
"grad_norm": 0.11585158854722977,
"learning_rate": 9.51388888888889e-06,
"loss": 1.4172,
"step": 43
},
{
"epoch": 0.05092592592592592,
"grad_norm": 0.11234026402235031,
"learning_rate": 9.502314814814816e-06,
"loss": 1.4225,
"step": 44
},
{
"epoch": 0.052083333333333336,
"grad_norm": 0.17347489297389984,
"learning_rate": 9.490740740740741e-06,
"loss": 1.3333,
"step": 45
},
{
"epoch": 0.05324074074074074,
"grad_norm": 0.10686583071947098,
"learning_rate": 9.479166666666667e-06,
"loss": 1.3721,
"step": 46
},
{
"epoch": 0.05439814814814815,
"grad_norm": 0.11265669763088226,
"learning_rate": 9.467592592592594e-06,
"loss": 1.4224,
"step": 47
},
{
"epoch": 0.05555555555555555,
"grad_norm": 0.10087595134973526,
"learning_rate": 9.45601851851852e-06,
"loss": 1.3365,
"step": 48
},
{
"epoch": 0.056712962962962965,
"grad_norm": 0.10566269606351852,
"learning_rate": 9.444444444444445e-06,
"loss": 1.366,
"step": 49
},
{
"epoch": 0.05787037037037037,
"grad_norm": 0.11439696699380875,
"learning_rate": 9.432870370370371e-06,
"loss": 1.4139,
"step": 50
},
{
"epoch": 0.059027777777777776,
"grad_norm": 0.10534631460905075,
"learning_rate": 9.421296296296297e-06,
"loss": 1.3832,
"step": 51
},
{
"epoch": 0.06018518518518518,
"grad_norm": 0.09864930808544159,
"learning_rate": 9.409722222222224e-06,
"loss": 1.3692,
"step": 52
},
{
"epoch": 0.061342592592592594,
"grad_norm": 0.10979383438825607,
"learning_rate": 9.398148148148148e-06,
"loss": 1.4028,
"step": 53
},
{
"epoch": 0.0625,
"grad_norm": 0.11381463706493378,
"learning_rate": 9.386574074074075e-06,
"loss": 1.4229,
"step": 54
},
{
"epoch": 0.06365740740740741,
"grad_norm": 0.10466167330741882,
"learning_rate": 9.375000000000001e-06,
"loss": 1.3767,
"step": 55
},
{
"epoch": 0.06481481481481481,
"grad_norm": 0.10089318454265594,
"learning_rate": 9.363425925925927e-06,
"loss": 1.3755,
"step": 56
},
{
"epoch": 0.06597222222222222,
"grad_norm": 0.1125788539648056,
"learning_rate": 9.351851851851854e-06,
"loss": 1.3654,
"step": 57
},
{
"epoch": 0.06712962962962964,
"grad_norm": 0.10320808738470078,
"learning_rate": 9.340277777777778e-06,
"loss": 1.3788,
"step": 58
},
{
"epoch": 0.06828703703703703,
"grad_norm": 0.10044930130243301,
"learning_rate": 9.328703703703705e-06,
"loss": 1.3912,
"step": 59
},
{
"epoch": 0.06944444444444445,
"grad_norm": 0.10364623367786407,
"learning_rate": 9.31712962962963e-06,
"loss": 1.2838,
"step": 60
},
{
"epoch": 0.07060185185185185,
"grad_norm": 0.14089667797088623,
"learning_rate": 9.305555555555557e-06,
"loss": 1.3711,
"step": 61
},
{
"epoch": 0.07175925925925926,
"grad_norm": 0.10308589786291122,
"learning_rate": 9.293981481481482e-06,
"loss": 1.2889,
"step": 62
},
{
"epoch": 0.07291666666666667,
"grad_norm": 0.1022154688835144,
"learning_rate": 9.282407407407408e-06,
"loss": 1.3387,
"step": 63
},
{
"epoch": 0.07407407407407407,
"grad_norm": 0.09682654589414597,
"learning_rate": 9.270833333333334e-06,
"loss": 1.2755,
"step": 64
},
{
"epoch": 0.07523148148148148,
"grad_norm": 0.10220059752464294,
"learning_rate": 9.25925925925926e-06,
"loss": 1.2953,
"step": 65
},
{
"epoch": 0.0763888888888889,
"grad_norm": 0.09950669854879379,
"learning_rate": 9.247685185185185e-06,
"loss": 1.3319,
"step": 66
},
{
"epoch": 0.0775462962962963,
"grad_norm": 0.09631174057722092,
"learning_rate": 9.236111111111112e-06,
"loss": 1.3037,
"step": 67
},
{
"epoch": 0.0787037037037037,
"grad_norm": 0.10525327175855637,
"learning_rate": 9.224537037037038e-06,
"loss": 1.3669,
"step": 68
},
{
"epoch": 0.0798611111111111,
"grad_norm": 0.1023765280842781,
"learning_rate": 9.212962962962963e-06,
"loss": 1.3172,
"step": 69
},
{
"epoch": 0.08101851851851852,
"grad_norm": 0.09824973344802856,
"learning_rate": 9.201388888888889e-06,
"loss": 1.2471,
"step": 70
},
{
"epoch": 0.08217592592592593,
"grad_norm": 0.09957920759916306,
"learning_rate": 9.189814814814815e-06,
"loss": 1.2974,
"step": 71
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.10006606578826904,
"learning_rate": 9.178240740740742e-06,
"loss": 1.2478,
"step": 72
},
{
"epoch": 0.08449074074074074,
"grad_norm": 0.09730537980794907,
"learning_rate": 9.166666666666666e-06,
"loss": 1.2874,
"step": 73
},
{
"epoch": 0.08564814814814815,
"grad_norm": 0.10841472446918488,
"learning_rate": 9.155092592592594e-06,
"loss": 1.3142,
"step": 74
},
{
"epoch": 0.08680555555555555,
"grad_norm": 0.10033254325389862,
"learning_rate": 9.143518518518519e-06,
"loss": 1.3031,
"step": 75
},
{
"epoch": 0.08796296296296297,
"grad_norm": 0.10729570686817169,
"learning_rate": 9.131944444444445e-06,
"loss": 1.318,
"step": 76
},
{
"epoch": 0.08912037037037036,
"grad_norm": 0.10036814212799072,
"learning_rate": 9.120370370370372e-06,
"loss": 1.315,
"step": 77
},
{
"epoch": 0.09027777777777778,
"grad_norm": 0.09904678165912628,
"learning_rate": 9.108796296296296e-06,
"loss": 1.3105,
"step": 78
},
{
"epoch": 0.09143518518518519,
"grad_norm": 0.10250908136367798,
"learning_rate": 9.097222222222223e-06,
"loss": 1.3388,
"step": 79
},
{
"epoch": 0.09259259259259259,
"grad_norm": 0.10013310611248016,
"learning_rate": 9.085648148148149e-06,
"loss": 1.3081,
"step": 80
},
{
"epoch": 0.09375,
"grad_norm": 0.12040674686431885,
"learning_rate": 9.074074074074075e-06,
"loss": 1.29,
"step": 81
},
{
"epoch": 0.09490740740740741,
"grad_norm": 0.10173387080430984,
"learning_rate": 9.0625e-06,
"loss": 1.3216,
"step": 82
},
{
"epoch": 0.09606481481481481,
"grad_norm": 0.0995723232626915,
"learning_rate": 9.050925925925926e-06,
"loss": 1.2348,
"step": 83
},
{
"epoch": 0.09722222222222222,
"grad_norm": 0.09998712688684464,
"learning_rate": 9.039351851851853e-06,
"loss": 1.2941,
"step": 84
},
{
"epoch": 0.09837962962962964,
"grad_norm": 0.10181009769439697,
"learning_rate": 9.027777777777779e-06,
"loss": 1.3435,
"step": 85
},
{
"epoch": 0.09953703703703703,
"grad_norm": 0.10247822105884552,
"learning_rate": 9.016203703703704e-06,
"loss": 1.2227,
"step": 86
},
{
"epoch": 0.10069444444444445,
"grad_norm": 0.09980504959821701,
"learning_rate": 9.00462962962963e-06,
"loss": 1.2352,
"step": 87
},
{
"epoch": 0.10185185185185185,
"grad_norm": 0.09725635498762131,
"learning_rate": 8.993055555555556e-06,
"loss": 1.2665,
"step": 88
},
{
"epoch": 0.10300925925925926,
"grad_norm": 0.10134412348270416,
"learning_rate": 8.981481481481483e-06,
"loss": 1.2586,
"step": 89
},
{
"epoch": 0.10416666666666667,
"grad_norm": 0.10719292610883713,
"learning_rate": 8.969907407407407e-06,
"loss": 1.3019,
"step": 90
},
{
"epoch": 0.10532407407407407,
"grad_norm": 0.10486488789319992,
"learning_rate": 8.958333333333334e-06,
"loss": 1.2936,
"step": 91
},
{
"epoch": 0.10648148148148148,
"grad_norm": 0.10944393277168274,
"learning_rate": 8.94675925925926e-06,
"loss": 1.3363,
"step": 92
},
{
"epoch": 0.1076388888888889,
"grad_norm": 0.11079176515340805,
"learning_rate": 8.935185185185186e-06,
"loss": 1.2367,
"step": 93
},
{
"epoch": 0.1087962962962963,
"grad_norm": 0.10356653481721878,
"learning_rate": 8.923611111111113e-06,
"loss": 1.2405,
"step": 94
},
{
"epoch": 0.1099537037037037,
"grad_norm": 0.10176271945238113,
"learning_rate": 8.912037037037037e-06,
"loss": 1.2165,
"step": 95
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.10132142156362534,
"learning_rate": 8.900462962962964e-06,
"loss": 1.2244,
"step": 96
},
{
"epoch": 0.11226851851851852,
"grad_norm": 0.09974969923496246,
"learning_rate": 8.888888888888888e-06,
"loss": 1.2174,
"step": 97
},
{
"epoch": 0.11342592592592593,
"grad_norm": 0.10894669592380524,
"learning_rate": 8.877314814814816e-06,
"loss": 1.2788,
"step": 98
},
{
"epoch": 0.11458333333333333,
"grad_norm": 0.10840534418821335,
"learning_rate": 8.865740740740741e-06,
"loss": 1.3159,
"step": 99
},
{
"epoch": 0.11574074074074074,
"grad_norm": 0.10695167630910873,
"learning_rate": 8.854166666666667e-06,
"loss": 1.1849,
"step": 100
},
{
"epoch": 0.11689814814814815,
"grad_norm": 0.10731031000614166,
"learning_rate": 8.842592592592594e-06,
"loss": 1.2752,
"step": 101
},
{
"epoch": 0.11805555555555555,
"grad_norm": 0.10931055247783661,
"learning_rate": 8.83101851851852e-06,
"loss": 1.3104,
"step": 102
},
{
"epoch": 0.11921296296296297,
"grad_norm": 0.09977404028177261,
"learning_rate": 8.819444444444445e-06,
"loss": 1.2294,
"step": 103
},
{
"epoch": 0.12037037037037036,
"grad_norm": 0.10199018567800522,
"learning_rate": 8.807870370370371e-06,
"loss": 1.26,
"step": 104
},
{
"epoch": 0.12152777777777778,
"grad_norm": 0.10436324030160904,
"learning_rate": 8.796296296296297e-06,
"loss": 1.1939,
"step": 105
},
{
"epoch": 0.12268518518518519,
"grad_norm": 0.10257507115602493,
"learning_rate": 8.784722222222224e-06,
"loss": 1.2018,
"step": 106
},
{
"epoch": 0.12384259259259259,
"grad_norm": 0.10942145437002182,
"learning_rate": 8.773148148148148e-06,
"loss": 1.2486,
"step": 107
},
{
"epoch": 0.125,
"grad_norm": 0.1106390580534935,
"learning_rate": 8.761574074074075e-06,
"loss": 1.2423,
"step": 108
},
{
"epoch": 0.1261574074074074,
"grad_norm": 0.10852088034152985,
"learning_rate": 8.750000000000001e-06,
"loss": 1.2164,
"step": 109
},
{
"epoch": 0.12731481481481483,
"grad_norm": 0.10899867117404938,
"learning_rate": 8.738425925925926e-06,
"loss": 1.1733,
"step": 110
},
{
"epoch": 0.1284722222222222,
"grad_norm": 0.1090475544333458,
"learning_rate": 8.726851851851854e-06,
"loss": 1.211,
"step": 111
},
{
"epoch": 0.12962962962962962,
"grad_norm": 0.10560119152069092,
"learning_rate": 8.715277777777778e-06,
"loss": 1.1627,
"step": 112
},
{
"epoch": 0.13078703703703703,
"grad_norm": 0.140657439827919,
"learning_rate": 8.703703703703705e-06,
"loss": 1.2528,
"step": 113
},
{
"epoch": 0.13194444444444445,
"grad_norm": 0.10568102449178696,
"learning_rate": 8.69212962962963e-06,
"loss": 1.1707,
"step": 114
},
{
"epoch": 0.13310185185185186,
"grad_norm": 0.10824406147003174,
"learning_rate": 8.680555555555557e-06,
"loss": 1.236,
"step": 115
},
{
"epoch": 0.13425925925925927,
"grad_norm": 0.13147617876529694,
"learning_rate": 8.668981481481482e-06,
"loss": 1.2215,
"step": 116
},
{
"epoch": 0.13541666666666666,
"grad_norm": 0.10489261895418167,
"learning_rate": 8.657407407407408e-06,
"loss": 1.1983,
"step": 117
},
{
"epoch": 0.13657407407407407,
"grad_norm": 0.10651307553052902,
"learning_rate": 8.645833333333335e-06,
"loss": 1.134,
"step": 118
},
{
"epoch": 0.13773148148148148,
"grad_norm": 0.10825668275356293,
"learning_rate": 8.63425925925926e-06,
"loss": 1.2174,
"step": 119
},
{
"epoch": 0.1388888888888889,
"grad_norm": 0.11069732904434204,
"learning_rate": 8.622685185185186e-06,
"loss": 1.1513,
"step": 120
},
{
"epoch": 0.1400462962962963,
"grad_norm": 0.10880957543849945,
"learning_rate": 8.611111111111112e-06,
"loss": 1.1804,
"step": 121
},
{
"epoch": 0.1412037037037037,
"grad_norm": 0.1930810511112213,
"learning_rate": 8.599537037037038e-06,
"loss": 1.258,
"step": 122
},
{
"epoch": 0.1423611111111111,
"grad_norm": 0.11332959681749344,
"learning_rate": 8.587962962962963e-06,
"loss": 1.1904,
"step": 123
},
{
"epoch": 0.14351851851851852,
"grad_norm": 0.10661832243204117,
"learning_rate": 8.57638888888889e-06,
"loss": 1.1768,
"step": 124
},
{
"epoch": 0.14467592592592593,
"grad_norm": 0.11842379719018936,
"learning_rate": 8.564814814814816e-06,
"loss": 1.262,
"step": 125
},
{
"epoch": 0.14583333333333334,
"grad_norm": 0.11844724416732788,
"learning_rate": 8.553240740740742e-06,
"loss": 1.2975,
"step": 126
},
{
"epoch": 0.14699074074074073,
"grad_norm": 0.11894673854112625,
"learning_rate": 8.541666666666666e-06,
"loss": 1.2188,
"step": 127
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.12049190700054169,
"learning_rate": 8.530092592592595e-06,
"loss": 1.2252,
"step": 128
},
{
"epoch": 0.14930555555555555,
"grad_norm": 0.1142427921295166,
"learning_rate": 8.518518518518519e-06,
"loss": 1.242,
"step": 129
},
{
"epoch": 0.15046296296296297,
"grad_norm": 0.1175389289855957,
"learning_rate": 8.506944444444445e-06,
"loss": 1.17,
"step": 130
},
{
"epoch": 0.15162037037037038,
"grad_norm": 0.11764886230230331,
"learning_rate": 8.495370370370372e-06,
"loss": 1.2249,
"step": 131
},
{
"epoch": 0.1527777777777778,
"grad_norm": 0.11646956950426102,
"learning_rate": 8.483796296296296e-06,
"loss": 1.2169,
"step": 132
},
{
"epoch": 0.15393518518518517,
"grad_norm": 0.11439011991024017,
"learning_rate": 8.472222222222223e-06,
"loss": 1.218,
"step": 133
},
{
"epoch": 0.1550925925925926,
"grad_norm": 0.12252136319875717,
"learning_rate": 8.460648148148149e-06,
"loss": 1.2023,
"step": 134
},
{
"epoch": 0.15625,
"grad_norm": 0.12436626106500626,
"learning_rate": 8.449074074074075e-06,
"loss": 1.2033,
"step": 135
},
{
"epoch": 0.1574074074074074,
"grad_norm": 0.11901386827230453,
"learning_rate": 8.4375e-06,
"loss": 1.272,
"step": 136
},
{
"epoch": 0.15856481481481483,
"grad_norm": 0.11729884147644043,
"learning_rate": 8.425925925925926e-06,
"loss": 1.2505,
"step": 137
},
{
"epoch": 0.1597222222222222,
"grad_norm": 0.13269205391407013,
"learning_rate": 8.414351851851853e-06,
"loss": 1.1483,
"step": 138
},
{
"epoch": 0.16087962962962962,
"grad_norm": 0.12206093221902847,
"learning_rate": 8.402777777777779e-06,
"loss": 1.1865,
"step": 139
},
{
"epoch": 0.16203703703703703,
"grad_norm": 0.11812015622854233,
"learning_rate": 8.391203703703704e-06,
"loss": 1.199,
"step": 140
},
{
"epoch": 0.16319444444444445,
"grad_norm": 0.11516984552145004,
"learning_rate": 8.37962962962963e-06,
"loss": 1.1904,
"step": 141
},
{
"epoch": 0.16435185185185186,
"grad_norm": 0.1210319846868515,
"learning_rate": 8.368055555555556e-06,
"loss": 1.178,
"step": 142
},
{
"epoch": 0.16550925925925927,
"grad_norm": 0.11981932818889618,
"learning_rate": 8.356481481481483e-06,
"loss": 1.1645,
"step": 143
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.1515599936246872,
"learning_rate": 8.344907407407407e-06,
"loss": 1.1536,
"step": 144
},
{
"epoch": 0.16782407407407407,
"grad_norm": 0.12906567752361298,
"learning_rate": 8.333333333333334e-06,
"loss": 1.2263,
"step": 145
},
{
"epoch": 0.16898148148148148,
"grad_norm": 0.13303443789482117,
"learning_rate": 8.32175925925926e-06,
"loss": 1.1575,
"step": 146
},
{
"epoch": 0.1701388888888889,
"grad_norm": 0.13370473682880402,
"learning_rate": 8.310185185185186e-06,
"loss": 1.1965,
"step": 147
},
{
"epoch": 0.1712962962962963,
"grad_norm": 0.13174037635326385,
"learning_rate": 8.298611111111113e-06,
"loss": 1.2192,
"step": 148
},
{
"epoch": 0.1724537037037037,
"grad_norm": 0.12153156101703644,
"learning_rate": 8.287037037037037e-06,
"loss": 1.2325,
"step": 149
},
{
"epoch": 0.1736111111111111,
"grad_norm": 0.12656794488430023,
"learning_rate": 8.275462962962964e-06,
"loss": 1.1419,
"step": 150
},
{
"epoch": 0.17476851851851852,
"grad_norm": 0.12459167093038559,
"learning_rate": 8.263888888888888e-06,
"loss": 1.1894,
"step": 151
},
{
"epoch": 0.17592592592592593,
"grad_norm": 0.1209847703576088,
"learning_rate": 8.252314814814816e-06,
"loss": 1.1009,
"step": 152
},
{
"epoch": 0.17708333333333334,
"grad_norm": 0.122232586145401,
"learning_rate": 8.240740740740741e-06,
"loss": 1.1768,
"step": 153
},
{
"epoch": 0.17824074074074073,
"grad_norm": 0.12607316672801971,
"learning_rate": 8.229166666666667e-06,
"loss": 1.1269,
"step": 154
},
{
"epoch": 0.17939814814814814,
"grad_norm": 0.12453708052635193,
"learning_rate": 8.217592592592594e-06,
"loss": 1.1205,
"step": 155
},
{
"epoch": 0.18055555555555555,
"grad_norm": 0.12600407004356384,
"learning_rate": 8.20601851851852e-06,
"loss": 1.1924,
"step": 156
},
{
"epoch": 0.18171296296296297,
"grad_norm": 0.13688451051712036,
"learning_rate": 8.194444444444445e-06,
"loss": 1.2399,
"step": 157
},
{
"epoch": 0.18287037037037038,
"grad_norm": 0.12164533883333206,
"learning_rate": 8.182870370370371e-06,
"loss": 1.119,
"step": 158
},
{
"epoch": 0.1840277777777778,
"grad_norm": 0.12914681434631348,
"learning_rate": 8.171296296296297e-06,
"loss": 1.2067,
"step": 159
},
{
"epoch": 0.18518518518518517,
"grad_norm": 0.13052506744861603,
"learning_rate": 8.159722222222222e-06,
"loss": 1.1148,
"step": 160
},
{
"epoch": 0.1863425925925926,
"grad_norm": 0.14109309017658234,
"learning_rate": 8.148148148148148e-06,
"loss": 1.1888,
"step": 161
},
{
"epoch": 0.1875,
"grad_norm": 0.1215931847691536,
"learning_rate": 8.136574074074075e-06,
"loss": 1.1696,
"step": 162
},
{
"epoch": 0.1886574074074074,
"grad_norm": 0.12615671753883362,
"learning_rate": 8.125000000000001e-06,
"loss": 1.0878,
"step": 163
},
{
"epoch": 0.18981481481481483,
"grad_norm": 0.12854883074760437,
"learning_rate": 8.113425925925926e-06,
"loss": 1.1693,
"step": 164
},
{
"epoch": 0.1909722222222222,
"grad_norm": 0.13595585525035858,
"learning_rate": 8.101851851851854e-06,
"loss": 1.2612,
"step": 165
},
{
"epoch": 0.19212962962962962,
"grad_norm": 0.125200554728508,
"learning_rate": 8.090277777777778e-06,
"loss": 1.1334,
"step": 166
},
{
"epoch": 0.19328703703703703,
"grad_norm": 0.12699203193187714,
"learning_rate": 8.078703703703705e-06,
"loss": 1.1895,
"step": 167
},
{
"epoch": 0.19444444444444445,
"grad_norm": 0.14522430300712585,
"learning_rate": 8.06712962962963e-06,
"loss": 1.2751,
"step": 168
},
{
"epoch": 0.19560185185185186,
"grad_norm": 0.12672430276870728,
"learning_rate": 8.055555555555557e-06,
"loss": 1.1263,
"step": 169
},
{
"epoch": 0.19675925925925927,
"grad_norm": 0.1354917585849762,
"learning_rate": 8.043981481481482e-06,
"loss": 1.1853,
"step": 170
},
{
"epoch": 0.19791666666666666,
"grad_norm": 0.12905584275722504,
"learning_rate": 8.032407407407408e-06,
"loss": 1.1671,
"step": 171
},
{
"epoch": 0.19907407407407407,
"grad_norm": 0.14876537024974823,
"learning_rate": 8.020833333333335e-06,
"loss": 1.192,
"step": 172
},
{
"epoch": 0.20023148148148148,
"grad_norm": 0.12572503089904785,
"learning_rate": 8.00925925925926e-06,
"loss": 1.1425,
"step": 173
},
{
"epoch": 0.2013888888888889,
"grad_norm": 0.1319337785243988,
"learning_rate": 7.997685185185186e-06,
"loss": 1.1048,
"step": 174
},
{
"epoch": 0.2025462962962963,
"grad_norm": 0.1327625960111618,
"learning_rate": 7.986111111111112e-06,
"loss": 1.1644,
"step": 175
},
{
"epoch": 0.2037037037037037,
"grad_norm": 0.15492317080497742,
"learning_rate": 7.974537037037038e-06,
"loss": 1.1716,
"step": 176
},
{
"epoch": 0.2048611111111111,
"grad_norm": 0.13174159824848175,
"learning_rate": 7.962962962962963e-06,
"loss": 1.17,
"step": 177
},
{
"epoch": 0.20601851851851852,
"grad_norm": 0.13209496438503265,
"learning_rate": 7.95138888888889e-06,
"loss": 1.1483,
"step": 178
},
{
"epoch": 0.20717592592592593,
"grad_norm": 0.13201004266738892,
"learning_rate": 7.939814814814816e-06,
"loss": 1.1383,
"step": 179
},
{
"epoch": 0.20833333333333334,
"grad_norm": 0.13409128785133362,
"learning_rate": 7.928240740740742e-06,
"loss": 1.2027,
"step": 180
},
{
"epoch": 0.20949074074074073,
"grad_norm": 0.12518879771232605,
"learning_rate": 7.916666666666667e-06,
"loss": 1.124,
"step": 181
},
{
"epoch": 0.21064814814814814,
"grad_norm": 0.13146327435970306,
"learning_rate": 7.905092592592593e-06,
"loss": 1.1499,
"step": 182
},
{
"epoch": 0.21180555555555555,
"grad_norm": 0.13177363574504852,
"learning_rate": 7.89351851851852e-06,
"loss": 1.1327,
"step": 183
},
{
"epoch": 0.21296296296296297,
"grad_norm": 0.1373285949230194,
"learning_rate": 7.881944444444446e-06,
"loss": 1.1803,
"step": 184
},
{
"epoch": 0.21412037037037038,
"grad_norm": 0.13236674666404724,
"learning_rate": 7.870370370370372e-06,
"loss": 1.1941,
"step": 185
},
{
"epoch": 0.2152777777777778,
"grad_norm": 0.14942052960395813,
"learning_rate": 7.858796296296297e-06,
"loss": 1.0679,
"step": 186
},
{
"epoch": 0.21643518518518517,
"grad_norm": 0.1368960291147232,
"learning_rate": 7.847222222222223e-06,
"loss": 1.2047,
"step": 187
},
{
"epoch": 0.2175925925925926,
"grad_norm": 0.13313372433185577,
"learning_rate": 7.835648148148147e-06,
"loss": 1.1604,
"step": 188
},
{
"epoch": 0.21875,
"grad_norm": 0.13543230295181274,
"learning_rate": 7.824074074074076e-06,
"loss": 1.1457,
"step": 189
},
{
"epoch": 0.2199074074074074,
"grad_norm": 0.13306613266468048,
"learning_rate": 7.8125e-06,
"loss": 1.1072,
"step": 190
},
{
"epoch": 0.22106481481481483,
"grad_norm": 0.1399572342634201,
"learning_rate": 7.800925925925926e-06,
"loss": 1.1941,
"step": 191
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.13952195644378662,
"learning_rate": 7.789351851851853e-06,
"loss": 1.1165,
"step": 192
},
{
"epoch": 0.22337962962962962,
"grad_norm": 0.1346345692873001,
"learning_rate": 7.77777777777778e-06,
"loss": 1.1592,
"step": 193
},
{
"epoch": 0.22453703703703703,
"grad_norm": 0.14998078346252441,
"learning_rate": 7.766203703703704e-06,
"loss": 1.0881,
"step": 194
},
{
"epoch": 0.22569444444444445,
"grad_norm": 0.1336279958486557,
"learning_rate": 7.75462962962963e-06,
"loss": 1.1719,
"step": 195
},
{
"epoch": 0.22685185185185186,
"grad_norm": 0.1357223093509674,
"learning_rate": 7.743055555555556e-06,
"loss": 1.1395,
"step": 196
},
{
"epoch": 0.22800925925925927,
"grad_norm": 0.13101308047771454,
"learning_rate": 7.731481481481483e-06,
"loss": 1.1486,
"step": 197
},
{
"epoch": 0.22916666666666666,
"grad_norm": 0.13381895422935486,
"learning_rate": 7.719907407407407e-06,
"loss": 1.1791,
"step": 198
},
{
"epoch": 0.23032407407407407,
"grad_norm": 0.14860618114471436,
"learning_rate": 7.708333333333334e-06,
"loss": 1.2094,
"step": 199
},
{
"epoch": 0.23148148148148148,
"grad_norm": 0.14094901084899902,
"learning_rate": 7.69675925925926e-06,
"loss": 1.1111,
"step": 200
},
{
"epoch": 0.2326388888888889,
"grad_norm": 0.14977441728115082,
"learning_rate": 7.685185185185185e-06,
"loss": 1.1361,
"step": 201
},
{
"epoch": 0.2337962962962963,
"grad_norm": 0.14465676248073578,
"learning_rate": 7.673611111111113e-06,
"loss": 1.1081,
"step": 202
},
{
"epoch": 0.2349537037037037,
"grad_norm": 0.13928347826004028,
"learning_rate": 7.662037037037037e-06,
"loss": 1.1567,
"step": 203
},
{
"epoch": 0.2361111111111111,
"grad_norm": 0.13289427757263184,
"learning_rate": 7.650462962962964e-06,
"loss": 1.0925,
"step": 204
},
{
"epoch": 0.23726851851851852,
"grad_norm": 0.13851501047611237,
"learning_rate": 7.638888888888888e-06,
"loss": 1.1707,
"step": 205
},
{
"epoch": 0.23842592592592593,
"grad_norm": 0.1386367678642273,
"learning_rate": 7.627314814814816e-06,
"loss": 1.1997,
"step": 206
},
{
"epoch": 0.23958333333333334,
"grad_norm": 0.140579491853714,
"learning_rate": 7.615740740740741e-06,
"loss": 1.0898,
"step": 207
},
{
"epoch": 0.24074074074074073,
"grad_norm": 0.14284437894821167,
"learning_rate": 7.6041666666666666e-06,
"loss": 1.1325,
"step": 208
},
{
"epoch": 0.24189814814814814,
"grad_norm": 0.1468944400548935,
"learning_rate": 7.592592592592594e-06,
"loss": 1.1007,
"step": 209
},
{
"epoch": 0.24305555555555555,
"grad_norm": 0.14520986378192902,
"learning_rate": 7.581018518518519e-06,
"loss": 1.0402,
"step": 210
},
{
"epoch": 0.24421296296296297,
"grad_norm": 0.1370912790298462,
"learning_rate": 7.569444444444445e-06,
"loss": 1.1189,
"step": 211
},
{
"epoch": 0.24537037037037038,
"grad_norm": 0.13841593265533447,
"learning_rate": 7.557870370370372e-06,
"loss": 1.1722,
"step": 212
},
{
"epoch": 0.2465277777777778,
"grad_norm": 0.13426385819911957,
"learning_rate": 7.546296296296297e-06,
"loss": 1.077,
"step": 213
},
{
"epoch": 0.24768518518518517,
"grad_norm": 0.14030449092388153,
"learning_rate": 7.534722222222223e-06,
"loss": 1.1423,
"step": 214
},
{
"epoch": 0.2488425925925926,
"grad_norm": 0.13870425522327423,
"learning_rate": 7.523148148148148e-06,
"loss": 1.1222,
"step": 215
},
{
"epoch": 0.25,
"grad_norm": 0.1389496624469757,
"learning_rate": 7.511574074074075e-06,
"loss": 1.1281,
"step": 216
},
{
"epoch": 0.2511574074074074,
"grad_norm": 0.13513287901878357,
"learning_rate": 7.500000000000001e-06,
"loss": 1.0742,
"step": 217
},
{
"epoch": 0.2523148148148148,
"grad_norm": 0.12963634729385376,
"learning_rate": 7.4884259259259265e-06,
"loss": 1.1002,
"step": 218
},
{
"epoch": 0.2534722222222222,
"grad_norm": 0.14197136461734772,
"learning_rate": 7.476851851851853e-06,
"loss": 1.2041,
"step": 219
},
{
"epoch": 0.25462962962962965,
"grad_norm": 0.14973148703575134,
"learning_rate": 7.465277777777778e-06,
"loss": 1.1425,
"step": 220
},
{
"epoch": 0.25578703703703703,
"grad_norm": 0.15027864277362823,
"learning_rate": 7.453703703703704e-06,
"loss": 1.1197,
"step": 221
},
{
"epoch": 0.2569444444444444,
"grad_norm": 0.14004850387573242,
"learning_rate": 7.442129629629629e-06,
"loss": 1.0865,
"step": 222
},
{
"epoch": 0.25810185185185186,
"grad_norm": 0.14548063278198242,
"learning_rate": 7.4305555555555565e-06,
"loss": 1.1288,
"step": 223
},
{
"epoch": 0.25925925925925924,
"grad_norm": 0.15253789722919464,
"learning_rate": 7.418981481481482e-06,
"loss": 1.1755,
"step": 224
},
{
"epoch": 0.2604166666666667,
"grad_norm": 0.17614801228046417,
"learning_rate": 7.4074074074074075e-06,
"loss": 1.1337,
"step": 225
},
{
"epoch": 0.26157407407407407,
"grad_norm": 0.14582227170467377,
"learning_rate": 7.395833333333335e-06,
"loss": 1.127,
"step": 226
},
{
"epoch": 0.26273148148148145,
"grad_norm": 0.15011462569236755,
"learning_rate": 7.38425925925926e-06,
"loss": 1.1503,
"step": 227
},
{
"epoch": 0.2638888888888889,
"grad_norm": 0.14777828752994537,
"learning_rate": 7.372685185185186e-06,
"loss": 1.1979,
"step": 228
},
{
"epoch": 0.2650462962962963,
"grad_norm": 0.14377012848854065,
"learning_rate": 7.361111111111112e-06,
"loss": 1.159,
"step": 229
},
{
"epoch": 0.2662037037037037,
"grad_norm": 0.14066831767559052,
"learning_rate": 7.3495370370370375e-06,
"loss": 1.083,
"step": 230
},
{
"epoch": 0.2673611111111111,
"grad_norm": 0.14046640694141388,
"learning_rate": 7.337962962962964e-06,
"loss": 1.0942,
"step": 231
},
{
"epoch": 0.26851851851851855,
"grad_norm": 0.14818687736988068,
"learning_rate": 7.326388888888889e-06,
"loss": 1.1081,
"step": 232
},
{
"epoch": 0.26967592592592593,
"grad_norm": 0.14957569539546967,
"learning_rate": 7.314814814814816e-06,
"loss": 1.1607,
"step": 233
},
{
"epoch": 0.2708333333333333,
"grad_norm": 0.14794419705867767,
"learning_rate": 7.303240740740741e-06,
"loss": 1.1061,
"step": 234
},
{
"epoch": 0.27199074074074076,
"grad_norm": 0.14263461530208588,
"learning_rate": 7.291666666666667e-06,
"loss": 1.0914,
"step": 235
},
{
"epoch": 0.27314814814814814,
"grad_norm": 0.1496988832950592,
"learning_rate": 7.280092592592594e-06,
"loss": 1.1051,
"step": 236
},
{
"epoch": 0.2743055555555556,
"grad_norm": 0.17018254101276398,
"learning_rate": 7.268518518518519e-06,
"loss": 1.1102,
"step": 237
},
{
"epoch": 0.27546296296296297,
"grad_norm": 0.13826943933963776,
"learning_rate": 7.256944444444445e-06,
"loss": 1.1153,
"step": 238
},
{
"epoch": 0.27662037037037035,
"grad_norm": 0.15133726596832275,
"learning_rate": 7.245370370370371e-06,
"loss": 1.1704,
"step": 239
},
{
"epoch": 0.2777777777777778,
"grad_norm": 0.1380927413702011,
"learning_rate": 7.233796296296297e-06,
"loss": 1.0637,
"step": 240
},
{
"epoch": 0.2789351851851852,
"grad_norm": 0.17274874448776245,
"learning_rate": 7.222222222222223e-06,
"loss": 1.0886,
"step": 241
},
{
"epoch": 0.2800925925925926,
"grad_norm": 0.1451474279165268,
"learning_rate": 7.210648148148148e-06,
"loss": 1.1426,
"step": 242
},
{
"epoch": 0.28125,
"grad_norm": 0.15506966412067413,
"learning_rate": 7.199074074074075e-06,
"loss": 1.1539,
"step": 243
},
{
"epoch": 0.2824074074074074,
"grad_norm": 0.14218544960021973,
"learning_rate": 7.1875e-06,
"loss": 1.1328,
"step": 244
},
{
"epoch": 0.2835648148148148,
"grad_norm": 0.14230510592460632,
"learning_rate": 7.1759259259259266e-06,
"loss": 1.1231,
"step": 245
},
{
"epoch": 0.2847222222222222,
"grad_norm": 0.15011471509933472,
"learning_rate": 7.164351851851853e-06,
"loss": 1.1324,
"step": 246
},
{
"epoch": 0.28587962962962965,
"grad_norm": 0.17919589579105377,
"learning_rate": 7.152777777777778e-06,
"loss": 1.174,
"step": 247
},
{
"epoch": 0.28703703703703703,
"grad_norm": 0.22477856278419495,
"learning_rate": 7.141203703703704e-06,
"loss": 1.1293,
"step": 248
},
{
"epoch": 0.2881944444444444,
"grad_norm": 0.15485665202140808,
"learning_rate": 7.129629629629629e-06,
"loss": 1.0675,
"step": 249
},
{
"epoch": 0.28935185185185186,
"grad_norm": 0.16213078796863556,
"learning_rate": 7.1180555555555565e-06,
"loss": 1.1293,
"step": 250
},
{
"epoch": 0.29050925925925924,
"grad_norm": 0.15522325038909912,
"learning_rate": 7.106481481481482e-06,
"loss": 1.1387,
"step": 251
},
{
"epoch": 0.2916666666666667,
"grad_norm": 0.16681715846061707,
"learning_rate": 7.0949074074074075e-06,
"loss": 1.0447,
"step": 252
},
{
"epoch": 0.29282407407407407,
"grad_norm": 0.15302035212516785,
"learning_rate": 7.083333333333335e-06,
"loss": 1.1575,
"step": 253
},
{
"epoch": 0.29398148148148145,
"grad_norm": 0.1592303216457367,
"learning_rate": 7.07175925925926e-06,
"loss": 1.1439,
"step": 254
},
{
"epoch": 0.2951388888888889,
"grad_norm": 0.14766466617584229,
"learning_rate": 7.060185185185186e-06,
"loss": 1.1439,
"step": 255
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.14591576159000397,
"learning_rate": 7.048611111111112e-06,
"loss": 1.0393,
"step": 256
},
{
"epoch": 0.2974537037037037,
"grad_norm": 0.1551518440246582,
"learning_rate": 7.0370370370370375e-06,
"loss": 1.0774,
"step": 257
},
{
"epoch": 0.2986111111111111,
"grad_norm": 0.15124709904193878,
"learning_rate": 7.025462962962963e-06,
"loss": 1.1139,
"step": 258
},
{
"epoch": 0.29976851851851855,
"grad_norm": 0.15364578366279602,
"learning_rate": 7.013888888888889e-06,
"loss": 1.091,
"step": 259
},
{
"epoch": 0.30092592592592593,
"grad_norm": 0.15513962507247925,
"learning_rate": 7.002314814814816e-06,
"loss": 1.1245,
"step": 260
},
{
"epoch": 0.3020833333333333,
"grad_norm": 0.15260472893714905,
"learning_rate": 6.990740740740741e-06,
"loss": 1.0679,
"step": 261
},
{
"epoch": 0.30324074074074076,
"grad_norm": 0.15631963312625885,
"learning_rate": 6.979166666666667e-06,
"loss": 1.0904,
"step": 262
},
{
"epoch": 0.30439814814814814,
"grad_norm": 0.16324004530906677,
"learning_rate": 6.967592592592594e-06,
"loss": 1.122,
"step": 263
},
{
"epoch": 0.3055555555555556,
"grad_norm": 0.1569405198097229,
"learning_rate": 6.956018518518519e-06,
"loss": 1.1128,
"step": 264
},
{
"epoch": 0.30671296296296297,
"grad_norm": 0.14927881956100464,
"learning_rate": 6.944444444444445e-06,
"loss": 1.1037,
"step": 265
},
{
"epoch": 0.30787037037037035,
"grad_norm": 0.16223756968975067,
"learning_rate": 6.932870370370371e-06,
"loss": 1.1296,
"step": 266
},
{
"epoch": 0.3090277777777778,
"grad_norm": 0.15246330201625824,
"learning_rate": 6.9212962962962974e-06,
"loss": 1.121,
"step": 267
},
{
"epoch": 0.3101851851851852,
"grad_norm": 0.15738119184970856,
"learning_rate": 6.909722222222223e-06,
"loss": 1.021,
"step": 268
},
{
"epoch": 0.3113425925925926,
"grad_norm": 0.14696645736694336,
"learning_rate": 6.898148148148148e-06,
"loss": 1.1129,
"step": 269
},
{
"epoch": 0.3125,
"grad_norm": 0.17972029745578766,
"learning_rate": 6.886574074074075e-06,
"loss": 1.1459,
"step": 270
},
{
"epoch": 0.3136574074074074,
"grad_norm": 0.15355713665485382,
"learning_rate": 6.875e-06,
"loss": 1.0327,
"step": 271
},
{
"epoch": 0.3148148148148148,
"grad_norm": 0.14485421776771545,
"learning_rate": 6.863425925925927e-06,
"loss": 1.0728,
"step": 272
},
{
"epoch": 0.3159722222222222,
"grad_norm": 0.14966461062431335,
"learning_rate": 6.851851851851853e-06,
"loss": 1.11,
"step": 273
},
{
"epoch": 0.31712962962962965,
"grad_norm": 0.15380822122097015,
"learning_rate": 6.840277777777778e-06,
"loss": 1.0322,
"step": 274
},
{
"epoch": 0.31828703703703703,
"grad_norm": 0.15833279490470886,
"learning_rate": 6.828703703703704e-06,
"loss": 1.108,
"step": 275
},
{
"epoch": 0.3194444444444444,
"grad_norm": 0.15459351241588593,
"learning_rate": 6.817129629629629e-06,
"loss": 1.0904,
"step": 276
},
{
"epoch": 0.32060185185185186,
"grad_norm": 0.1778038591146469,
"learning_rate": 6.8055555555555566e-06,
"loss": 1.1272,
"step": 277
},
{
"epoch": 0.32175925925925924,
"grad_norm": 0.16406135261058807,
"learning_rate": 6.793981481481482e-06,
"loss": 1.05,
"step": 278
},
{
"epoch": 0.3229166666666667,
"grad_norm": 0.18400990962982178,
"learning_rate": 6.7824074074074075e-06,
"loss": 1.0778,
"step": 279
},
{
"epoch": 0.32407407407407407,
"grad_norm": 0.157948300242424,
"learning_rate": 6.770833333333334e-06,
"loss": 1.0552,
"step": 280
},
{
"epoch": 0.32523148148148145,
"grad_norm": 0.14986403286457062,
"learning_rate": 6.75925925925926e-06,
"loss": 1.0509,
"step": 281
},
{
"epoch": 0.3263888888888889,
"grad_norm": 0.1901959925889969,
"learning_rate": 6.747685185185186e-06,
"loss": 1.0861,
"step": 282
},
{
"epoch": 0.3275462962962963,
"grad_norm": 0.15678079426288605,
"learning_rate": 6.736111111111112e-06,
"loss": 1.0942,
"step": 283
},
{
"epoch": 0.3287037037037037,
"grad_norm": 0.16509470343589783,
"learning_rate": 6.7245370370370375e-06,
"loss": 1.0301,
"step": 284
},
{
"epoch": 0.3298611111111111,
"grad_norm": 0.16847574710845947,
"learning_rate": 6.712962962962963e-06,
"loss": 1.1624,
"step": 285
},
{
"epoch": 0.33101851851851855,
"grad_norm": 0.17270340025424957,
"learning_rate": 6.701388888888889e-06,
"loss": 1.0494,
"step": 286
},
{
"epoch": 0.33217592592592593,
"grad_norm": 0.16031427681446075,
"learning_rate": 6.689814814814816e-06,
"loss": 1.1025,
"step": 287
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.15844310820102692,
"learning_rate": 6.678240740740741e-06,
"loss": 1.1172,
"step": 288
},
{
"epoch": 0.33449074074074076,
"grad_norm": 0.18013358116149902,
"learning_rate": 6.666666666666667e-06,
"loss": 1.143,
"step": 289
},
{
"epoch": 0.33564814814814814,
"grad_norm": 0.15711136162281036,
"learning_rate": 6.655092592592594e-06,
"loss": 1.0963,
"step": 290
},
{
"epoch": 0.3368055555555556,
"grad_norm": 0.1526433676481247,
"learning_rate": 6.643518518518519e-06,
"loss": 1.1131,
"step": 291
},
{
"epoch": 0.33796296296296297,
"grad_norm": 0.1604623794555664,
"learning_rate": 6.631944444444445e-06,
"loss": 1.091,
"step": 292
},
{
"epoch": 0.33912037037037035,
"grad_norm": 0.1599881798028946,
"learning_rate": 6.620370370370371e-06,
"loss": 1.1546,
"step": 293
},
{
"epoch": 0.3402777777777778,
"grad_norm": 0.1568020135164261,
"learning_rate": 6.608796296296297e-06,
"loss": 1.0707,
"step": 294
},
{
"epoch": 0.3414351851851852,
"grad_norm": 0.16622503101825714,
"learning_rate": 6.597222222222223e-06,
"loss": 1.0423,
"step": 295
},
{
"epoch": 0.3425925925925926,
"grad_norm": 0.1662890464067459,
"learning_rate": 6.5856481481481484e-06,
"loss": 1.0784,
"step": 296
},
{
"epoch": 0.34375,
"grad_norm": 0.24495013058185577,
"learning_rate": 6.574074074074075e-06,
"loss": 1.0781,
"step": 297
},
{
"epoch": 0.3449074074074074,
"grad_norm": 0.15532761812210083,
"learning_rate": 6.5625e-06,
"loss": 1.0132,
"step": 298
},
{
"epoch": 0.3460648148148148,
"grad_norm": 0.1543862670660019,
"learning_rate": 6.550925925925926e-06,
"loss": 1.1089,
"step": 299
},
{
"epoch": 0.3472222222222222,
"grad_norm": 0.1753946989774704,
"learning_rate": 6.539351851851853e-06,
"loss": 1.1471,
"step": 300
},
{
"epoch": 0.34837962962962965,
"grad_norm": 0.15991893410682678,
"learning_rate": 6.5277777777777784e-06,
"loss": 1.109,
"step": 301
},
{
"epoch": 0.34953703703703703,
"grad_norm": 0.15706345438957214,
"learning_rate": 6.516203703703704e-06,
"loss": 1.0623,
"step": 302
},
{
"epoch": 0.3506944444444444,
"grad_norm": 0.16499797999858856,
"learning_rate": 6.504629629629629e-06,
"loss": 1.0779,
"step": 303
},
{
"epoch": 0.35185185185185186,
"grad_norm": 0.1579602062702179,
"learning_rate": 6.493055555555557e-06,
"loss": 1.0858,
"step": 304
},
{
"epoch": 0.35300925925925924,
"grad_norm": 0.16340512037277222,
"learning_rate": 6.481481481481482e-06,
"loss": 1.1147,
"step": 305
},
{
"epoch": 0.3541666666666667,
"grad_norm": 0.16150407493114471,
"learning_rate": 6.4699074074074076e-06,
"loss": 1.0838,
"step": 306
},
{
"epoch": 0.35532407407407407,
"grad_norm": 0.1647614687681198,
"learning_rate": 6.458333333333334e-06,
"loss": 1.1434,
"step": 307
},
{
"epoch": 0.35648148148148145,
"grad_norm": 0.1532718688249588,
"learning_rate": 6.44675925925926e-06,
"loss": 1.0938,
"step": 308
},
{
"epoch": 0.3576388888888889,
"grad_norm": 0.15940245985984802,
"learning_rate": 6.435185185185186e-06,
"loss": 1.0954,
"step": 309
},
{
"epoch": 0.3587962962962963,
"grad_norm": 0.1621445119380951,
"learning_rate": 6.423611111111112e-06,
"loss": 1.1109,
"step": 310
},
{
"epoch": 0.3599537037037037,
"grad_norm": 0.15937702357769012,
"learning_rate": 6.4120370370370375e-06,
"loss": 1.0635,
"step": 311
},
{
"epoch": 0.3611111111111111,
"grad_norm": 0.16414445638656616,
"learning_rate": 6.400462962962963e-06,
"loss": 1.0817,
"step": 312
},
{
"epoch": 0.36226851851851855,
"grad_norm": 0.16262070834636688,
"learning_rate": 6.3888888888888885e-06,
"loss": 1.1235,
"step": 313
},
{
"epoch": 0.36342592592592593,
"grad_norm": 0.23703633248806,
"learning_rate": 6.377314814814816e-06,
"loss": 1.0968,
"step": 314
},
{
"epoch": 0.3645833333333333,
"grad_norm": 0.1544935554265976,
"learning_rate": 6.365740740740741e-06,
"loss": 1.0329,
"step": 315
},
{
"epoch": 0.36574074074074076,
"grad_norm": 0.1603870689868927,
"learning_rate": 6.354166666666667e-06,
"loss": 1.1552,
"step": 316
},
{
"epoch": 0.36689814814814814,
"grad_norm": 0.16536477208137512,
"learning_rate": 6.342592592592594e-06,
"loss": 1.0654,
"step": 317
},
{
"epoch": 0.3680555555555556,
"grad_norm": 0.153824120759964,
"learning_rate": 6.331018518518519e-06,
"loss": 1.025,
"step": 318
},
{
"epoch": 0.36921296296296297,
"grad_norm": 0.17179211974143982,
"learning_rate": 6.319444444444445e-06,
"loss": 1.1973,
"step": 319
},
{
"epoch": 0.37037037037037035,
"grad_norm": 0.16214361786842346,
"learning_rate": 6.307870370370371e-06,
"loss": 1.0792,
"step": 320
},
{
"epoch": 0.3715277777777778,
"grad_norm": 0.17154665291309357,
"learning_rate": 6.296296296296297e-06,
"loss": 1.1232,
"step": 321
},
{
"epoch": 0.3726851851851852,
"grad_norm": 0.16794797778129578,
"learning_rate": 6.284722222222223e-06,
"loss": 1.0727,
"step": 322
},
{
"epoch": 0.3738425925925926,
"grad_norm": 0.16869844496250153,
"learning_rate": 6.2731481481481485e-06,
"loss": 1.0605,
"step": 323
},
{
"epoch": 0.375,
"grad_norm": 0.1624298244714737,
"learning_rate": 6.261574074074075e-06,
"loss": 1.1402,
"step": 324
},
{
"epoch": 0.3761574074074074,
"grad_norm": 0.16999289393424988,
"learning_rate": 6.25e-06,
"loss": 1.0476,
"step": 325
},
{
"epoch": 0.3773148148148148,
"grad_norm": 0.18032115697860718,
"learning_rate": 6.238425925925926e-06,
"loss": 1.0555,
"step": 326
},
{
"epoch": 0.3784722222222222,
"grad_norm": 0.18404638767242432,
"learning_rate": 6.226851851851853e-06,
"loss": 1.1022,
"step": 327
},
{
"epoch": 0.37962962962962965,
"grad_norm": 0.17180074751377106,
"learning_rate": 6.2152777777777785e-06,
"loss": 1.1233,
"step": 328
},
{
"epoch": 0.38078703703703703,
"grad_norm": 0.16245652735233307,
"learning_rate": 6.203703703703704e-06,
"loss": 1.088,
"step": 329
},
{
"epoch": 0.3819444444444444,
"grad_norm": 0.16371048986911774,
"learning_rate": 6.1921296296296294e-06,
"loss": 1.0627,
"step": 330
},
{
"epoch": 0.38310185185185186,
"grad_norm": 0.17475458979606628,
"learning_rate": 6.180555555555557e-06,
"loss": 1.0689,
"step": 331
},
{
"epoch": 0.38425925925925924,
"grad_norm": 0.1612567901611328,
"learning_rate": 6.168981481481482e-06,
"loss": 1.0404,
"step": 332
},
{
"epoch": 0.3854166666666667,
"grad_norm": 0.16201715171337128,
"learning_rate": 6.157407407407408e-06,
"loss": 1.0846,
"step": 333
},
{
"epoch": 0.38657407407407407,
"grad_norm": 0.15960504114627838,
"learning_rate": 6.145833333333334e-06,
"loss": 1.0497,
"step": 334
},
{
"epoch": 0.38773148148148145,
"grad_norm": 0.1676989644765854,
"learning_rate": 6.134259259259259e-06,
"loss": 1.0672,
"step": 335
},
{
"epoch": 0.3888888888888889,
"grad_norm": 0.17084524035453796,
"learning_rate": 6.122685185185186e-06,
"loss": 1.0504,
"step": 336
},
{
"epoch": 0.3900462962962963,
"grad_norm": 0.20204032957553864,
"learning_rate": 6.111111111111112e-06,
"loss": 1.0935,
"step": 337
},
{
"epoch": 0.3912037037037037,
"grad_norm": 0.16617536544799805,
"learning_rate": 6.0995370370370376e-06,
"loss": 1.0946,
"step": 338
},
{
"epoch": 0.3923611111111111,
"grad_norm": 0.17326408624649048,
"learning_rate": 6.087962962962963e-06,
"loss": 1.072,
"step": 339
},
{
"epoch": 0.39351851851851855,
"grad_norm": 0.1672402322292328,
"learning_rate": 6.0763888888888885e-06,
"loss": 1.0738,
"step": 340
},
{
"epoch": 0.39467592592592593,
"grad_norm": 0.18285104632377625,
"learning_rate": 6.064814814814816e-06,
"loss": 1.0381,
"step": 341
},
{
"epoch": 0.3958333333333333,
"grad_norm": 0.19718050956726074,
"learning_rate": 6.053240740740741e-06,
"loss": 1.0386,
"step": 342
},
{
"epoch": 0.39699074074074076,
"grad_norm": 0.1670585423707962,
"learning_rate": 6.041666666666667e-06,
"loss": 1.1144,
"step": 343
},
{
"epoch": 0.39814814814814814,
"grad_norm": 0.1710127294063568,
"learning_rate": 6.030092592592594e-06,
"loss": 1.1076,
"step": 344
},
{
"epoch": 0.3993055555555556,
"grad_norm": 0.1666959673166275,
"learning_rate": 6.018518518518519e-06,
"loss": 1.0994,
"step": 345
},
{
"epoch": 0.40046296296296297,
"grad_norm": 0.16433195769786835,
"learning_rate": 6.006944444444445e-06,
"loss": 1.0441,
"step": 346
},
{
"epoch": 0.40162037037037035,
"grad_norm": 0.1659228801727295,
"learning_rate": 5.995370370370371e-06,
"loss": 1.0886,
"step": 347
},
{
"epoch": 0.4027777777777778,
"grad_norm": 0.19117744266986847,
"learning_rate": 5.983796296296297e-06,
"loss": 1.0332,
"step": 348
},
{
"epoch": 0.4039351851851852,
"grad_norm": 0.17095550894737244,
"learning_rate": 5.972222222222222e-06,
"loss": 1.1132,
"step": 349
},
{
"epoch": 0.4050925925925926,
"grad_norm": 0.1593662053346634,
"learning_rate": 5.9606481481481485e-06,
"loss": 1.0432,
"step": 350
},
{
"epoch": 0.40625,
"grad_norm": 0.257305383682251,
"learning_rate": 5.949074074074075e-06,
"loss": 0.9819,
"step": 351
},
{
"epoch": 0.4074074074074074,
"grad_norm": 0.17508187890052795,
"learning_rate": 5.9375e-06,
"loss": 1.0263,
"step": 352
},
{
"epoch": 0.4085648148148148,
"grad_norm": 0.17764556407928467,
"learning_rate": 5.925925925925926e-06,
"loss": 1.0585,
"step": 353
},
{
"epoch": 0.4097222222222222,
"grad_norm": 0.1692568063735962,
"learning_rate": 5.914351851851853e-06,
"loss": 1.0111,
"step": 354
},
{
"epoch": 0.41087962962962965,
"grad_norm": 0.1803068220615387,
"learning_rate": 5.9027777777777785e-06,
"loss": 1.0209,
"step": 355
},
{
"epoch": 0.41203703703703703,
"grad_norm": 0.17242421209812164,
"learning_rate": 5.891203703703704e-06,
"loss": 1.0509,
"step": 356
},
{
"epoch": 0.4131944444444444,
"grad_norm": 0.17684854567050934,
"learning_rate": 5.8796296296296295e-06,
"loss": 1.0462,
"step": 357
},
{
"epoch": 0.41435185185185186,
"grad_norm": 0.16622677445411682,
"learning_rate": 5.868055555555557e-06,
"loss": 1.0227,
"step": 358
},
{
"epoch": 0.41550925925925924,
"grad_norm": 0.18189331889152527,
"learning_rate": 5.856481481481482e-06,
"loss": 1.0007,
"step": 359
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.17341043055057526,
"learning_rate": 5.844907407407408e-06,
"loss": 1.0132,
"step": 360
},
{
"epoch": 0.41782407407407407,
"grad_norm": 0.17046672105789185,
"learning_rate": 5.833333333333334e-06,
"loss": 1.0724,
"step": 361
},
{
"epoch": 0.41898148148148145,
"grad_norm": 0.1714986264705658,
"learning_rate": 5.8217592592592594e-06,
"loss": 1.0817,
"step": 362
},
{
"epoch": 0.4201388888888889,
"grad_norm": 0.17200608551502228,
"learning_rate": 5.810185185185186e-06,
"loss": 1.1255,
"step": 363
},
{
"epoch": 0.4212962962962963,
"grad_norm": 0.16442537307739258,
"learning_rate": 5.798611111111112e-06,
"loss": 1.0655,
"step": 364
},
{
"epoch": 0.4224537037037037,
"grad_norm": 0.17119750380516052,
"learning_rate": 5.787037037037038e-06,
"loss": 1.1056,
"step": 365
},
{
"epoch": 0.4236111111111111,
"grad_norm": 0.26399606466293335,
"learning_rate": 5.775462962962963e-06,
"loss": 1.0784,
"step": 366
},
{
"epoch": 0.42476851851851855,
"grad_norm": 0.17613111436367035,
"learning_rate": 5.7638888888888886e-06,
"loss": 1.0825,
"step": 367
},
{
"epoch": 0.42592592592592593,
"grad_norm": 0.1671566367149353,
"learning_rate": 5.752314814814816e-06,
"loss": 1.0748,
"step": 368
},
{
"epoch": 0.4270833333333333,
"grad_norm": 0.1717667132616043,
"learning_rate": 5.740740740740741e-06,
"loss": 1.0559,
"step": 369
},
{
"epoch": 0.42824074074074076,
"grad_norm": 0.1810443252325058,
"learning_rate": 5.729166666666667e-06,
"loss": 1.1115,
"step": 370
},
{
"epoch": 0.42939814814814814,
"grad_norm": 0.1782471239566803,
"learning_rate": 5.717592592592593e-06,
"loss": 1.1441,
"step": 371
},
{
"epoch": 0.4305555555555556,
"grad_norm": 0.16452831029891968,
"learning_rate": 5.706018518518519e-06,
"loss": 1.0284,
"step": 372
},
{
"epoch": 0.43171296296296297,
"grad_norm": 0.1789056956768036,
"learning_rate": 5.694444444444445e-06,
"loss": 1.0889,
"step": 373
},
{
"epoch": 0.43287037037037035,
"grad_norm": 0.18854652345180511,
"learning_rate": 5.682870370370371e-06,
"loss": 0.982,
"step": 374
},
{
"epoch": 0.4340277777777778,
"grad_norm": 0.16741988062858582,
"learning_rate": 5.671296296296297e-06,
"loss": 1.0511,
"step": 375
},
{
"epoch": 0.4351851851851852,
"grad_norm": 0.17495352029800415,
"learning_rate": 5.659722222222222e-06,
"loss": 1.0276,
"step": 376
},
{
"epoch": 0.4363425925925926,
"grad_norm": 0.18103112280368805,
"learning_rate": 5.6481481481481485e-06,
"loss": 1.0774,
"step": 377
},
{
"epoch": 0.4375,
"grad_norm": 0.17093877494335175,
"learning_rate": 5.636574074074075e-06,
"loss": 1.0617,
"step": 378
},
{
"epoch": 0.4386574074074074,
"grad_norm": 0.17672328650951385,
"learning_rate": 5.625e-06,
"loss": 1.1172,
"step": 379
},
{
"epoch": 0.4398148148148148,
"grad_norm": 0.17157655954360962,
"learning_rate": 5.613425925925926e-06,
"loss": 1.0651,
"step": 380
},
{
"epoch": 0.4409722222222222,
"grad_norm": 0.21019726991653442,
"learning_rate": 5.601851851851853e-06,
"loss": 1.0853,
"step": 381
},
{
"epoch": 0.44212962962962965,
"grad_norm": 0.16854119300842285,
"learning_rate": 5.5902777777777785e-06,
"loss": 1.0568,
"step": 382
},
{
"epoch": 0.44328703703703703,
"grad_norm": 0.17988835275173187,
"learning_rate": 5.578703703703704e-06,
"loss": 1.0607,
"step": 383
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.1783677488565445,
"learning_rate": 5.5671296296296295e-06,
"loss": 1.135,
"step": 384
},
{
"epoch": 0.44560185185185186,
"grad_norm": 0.1827668398618698,
"learning_rate": 5.555555555555557e-06,
"loss": 1.0959,
"step": 385
},
{
"epoch": 0.44675925925925924,
"grad_norm": 0.1617071032524109,
"learning_rate": 5.543981481481482e-06,
"loss": 1.0016,
"step": 386
},
{
"epoch": 0.4479166666666667,
"grad_norm": 0.17546425759792328,
"learning_rate": 5.532407407407408e-06,
"loss": 1.0316,
"step": 387
},
{
"epoch": 0.44907407407407407,
"grad_norm": 0.19249460101127625,
"learning_rate": 5.520833333333334e-06,
"loss": 1.0625,
"step": 388
},
{
"epoch": 0.45023148148148145,
"grad_norm": 0.18015442788600922,
"learning_rate": 5.5092592592592595e-06,
"loss": 1.0718,
"step": 389
},
{
"epoch": 0.4513888888888889,
"grad_norm": 0.17945371568202972,
"learning_rate": 5.497685185185185e-06,
"loss": 1.0278,
"step": 390
},
{
"epoch": 0.4525462962962963,
"grad_norm": 0.21291273832321167,
"learning_rate": 5.486111111111112e-06,
"loss": 1.0924,
"step": 391
},
{
"epoch": 0.4537037037037037,
"grad_norm": 0.18178021907806396,
"learning_rate": 5.474537037037038e-06,
"loss": 1.0163,
"step": 392
},
{
"epoch": 0.4548611111111111,
"grad_norm": 0.16679397225379944,
"learning_rate": 5.462962962962963e-06,
"loss": 1.0027,
"step": 393
},
{
"epoch": 0.45601851851851855,
"grad_norm": 0.19789078831672668,
"learning_rate": 5.451388888888889e-06,
"loss": 1.0478,
"step": 394
},
{
"epoch": 0.45717592592592593,
"grad_norm": 0.183577299118042,
"learning_rate": 5.439814814814816e-06,
"loss": 1.0869,
"step": 395
},
{
"epoch": 0.4583333333333333,
"grad_norm": 0.17376714944839478,
"learning_rate": 5.428240740740741e-06,
"loss": 1.0763,
"step": 396
},
{
"epoch": 0.45949074074074076,
"grad_norm": 0.17424362897872925,
"learning_rate": 5.416666666666667e-06,
"loss": 1.1034,
"step": 397
},
{
"epoch": 0.46064814814814814,
"grad_norm": 0.18567875027656555,
"learning_rate": 5.405092592592593e-06,
"loss": 1.1487,
"step": 398
},
{
"epoch": 0.4618055555555556,
"grad_norm": 0.1731259822845459,
"learning_rate": 5.3935185185185194e-06,
"loss": 1.0371,
"step": 399
},
{
"epoch": 0.46296296296296297,
"grad_norm": 0.1738322377204895,
"learning_rate": 5.381944444444445e-06,
"loss": 1.0892,
"step": 400
},
{
"epoch": 0.46412037037037035,
"grad_norm": 0.181026428937912,
"learning_rate": 5.370370370370371e-06,
"loss": 1.0335,
"step": 401
},
{
"epoch": 0.4652777777777778,
"grad_norm": 0.1937108188867569,
"learning_rate": 5.358796296296297e-06,
"loss": 1.0659,
"step": 402
},
{
"epoch": 0.4664351851851852,
"grad_norm": 0.1845736801624298,
"learning_rate": 5.347222222222222e-06,
"loss": 1.0769,
"step": 403
},
{
"epoch": 0.4675925925925926,
"grad_norm": 0.17622292041778564,
"learning_rate": 5.335648148148148e-06,
"loss": 1.0344,
"step": 404
},
{
"epoch": 0.46875,
"grad_norm": 0.19735385477542877,
"learning_rate": 5.324074074074075e-06,
"loss": 1.1107,
"step": 405
},
{
"epoch": 0.4699074074074074,
"grad_norm": 0.1723097264766693,
"learning_rate": 5.3125e-06,
"loss": 1.0776,
"step": 406
},
{
"epoch": 0.4710648148148148,
"grad_norm": 0.1863865703344345,
"learning_rate": 5.300925925925926e-06,
"loss": 1.0645,
"step": 407
},
{
"epoch": 0.4722222222222222,
"grad_norm": 0.18787802755832672,
"learning_rate": 5.289351851851853e-06,
"loss": 1.0585,
"step": 408
},
{
"epoch": 0.47337962962962965,
"grad_norm": 0.17600968480110168,
"learning_rate": 5.2777777777777785e-06,
"loss": 1.0306,
"step": 409
},
{
"epoch": 0.47453703703703703,
"grad_norm": 0.1793355494737625,
"learning_rate": 5.266203703703704e-06,
"loss": 1.0337,
"step": 410
},
{
"epoch": 0.4756944444444444,
"grad_norm": 0.17563559114933014,
"learning_rate": 5.2546296296296295e-06,
"loss": 1.0991,
"step": 411
},
{
"epoch": 0.47685185185185186,
"grad_norm": 0.17623300850391388,
"learning_rate": 5.243055555555556e-06,
"loss": 1.0204,
"step": 412
},
{
"epoch": 0.47800925925925924,
"grad_norm": 0.17649118602275848,
"learning_rate": 5.231481481481482e-06,
"loss": 1.0523,
"step": 413
},
{
"epoch": 0.4791666666666667,
"grad_norm": 0.1824215203523636,
"learning_rate": 5.219907407407408e-06,
"loss": 1.0839,
"step": 414
},
{
"epoch": 0.48032407407407407,
"grad_norm": 0.18569543957710266,
"learning_rate": 5.208333333333334e-06,
"loss": 1.0277,
"step": 415
},
{
"epoch": 0.48148148148148145,
"grad_norm": 0.18239271640777588,
"learning_rate": 5.1967592592592595e-06,
"loss": 0.9784,
"step": 416
},
{
"epoch": 0.4826388888888889,
"grad_norm": 0.17993135750293732,
"learning_rate": 5.185185185185185e-06,
"loss": 1.0564,
"step": 417
},
{
"epoch": 0.4837962962962963,
"grad_norm": 0.18158768117427826,
"learning_rate": 5.173611111111112e-06,
"loss": 1.0332,
"step": 418
},
{
"epoch": 0.4849537037037037,
"grad_norm": 0.1836976557970047,
"learning_rate": 5.162037037037038e-06,
"loss": 1.0209,
"step": 419
},
{
"epoch": 0.4861111111111111,
"grad_norm": 0.19026342034339905,
"learning_rate": 5.150462962962963e-06,
"loss": 1.1446,
"step": 420
},
{
"epoch": 0.48726851851851855,
"grad_norm": 0.17658670246601105,
"learning_rate": 5.138888888888889e-06,
"loss": 0.9938,
"step": 421
},
{
"epoch": 0.48842592592592593,
"grad_norm": 0.1957542598247528,
"learning_rate": 5.127314814814816e-06,
"loss": 1.0233,
"step": 422
},
{
"epoch": 0.4895833333333333,
"grad_norm": 0.18295548856258392,
"learning_rate": 5.115740740740741e-06,
"loss": 1.0643,
"step": 423
},
{
"epoch": 0.49074074074074076,
"grad_norm": 0.18762163817882538,
"learning_rate": 5.104166666666667e-06,
"loss": 1.0762,
"step": 424
},
{
"epoch": 0.49189814814814814,
"grad_norm": 0.18123242259025574,
"learning_rate": 5.092592592592593e-06,
"loss": 1.0738,
"step": 425
},
{
"epoch": 0.4930555555555556,
"grad_norm": 0.17455293238162994,
"learning_rate": 5.081018518518519e-06,
"loss": 1.0094,
"step": 426
},
{
"epoch": 0.49421296296296297,
"grad_norm": 0.1735786497592926,
"learning_rate": 5.069444444444445e-06,
"loss": 1.0997,
"step": 427
},
{
"epoch": 0.49537037037037035,
"grad_norm": 0.18896614015102386,
"learning_rate": 5.057870370370371e-06,
"loss": 1.0226,
"step": 428
},
{
"epoch": 0.4965277777777778,
"grad_norm": 0.2275708019733429,
"learning_rate": 5.046296296296297e-06,
"loss": 1.0348,
"step": 429
},
{
"epoch": 0.4976851851851852,
"grad_norm": 0.1853165179491043,
"learning_rate": 5.034722222222222e-06,
"loss": 1.0053,
"step": 430
},
{
"epoch": 0.4988425925925926,
"grad_norm": 0.1842850297689438,
"learning_rate": 5.023148148148148e-06,
"loss": 1.0847,
"step": 431
},
{
"epoch": 0.5,
"grad_norm": 0.19081000983715057,
"learning_rate": 5.011574074074075e-06,
"loss": 1.0571,
"step": 432
},
{
"epoch": 0.5011574074074074,
"grad_norm": 0.18483759462833405,
"learning_rate": 5e-06,
"loss": 1.017,
"step": 433
},
{
"epoch": 0.5023148148148148,
"grad_norm": 0.18229275941848755,
"learning_rate": 4.988425925925927e-06,
"loss": 1.0299,
"step": 434
},
{
"epoch": 0.5034722222222222,
"grad_norm": 0.19052374362945557,
"learning_rate": 4.976851851851852e-06,
"loss": 1.1137,
"step": 435
},
{
"epoch": 0.5046296296296297,
"grad_norm": 0.19372689723968506,
"learning_rate": 4.9652777777777786e-06,
"loss": 1.0611,
"step": 436
},
{
"epoch": 0.5057870370370371,
"grad_norm": 0.18391257524490356,
"learning_rate": 4.953703703703704e-06,
"loss": 1.0225,
"step": 437
},
{
"epoch": 0.5069444444444444,
"grad_norm": 0.18629541993141174,
"learning_rate": 4.94212962962963e-06,
"loss": 0.9979,
"step": 438
},
{
"epoch": 0.5081018518518519,
"grad_norm": 0.19393537938594818,
"learning_rate": 4.930555555555556e-06,
"loss": 1.1086,
"step": 439
},
{
"epoch": 0.5092592592592593,
"grad_norm": 0.1992591768503189,
"learning_rate": 4.918981481481482e-06,
"loss": 1.0497,
"step": 440
},
{
"epoch": 0.5104166666666666,
"grad_norm": 0.18729424476623535,
"learning_rate": 4.907407407407408e-06,
"loss": 1.0624,
"step": 441
},
{
"epoch": 0.5115740740740741,
"grad_norm": 0.18621307611465454,
"learning_rate": 4.895833333333333e-06,
"loss": 1.01,
"step": 442
},
{
"epoch": 0.5127314814814815,
"grad_norm": 0.1837424784898758,
"learning_rate": 4.8842592592592595e-06,
"loss": 1.055,
"step": 443
},
{
"epoch": 0.5138888888888888,
"grad_norm": 0.2010790854692459,
"learning_rate": 4.872685185185186e-06,
"loss": 1.1718,
"step": 444
},
{
"epoch": 0.5150462962962963,
"grad_norm": 0.1856854408979416,
"learning_rate": 4.861111111111111e-06,
"loss": 1.0231,
"step": 445
},
{
"epoch": 0.5162037037037037,
"grad_norm": 0.18698154389858246,
"learning_rate": 4.849537037037038e-06,
"loss": 1.0363,
"step": 446
},
{
"epoch": 0.5173611111111112,
"grad_norm": 0.18074962496757507,
"learning_rate": 4.837962962962963e-06,
"loss": 1.0585,
"step": 447
},
{
"epoch": 0.5185185185185185,
"grad_norm": 0.18344129621982574,
"learning_rate": 4.8263888888888895e-06,
"loss": 1.0549,
"step": 448
},
{
"epoch": 0.5196759259259259,
"grad_norm": 0.1772606372833252,
"learning_rate": 4.814814814814815e-06,
"loss": 1.1409,
"step": 449
},
{
"epoch": 0.5208333333333334,
"grad_norm": 0.18440352380275726,
"learning_rate": 4.803240740740741e-06,
"loss": 1.0651,
"step": 450
},
{
"epoch": 0.5219907407407407,
"grad_norm": 0.2034968137741089,
"learning_rate": 4.791666666666668e-06,
"loss": 0.9843,
"step": 451
},
{
"epoch": 0.5231481481481481,
"grad_norm": 0.18375763297080994,
"learning_rate": 4.780092592592593e-06,
"loss": 1.0571,
"step": 452
},
{
"epoch": 0.5243055555555556,
"grad_norm": 0.18968044221401215,
"learning_rate": 4.768518518518519e-06,
"loss": 1.0843,
"step": 453
},
{
"epoch": 0.5254629629629629,
"grad_norm": 0.18977278470993042,
"learning_rate": 4.756944444444445e-06,
"loss": 1.076,
"step": 454
},
{
"epoch": 0.5266203703703703,
"grad_norm": 0.191218301653862,
"learning_rate": 4.7453703703703705e-06,
"loss": 1.1139,
"step": 455
},
{
"epoch": 0.5277777777777778,
"grad_norm": 0.18904373049736023,
"learning_rate": 4.733796296296297e-06,
"loss": 1.1205,
"step": 456
},
{
"epoch": 0.5289351851851852,
"grad_norm": 0.19077414274215698,
"learning_rate": 4.722222222222222e-06,
"loss": 1.086,
"step": 457
},
{
"epoch": 0.5300925925925926,
"grad_norm": 0.19545550644397736,
"learning_rate": 4.710648148148149e-06,
"loss": 1.0117,
"step": 458
},
{
"epoch": 0.53125,
"grad_norm": 0.19482102990150452,
"learning_rate": 4.699074074074074e-06,
"loss": 1.0768,
"step": 459
},
{
"epoch": 0.5324074074074074,
"grad_norm": 0.1793646365404129,
"learning_rate": 4.6875000000000004e-06,
"loss": 1.0542,
"step": 460
},
{
"epoch": 0.5335648148148148,
"grad_norm": 0.18208812177181244,
"learning_rate": 4.675925925925927e-06,
"loss": 1.0722,
"step": 461
},
{
"epoch": 0.5347222222222222,
"grad_norm": 0.19631284475326538,
"learning_rate": 4.664351851851852e-06,
"loss": 1.0069,
"step": 462
},
{
"epoch": 0.5358796296296297,
"grad_norm": 0.1896466761827469,
"learning_rate": 4.652777777777779e-06,
"loss": 1.0232,
"step": 463
},
{
"epoch": 0.5370370370370371,
"grad_norm": 0.17967751622200012,
"learning_rate": 4.641203703703704e-06,
"loss": 1.0982,
"step": 464
},
{
"epoch": 0.5381944444444444,
"grad_norm": 0.19527268409729004,
"learning_rate": 4.62962962962963e-06,
"loss": 1.0464,
"step": 465
},
{
"epoch": 0.5393518518518519,
"grad_norm": 0.18822702765464783,
"learning_rate": 4.618055555555556e-06,
"loss": 1.0881,
"step": 466
},
{
"epoch": 0.5405092592592593,
"grad_norm": 0.19704404473304749,
"learning_rate": 4.606481481481481e-06,
"loss": 1.0765,
"step": 467
},
{
"epoch": 0.5416666666666666,
"grad_norm": 0.1972915083169937,
"learning_rate": 4.594907407407408e-06,
"loss": 1.1231,
"step": 468
},
{
"epoch": 0.5428240740740741,
"grad_norm": 0.18384619057178497,
"learning_rate": 4.583333333333333e-06,
"loss": 1.0372,
"step": 469
},
{
"epoch": 0.5439814814814815,
"grad_norm": 0.19371986389160156,
"learning_rate": 4.5717592592592595e-06,
"loss": 1.0317,
"step": 470
},
{
"epoch": 0.5451388888888888,
"grad_norm": 0.21294532716274261,
"learning_rate": 4.560185185185186e-06,
"loss": 0.9952,
"step": 471
},
{
"epoch": 0.5462962962962963,
"grad_norm": 0.19161927700042725,
"learning_rate": 4.548611111111111e-06,
"loss": 1.0672,
"step": 472
},
{
"epoch": 0.5474537037037037,
"grad_norm": 0.18557506799697876,
"learning_rate": 4.537037037037038e-06,
"loss": 1.0299,
"step": 473
},
{
"epoch": 0.5486111111111112,
"grad_norm": 0.19008249044418335,
"learning_rate": 4.525462962962963e-06,
"loss": 1.0675,
"step": 474
},
{
"epoch": 0.5497685185185185,
"grad_norm": 0.19142603874206543,
"learning_rate": 4.5138888888888895e-06,
"loss": 1.0979,
"step": 475
},
{
"epoch": 0.5509259259259259,
"grad_norm": 0.19404229521751404,
"learning_rate": 4.502314814814815e-06,
"loss": 0.9621,
"step": 476
},
{
"epoch": 0.5520833333333334,
"grad_norm": 0.19568336009979248,
"learning_rate": 4.490740740740741e-06,
"loss": 1.0802,
"step": 477
},
{
"epoch": 0.5532407407407407,
"grad_norm": 0.1997448205947876,
"learning_rate": 4.479166666666667e-06,
"loss": 1.1484,
"step": 478
},
{
"epoch": 0.5543981481481481,
"grad_norm": 0.2235734760761261,
"learning_rate": 4.467592592592593e-06,
"loss": 1.0239,
"step": 479
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.20642857253551483,
"learning_rate": 4.456018518518519e-06,
"loss": 1.0456,
"step": 480
},
{
"epoch": 0.5567129629629629,
"grad_norm": 0.1832839995622635,
"learning_rate": 4.444444444444444e-06,
"loss": 1.0074,
"step": 481
},
{
"epoch": 0.5578703703703703,
"grad_norm": 0.1859252154827118,
"learning_rate": 4.4328703703703705e-06,
"loss": 1.0592,
"step": 482
},
{
"epoch": 0.5590277777777778,
"grad_norm": 0.19628946483135223,
"learning_rate": 4.421296296296297e-06,
"loss": 0.9807,
"step": 483
},
{
"epoch": 0.5601851851851852,
"grad_norm": 0.18916712701320648,
"learning_rate": 4.409722222222222e-06,
"loss": 0.9956,
"step": 484
},
{
"epoch": 0.5613425925925926,
"grad_norm": 0.19245581328868866,
"learning_rate": 4.398148148148149e-06,
"loss": 1.0348,
"step": 485
},
{
"epoch": 0.5625,
"grad_norm": 0.18789884448051453,
"learning_rate": 4.386574074074074e-06,
"loss": 1.0984,
"step": 486
},
{
"epoch": 0.5636574074074074,
"grad_norm": 0.18986792862415314,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.9934,
"step": 487
},
{
"epoch": 0.5648148148148148,
"grad_norm": 0.1932685375213623,
"learning_rate": 4.363425925925927e-06,
"loss": 1.1058,
"step": 488
},
{
"epoch": 0.5659722222222222,
"grad_norm": 0.21794289350509644,
"learning_rate": 4.351851851851852e-06,
"loss": 1.0482,
"step": 489
},
{
"epoch": 0.5671296296296297,
"grad_norm": 0.19237902760505676,
"learning_rate": 4.340277777777779e-06,
"loss": 1.0242,
"step": 490
},
{
"epoch": 0.5682870370370371,
"grad_norm": 0.19967250525951385,
"learning_rate": 4.328703703703704e-06,
"loss": 1.0208,
"step": 491
},
{
"epoch": 0.5694444444444444,
"grad_norm": 0.18474088609218597,
"learning_rate": 4.31712962962963e-06,
"loss": 1.0251,
"step": 492
},
{
"epoch": 0.5706018518518519,
"grad_norm": 0.22464704513549805,
"learning_rate": 4.305555555555556e-06,
"loss": 0.9969,
"step": 493
},
{
"epoch": 0.5717592592592593,
"grad_norm": 0.1928233802318573,
"learning_rate": 4.293981481481481e-06,
"loss": 1.1077,
"step": 494
},
{
"epoch": 0.5729166666666666,
"grad_norm": 0.20419670641422272,
"learning_rate": 4.282407407407408e-06,
"loss": 1.0883,
"step": 495
},
{
"epoch": 0.5740740740740741,
"grad_norm": 0.21001799404621124,
"learning_rate": 4.270833333333333e-06,
"loss": 1.0765,
"step": 496
},
{
"epoch": 0.5752314814814815,
"grad_norm": 0.197916179895401,
"learning_rate": 4.2592592592592596e-06,
"loss": 1.0165,
"step": 497
},
{
"epoch": 0.5763888888888888,
"grad_norm": 0.18999773263931274,
"learning_rate": 4.247685185185186e-06,
"loss": 1.0272,
"step": 498
},
{
"epoch": 0.5775462962962963,
"grad_norm": 0.19263650476932526,
"learning_rate": 4.236111111111111e-06,
"loss": 0.985,
"step": 499
},
{
"epoch": 0.5787037037037037,
"grad_norm": 0.19697001576423645,
"learning_rate": 4.224537037037038e-06,
"loss": 1.1516,
"step": 500
},
{
"epoch": 0.5798611111111112,
"grad_norm": 0.20765507221221924,
"learning_rate": 4.212962962962963e-06,
"loss": 1.0993,
"step": 501
},
{
"epoch": 0.5810185185185185,
"grad_norm": 0.1897115409374237,
"learning_rate": 4.2013888888888896e-06,
"loss": 1.051,
"step": 502
},
{
"epoch": 0.5821759259259259,
"grad_norm": 0.20036956667900085,
"learning_rate": 4.189814814814815e-06,
"loss": 1.0293,
"step": 503
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.19671155512332916,
"learning_rate": 4.178240740740741e-06,
"loss": 1.0452,
"step": 504
},
{
"epoch": 0.5844907407407407,
"grad_norm": 0.1968628168106079,
"learning_rate": 4.166666666666667e-06,
"loss": 1.0635,
"step": 505
},
{
"epoch": 0.5856481481481481,
"grad_norm": 0.19419008493423462,
"learning_rate": 4.155092592592593e-06,
"loss": 1.0393,
"step": 506
},
{
"epoch": 0.5868055555555556,
"grad_norm": 0.20477062463760376,
"learning_rate": 4.143518518518519e-06,
"loss": 1.1189,
"step": 507
},
{
"epoch": 0.5879629629629629,
"grad_norm": 0.19754110276699066,
"learning_rate": 4.131944444444444e-06,
"loss": 1.0108,
"step": 508
},
{
"epoch": 0.5891203703703703,
"grad_norm": 0.19897371530532837,
"learning_rate": 4.1203703703703705e-06,
"loss": 0.9864,
"step": 509
},
{
"epoch": 0.5902777777777778,
"grad_norm": 0.19264736771583557,
"learning_rate": 4.108796296296297e-06,
"loss": 1.0757,
"step": 510
},
{
"epoch": 0.5914351851851852,
"grad_norm": 0.19793768227100372,
"learning_rate": 4.097222222222222e-06,
"loss": 1.0985,
"step": 511
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.22821103036403656,
"learning_rate": 4.085648148148149e-06,
"loss": 1.0662,
"step": 512
},
{
"epoch": 0.59375,
"grad_norm": 0.2039816677570343,
"learning_rate": 4.074074074074074e-06,
"loss": 1.0677,
"step": 513
},
{
"epoch": 0.5949074074074074,
"grad_norm": 0.19190923869609833,
"learning_rate": 4.0625000000000005e-06,
"loss": 1.052,
"step": 514
},
{
"epoch": 0.5960648148148148,
"grad_norm": 0.18609005212783813,
"learning_rate": 4.050925925925927e-06,
"loss": 1.0877,
"step": 515
},
{
"epoch": 0.5972222222222222,
"grad_norm": 0.1913410723209381,
"learning_rate": 4.039351851851852e-06,
"loss": 1.1173,
"step": 516
},
{
"epoch": 0.5983796296296297,
"grad_norm": 0.1951320916414261,
"learning_rate": 4.027777777777779e-06,
"loss": 1.032,
"step": 517
},
{
"epoch": 0.5995370370370371,
"grad_norm": 0.19345664978027344,
"learning_rate": 4.016203703703704e-06,
"loss": 1.0731,
"step": 518
},
{
"epoch": 0.6006944444444444,
"grad_norm": 0.19238971173763275,
"learning_rate": 4.00462962962963e-06,
"loss": 1.0494,
"step": 519
},
{
"epoch": 0.6018518518518519,
"grad_norm": 0.24165932834148407,
"learning_rate": 3.993055555555556e-06,
"loss": 1.0699,
"step": 520
},
{
"epoch": 0.6030092592592593,
"grad_norm": 0.19671660661697388,
"learning_rate": 3.9814814814814814e-06,
"loss": 0.9833,
"step": 521
},
{
"epoch": 0.6041666666666666,
"grad_norm": 0.20090967416763306,
"learning_rate": 3.969907407407408e-06,
"loss": 1.0286,
"step": 522
},
{
"epoch": 0.6053240740740741,
"grad_norm": 0.18830811977386475,
"learning_rate": 3.958333333333333e-06,
"loss": 0.9915,
"step": 523
},
{
"epoch": 0.6064814814814815,
"grad_norm": 0.2034299075603485,
"learning_rate": 3.94675925925926e-06,
"loss": 1.0386,
"step": 524
},
{
"epoch": 0.6076388888888888,
"grad_norm": 0.206466406583786,
"learning_rate": 3.935185185185186e-06,
"loss": 1.1197,
"step": 525
},
{
"epoch": 0.6087962962962963,
"grad_norm": 0.1990206092596054,
"learning_rate": 3.9236111111111114e-06,
"loss": 1.0646,
"step": 526
},
{
"epoch": 0.6099537037037037,
"grad_norm": 0.18752767145633698,
"learning_rate": 3.912037037037038e-06,
"loss": 1.0491,
"step": 527
},
{
"epoch": 0.6111111111111112,
"grad_norm": 0.18697869777679443,
"learning_rate": 3.900462962962963e-06,
"loss": 1.0638,
"step": 528
},
{
"epoch": 0.6122685185185185,
"grad_norm": 0.19752278923988342,
"learning_rate": 3.88888888888889e-06,
"loss": 1.0748,
"step": 529
},
{
"epoch": 0.6134259259259259,
"grad_norm": 0.21283791959285736,
"learning_rate": 3.877314814814815e-06,
"loss": 1.0623,
"step": 530
},
{
"epoch": 0.6145833333333334,
"grad_norm": 0.19169341027736664,
"learning_rate": 3.865740740740741e-06,
"loss": 1.0359,
"step": 531
},
{
"epoch": 0.6157407407407407,
"grad_norm": 0.2018590122461319,
"learning_rate": 3.854166666666667e-06,
"loss": 0.9971,
"step": 532
},
{
"epoch": 0.6168981481481481,
"grad_norm": 0.22165873646736145,
"learning_rate": 3.842592592592592e-06,
"loss": 1.0325,
"step": 533
},
{
"epoch": 0.6180555555555556,
"grad_norm": 0.2014155089855194,
"learning_rate": 3.831018518518519e-06,
"loss": 1.0155,
"step": 534
},
{
"epoch": 0.6192129629629629,
"grad_norm": 0.19486472010612488,
"learning_rate": 3.819444444444444e-06,
"loss": 1.0517,
"step": 535
},
{
"epoch": 0.6203703703703703,
"grad_norm": 0.19350384175777435,
"learning_rate": 3.8078703703703705e-06,
"loss": 1.0234,
"step": 536
},
{
"epoch": 0.6215277777777778,
"grad_norm": 0.20624063909053802,
"learning_rate": 3.796296296296297e-06,
"loss": 1.0635,
"step": 537
},
{
"epoch": 0.6226851851851852,
"grad_norm": 0.19752593338489532,
"learning_rate": 3.7847222222222224e-06,
"loss": 1.048,
"step": 538
},
{
"epoch": 0.6238425925925926,
"grad_norm": 0.19572751224040985,
"learning_rate": 3.7731481481481487e-06,
"loss": 1.0476,
"step": 539
},
{
"epoch": 0.625,
"grad_norm": 0.19526030123233795,
"learning_rate": 3.761574074074074e-06,
"loss": 1.0884,
"step": 540
},
{
"epoch": 0.6261574074074074,
"grad_norm": 0.1917031854391098,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.0593,
"step": 541
},
{
"epoch": 0.6273148148148148,
"grad_norm": 0.18925105035305023,
"learning_rate": 3.7384259259259264e-06,
"loss": 1.056,
"step": 542
},
{
"epoch": 0.6284722222222222,
"grad_norm": 0.18812283873558044,
"learning_rate": 3.726851851851852e-06,
"loss": 1.0288,
"step": 543
},
{
"epoch": 0.6296296296296297,
"grad_norm": 0.2086893916130066,
"learning_rate": 3.7152777777777783e-06,
"loss": 1.0505,
"step": 544
},
{
"epoch": 0.6307870370370371,
"grad_norm": 0.2322862148284912,
"learning_rate": 3.7037037037037037e-06,
"loss": 1.0552,
"step": 545
},
{
"epoch": 0.6319444444444444,
"grad_norm": 0.19495348632335663,
"learning_rate": 3.69212962962963e-06,
"loss": 1.085,
"step": 546
},
{
"epoch": 0.6331018518518519,
"grad_norm": 0.19767415523529053,
"learning_rate": 3.680555555555556e-06,
"loss": 1.0229,
"step": 547
},
{
"epoch": 0.6342592592592593,
"grad_norm": 0.19750790297985077,
"learning_rate": 3.668981481481482e-06,
"loss": 1.0217,
"step": 548
},
{
"epoch": 0.6354166666666666,
"grad_norm": 0.20190097391605377,
"learning_rate": 3.657407407407408e-06,
"loss": 1.0161,
"step": 549
},
{
"epoch": 0.6365740740740741,
"grad_norm": 0.2032175362110138,
"learning_rate": 3.6458333333333333e-06,
"loss": 1.0353,
"step": 550
},
{
"epoch": 0.6377314814814815,
"grad_norm": 0.19069179892539978,
"learning_rate": 3.6342592592592596e-06,
"loss": 1.0275,
"step": 551
},
{
"epoch": 0.6388888888888888,
"grad_norm": 0.20992456376552582,
"learning_rate": 3.6226851851851855e-06,
"loss": 1.0731,
"step": 552
},
{
"epoch": 0.6400462962962963,
"grad_norm": 0.2105102688074112,
"learning_rate": 3.6111111111111115e-06,
"loss": 1.0781,
"step": 553
},
{
"epoch": 0.6412037037037037,
"grad_norm": 0.19200275838375092,
"learning_rate": 3.5995370370370374e-06,
"loss": 1.0076,
"step": 554
},
{
"epoch": 0.6423611111111112,
"grad_norm": 0.2056274116039276,
"learning_rate": 3.5879629629629633e-06,
"loss": 1.0997,
"step": 555
},
{
"epoch": 0.6435185185185185,
"grad_norm": 0.3178321421146393,
"learning_rate": 3.576388888888889e-06,
"loss": 1.0507,
"step": 556
},
{
"epoch": 0.6446759259259259,
"grad_norm": 0.19499000906944275,
"learning_rate": 3.5648148148148147e-06,
"loss": 1.1137,
"step": 557
},
{
"epoch": 0.6458333333333334,
"grad_norm": 0.19996462762355804,
"learning_rate": 3.553240740740741e-06,
"loss": 1.0567,
"step": 558
},
{
"epoch": 0.6469907407407407,
"grad_norm": 0.19035173952579498,
"learning_rate": 3.5416666666666673e-06,
"loss": 1.0325,
"step": 559
},
{
"epoch": 0.6481481481481481,
"grad_norm": 0.20647963881492615,
"learning_rate": 3.530092592592593e-06,
"loss": 1.0859,
"step": 560
},
{
"epoch": 0.6493055555555556,
"grad_norm": 0.20454923808574677,
"learning_rate": 3.5185185185185187e-06,
"loss": 1.07,
"step": 561
},
{
"epoch": 0.6504629629629629,
"grad_norm": 0.18973702192306519,
"learning_rate": 3.5069444444444447e-06,
"loss": 1.0253,
"step": 562
},
{
"epoch": 0.6516203703703703,
"grad_norm": 0.23206621408462524,
"learning_rate": 3.4953703703703706e-06,
"loss": 0.9817,
"step": 563
},
{
"epoch": 0.6527777777777778,
"grad_norm": 0.19497150182724,
"learning_rate": 3.483796296296297e-06,
"loss": 0.9765,
"step": 564
},
{
"epoch": 0.6539351851851852,
"grad_norm": 0.20704008638858795,
"learning_rate": 3.4722222222222224e-06,
"loss": 1.0923,
"step": 565
},
{
"epoch": 0.6550925925925926,
"grad_norm": 0.19126930832862854,
"learning_rate": 3.4606481481481487e-06,
"loss": 1.0344,
"step": 566
},
{
"epoch": 0.65625,
"grad_norm": 0.3226393163204193,
"learning_rate": 3.449074074074074e-06,
"loss": 1.0769,
"step": 567
},
{
"epoch": 0.6574074074074074,
"grad_norm": 0.22098763287067413,
"learning_rate": 3.4375e-06,
"loss": 1.0469,
"step": 568
},
{
"epoch": 0.6585648148148148,
"grad_norm": 0.20191290974617004,
"learning_rate": 3.4259259259259265e-06,
"loss": 1.0734,
"step": 569
},
{
"epoch": 0.6597222222222222,
"grad_norm": 0.2134922593832016,
"learning_rate": 3.414351851851852e-06,
"loss": 1.0846,
"step": 570
},
{
"epoch": 0.6608796296296297,
"grad_norm": 0.23931817710399628,
"learning_rate": 3.4027777777777783e-06,
"loss": 1.0915,
"step": 571
},
{
"epoch": 0.6620370370370371,
"grad_norm": 0.3414127230644226,
"learning_rate": 3.3912037037037038e-06,
"loss": 1.0368,
"step": 572
},
{
"epoch": 0.6631944444444444,
"grad_norm": 0.2182944118976593,
"learning_rate": 3.37962962962963e-06,
"loss": 1.0002,
"step": 573
},
{
"epoch": 0.6643518518518519,
"grad_norm": 0.1967781037092209,
"learning_rate": 3.368055555555556e-06,
"loss": 1.0334,
"step": 574
},
{
"epoch": 0.6655092592592593,
"grad_norm": 0.2108740508556366,
"learning_rate": 3.3564814814814815e-06,
"loss": 1.1083,
"step": 575
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.20377075672149658,
"learning_rate": 3.344907407407408e-06,
"loss": 0.9726,
"step": 576
},
{
"epoch": 0.6678240740740741,
"grad_norm": 0.19917164742946625,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.0444,
"step": 577
},
{
"epoch": 0.6689814814814815,
"grad_norm": 0.20163099467754364,
"learning_rate": 3.3217592592592597e-06,
"loss": 1.0322,
"step": 578
},
{
"epoch": 0.6701388888888888,
"grad_norm": 0.20033225417137146,
"learning_rate": 3.3101851851851856e-06,
"loss": 1.0193,
"step": 579
},
{
"epoch": 0.6712962962962963,
"grad_norm": 0.1936756819486618,
"learning_rate": 3.2986111111111115e-06,
"loss": 1.0447,
"step": 580
},
{
"epoch": 0.6724537037037037,
"grad_norm": 0.19549590349197388,
"learning_rate": 3.2870370370370374e-06,
"loss": 1.0219,
"step": 581
},
{
"epoch": 0.6736111111111112,
"grad_norm": 0.20389345288276672,
"learning_rate": 3.275462962962963e-06,
"loss": 1.0208,
"step": 582
},
{
"epoch": 0.6747685185185185,
"grad_norm": 0.2609894275665283,
"learning_rate": 3.2638888888888892e-06,
"loss": 1.0721,
"step": 583
},
{
"epoch": 0.6759259259259259,
"grad_norm": 0.19846400618553162,
"learning_rate": 3.2523148148148147e-06,
"loss": 1.0241,
"step": 584
},
{
"epoch": 0.6770833333333334,
"grad_norm": 0.20081739127635956,
"learning_rate": 3.240740740740741e-06,
"loss": 1.01,
"step": 585
},
{
"epoch": 0.6782407407407407,
"grad_norm": 0.19961774349212646,
"learning_rate": 3.229166666666667e-06,
"loss": 1.0194,
"step": 586
},
{
"epoch": 0.6793981481481481,
"grad_norm": 0.20194299519062042,
"learning_rate": 3.217592592592593e-06,
"loss": 1.0244,
"step": 587
},
{
"epoch": 0.6805555555555556,
"grad_norm": 0.2130938321352005,
"learning_rate": 3.2060185185185188e-06,
"loss": 1.1111,
"step": 588
},
{
"epoch": 0.6817129629629629,
"grad_norm": 0.21115541458129883,
"learning_rate": 3.1944444444444443e-06,
"loss": 1.0162,
"step": 589
},
{
"epoch": 0.6828703703703703,
"grad_norm": 0.22813038527965546,
"learning_rate": 3.1828703703703706e-06,
"loss": 1.0524,
"step": 590
},
{
"epoch": 0.6840277777777778,
"grad_norm": 0.20492292940616608,
"learning_rate": 3.171296296296297e-06,
"loss": 1.0527,
"step": 591
},
{
"epoch": 0.6851851851851852,
"grad_norm": 0.2057611495256424,
"learning_rate": 3.1597222222222224e-06,
"loss": 1.0563,
"step": 592
},
{
"epoch": 0.6863425925925926,
"grad_norm": 0.19977176189422607,
"learning_rate": 3.1481481481481483e-06,
"loss": 1.0076,
"step": 593
},
{
"epoch": 0.6875,
"grad_norm": 0.20001326501369476,
"learning_rate": 3.1365740740740742e-06,
"loss": 1.0837,
"step": 594
},
{
"epoch": 0.6886574074074074,
"grad_norm": 0.20452041923999786,
"learning_rate": 3.125e-06,
"loss": 1.0414,
"step": 595
},
{
"epoch": 0.6898148148148148,
"grad_norm": 0.19678117334842682,
"learning_rate": 3.1134259259259265e-06,
"loss": 1.0396,
"step": 596
},
{
"epoch": 0.6909722222222222,
"grad_norm": 0.2082507163286209,
"learning_rate": 3.101851851851852e-06,
"loss": 0.9543,
"step": 597
},
{
"epoch": 0.6921296296296297,
"grad_norm": 0.20555655658245087,
"learning_rate": 3.0902777777777783e-06,
"loss": 1.0255,
"step": 598
},
{
"epoch": 0.6932870370370371,
"grad_norm": 0.2342251092195511,
"learning_rate": 3.078703703703704e-06,
"loss": 1.0321,
"step": 599
},
{
"epoch": 0.6944444444444444,
"grad_norm": 0.19482319056987762,
"learning_rate": 3.0671296296296297e-06,
"loss": 1.0691,
"step": 600
},
{
"epoch": 0.6956018518518519,
"grad_norm": 0.20958203077316284,
"learning_rate": 3.055555555555556e-06,
"loss": 1.1018,
"step": 601
},
{
"epoch": 0.6967592592592593,
"grad_norm": 0.20291192829608917,
"learning_rate": 3.0439814814814815e-06,
"loss": 1.0635,
"step": 602
},
{
"epoch": 0.6979166666666666,
"grad_norm": 0.2140427827835083,
"learning_rate": 3.032407407407408e-06,
"loss": 1.0796,
"step": 603
},
{
"epoch": 0.6990740740740741,
"grad_norm": 0.2033246010541916,
"learning_rate": 3.0208333333333334e-06,
"loss": 1.0493,
"step": 604
},
{
"epoch": 0.7002314814814815,
"grad_norm": 0.19889415800571442,
"learning_rate": 3.0092592592592597e-06,
"loss": 0.9938,
"step": 605
},
{
"epoch": 0.7013888888888888,
"grad_norm": 0.19974705576896667,
"learning_rate": 2.9976851851851856e-06,
"loss": 1.1022,
"step": 606
},
{
"epoch": 0.7025462962962963,
"grad_norm": 0.2727442681789398,
"learning_rate": 2.986111111111111e-06,
"loss": 1.0657,
"step": 607
},
{
"epoch": 0.7037037037037037,
"grad_norm": 0.27862659096717834,
"learning_rate": 2.9745370370370374e-06,
"loss": 0.9777,
"step": 608
},
{
"epoch": 0.7048611111111112,
"grad_norm": 0.1923118382692337,
"learning_rate": 2.962962962962963e-06,
"loss": 1.0446,
"step": 609
},
{
"epoch": 0.7060185185185185,
"grad_norm": 0.21940167248249054,
"learning_rate": 2.9513888888888892e-06,
"loss": 0.9947,
"step": 610
},
{
"epoch": 0.7071759259259259,
"grad_norm": 0.19208753108978271,
"learning_rate": 2.9398148148148147e-06,
"loss": 1.0289,
"step": 611
},
{
"epoch": 0.7083333333333334,
"grad_norm": 0.19762782752513885,
"learning_rate": 2.928240740740741e-06,
"loss": 0.9995,
"step": 612
},
{
"epoch": 0.7094907407407407,
"grad_norm": 0.20181262493133545,
"learning_rate": 2.916666666666667e-06,
"loss": 0.9995,
"step": 613
},
{
"epoch": 0.7106481481481481,
"grad_norm": 0.20299744606018066,
"learning_rate": 2.905092592592593e-06,
"loss": 1.0601,
"step": 614
},
{
"epoch": 0.7118055555555556,
"grad_norm": 0.23061317205429077,
"learning_rate": 2.893518518518519e-06,
"loss": 1.0337,
"step": 615
},
{
"epoch": 0.7129629629629629,
"grad_norm": 0.2045440375804901,
"learning_rate": 2.8819444444444443e-06,
"loss": 0.9872,
"step": 616
},
{
"epoch": 0.7141203703703703,
"grad_norm": 0.20532366633415222,
"learning_rate": 2.8703703703703706e-06,
"loss": 1.0355,
"step": 617
},
{
"epoch": 0.7152777777777778,
"grad_norm": 0.20266734063625336,
"learning_rate": 2.8587962962962965e-06,
"loss": 1.0675,
"step": 618
},
{
"epoch": 0.7164351851851852,
"grad_norm": 0.2020139992237091,
"learning_rate": 2.8472222222222224e-06,
"loss": 1.0446,
"step": 619
},
{
"epoch": 0.7175925925925926,
"grad_norm": 0.21043045818805695,
"learning_rate": 2.8356481481481484e-06,
"loss": 1.1025,
"step": 620
},
{
"epoch": 0.71875,
"grad_norm": 0.20197473466396332,
"learning_rate": 2.8240740740740743e-06,
"loss": 1.0526,
"step": 621
},
{
"epoch": 0.7199074074074074,
"grad_norm": 0.21696704626083374,
"learning_rate": 2.8125e-06,
"loss": 1.0345,
"step": 622
},
{
"epoch": 0.7210648148148148,
"grad_norm": 0.203839972615242,
"learning_rate": 2.8009259259259265e-06,
"loss": 1.035,
"step": 623
},
{
"epoch": 0.7222222222222222,
"grad_norm": 0.21520066261291504,
"learning_rate": 2.789351851851852e-06,
"loss": 1.033,
"step": 624
},
{
"epoch": 0.7233796296296297,
"grad_norm": 0.212286576628685,
"learning_rate": 2.7777777777777783e-06,
"loss": 1.0467,
"step": 625
},
{
"epoch": 0.7245370370370371,
"grad_norm": 0.20607005059719086,
"learning_rate": 2.766203703703704e-06,
"loss": 1.0341,
"step": 626
},
{
"epoch": 0.7256944444444444,
"grad_norm": 0.2012479156255722,
"learning_rate": 2.7546296296296297e-06,
"loss": 1.0087,
"step": 627
},
{
"epoch": 0.7268518518518519,
"grad_norm": 0.2070990800857544,
"learning_rate": 2.743055555555556e-06,
"loss": 1.0397,
"step": 628
},
{
"epoch": 0.7280092592592593,
"grad_norm": 0.20724061131477356,
"learning_rate": 2.7314814814814816e-06,
"loss": 1.0123,
"step": 629
},
{
"epoch": 0.7291666666666666,
"grad_norm": 0.20699013769626617,
"learning_rate": 2.719907407407408e-06,
"loss": 0.992,
"step": 630
},
{
"epoch": 0.7303240740740741,
"grad_norm": 0.2006494551897049,
"learning_rate": 2.7083333333333334e-06,
"loss": 1.0206,
"step": 631
},
{
"epoch": 0.7314814814814815,
"grad_norm": 0.2076648622751236,
"learning_rate": 2.6967592592592597e-06,
"loss": 1.0702,
"step": 632
},
{
"epoch": 0.7326388888888888,
"grad_norm": 0.1989884227514267,
"learning_rate": 2.6851851851851856e-06,
"loss": 1.0134,
"step": 633
},
{
"epoch": 0.7337962962962963,
"grad_norm": 0.20851609110832214,
"learning_rate": 2.673611111111111e-06,
"loss": 1.0791,
"step": 634
},
{
"epoch": 0.7349537037037037,
"grad_norm": 0.2005952000617981,
"learning_rate": 2.6620370370370374e-06,
"loss": 1.0168,
"step": 635
},
{
"epoch": 0.7361111111111112,
"grad_norm": 0.20755450427532196,
"learning_rate": 2.650462962962963e-06,
"loss": 1.0862,
"step": 636
},
{
"epoch": 0.7372685185185185,
"grad_norm": 0.20766755938529968,
"learning_rate": 2.6388888888888893e-06,
"loss": 1.0771,
"step": 637
},
{
"epoch": 0.7384259259259259,
"grad_norm": 0.2103908360004425,
"learning_rate": 2.6273148148148148e-06,
"loss": 1.0256,
"step": 638
},
{
"epoch": 0.7395833333333334,
"grad_norm": 0.20396418869495392,
"learning_rate": 2.615740740740741e-06,
"loss": 1.0897,
"step": 639
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.2057056427001953,
"learning_rate": 2.604166666666667e-06,
"loss": 1.0358,
"step": 640
},
{
"epoch": 0.7418981481481481,
"grad_norm": 0.20586839318275452,
"learning_rate": 2.5925925925925925e-06,
"loss": 1.0551,
"step": 641
},
{
"epoch": 0.7430555555555556,
"grad_norm": 0.20847848057746887,
"learning_rate": 2.581018518518519e-06,
"loss": 1.0597,
"step": 642
},
{
"epoch": 0.7442129629629629,
"grad_norm": 0.19656863808631897,
"learning_rate": 2.5694444444444443e-06,
"loss": 0.9972,
"step": 643
},
{
"epoch": 0.7453703703703703,
"grad_norm": 0.20384305715560913,
"learning_rate": 2.5578703703703706e-06,
"loss": 1.087,
"step": 644
},
{
"epoch": 0.7465277777777778,
"grad_norm": 0.21364444494247437,
"learning_rate": 2.5462962962962966e-06,
"loss": 1.022,
"step": 645
},
{
"epoch": 0.7476851851851852,
"grad_norm": 0.20833688974380493,
"learning_rate": 2.5347222222222225e-06,
"loss": 1.0704,
"step": 646
},
{
"epoch": 0.7488425925925926,
"grad_norm": 0.20712623000144958,
"learning_rate": 2.5231481481481484e-06,
"loss": 1.075,
"step": 647
},
{
"epoch": 0.75,
"grad_norm": 0.2294331043958664,
"learning_rate": 2.511574074074074e-06,
"loss": 1.0206,
"step": 648
},
{
"epoch": 0.7511574074074074,
"grad_norm": 0.21636289358139038,
"learning_rate": 2.5e-06,
"loss": 1.1148,
"step": 649
},
{
"epoch": 0.7523148148148148,
"grad_norm": 0.20249076187610626,
"learning_rate": 2.488425925925926e-06,
"loss": 1.0555,
"step": 650
},
{
"epoch": 0.7534722222222222,
"grad_norm": 0.19626903533935547,
"learning_rate": 2.476851851851852e-06,
"loss": 1.0323,
"step": 651
},
{
"epoch": 0.7546296296296297,
"grad_norm": 0.2054426223039627,
"learning_rate": 2.465277777777778e-06,
"loss": 1.0264,
"step": 652
},
{
"epoch": 0.7557870370370371,
"grad_norm": 0.2102532982826233,
"learning_rate": 2.453703703703704e-06,
"loss": 1.0552,
"step": 653
},
{
"epoch": 0.7569444444444444,
"grad_norm": 0.20038294792175293,
"learning_rate": 2.4421296296296298e-06,
"loss": 1.0043,
"step": 654
},
{
"epoch": 0.7581018518518519,
"grad_norm": 0.20885545015335083,
"learning_rate": 2.4305555555555557e-06,
"loss": 1.0417,
"step": 655
},
{
"epoch": 0.7592592592592593,
"grad_norm": 0.20562267303466797,
"learning_rate": 2.4189814814814816e-06,
"loss": 0.9974,
"step": 656
},
{
"epoch": 0.7604166666666666,
"grad_norm": 0.21955883502960205,
"learning_rate": 2.4074074074074075e-06,
"loss": 1.0758,
"step": 657
},
{
"epoch": 0.7615740740740741,
"grad_norm": 0.29182884097099304,
"learning_rate": 2.395833333333334e-06,
"loss": 1.059,
"step": 658
},
{
"epoch": 0.7627314814814815,
"grad_norm": 0.20799702405929565,
"learning_rate": 2.3842592592592593e-06,
"loss": 0.9904,
"step": 659
},
{
"epoch": 0.7638888888888888,
"grad_norm": 0.19996783137321472,
"learning_rate": 2.3726851851851852e-06,
"loss": 1.0222,
"step": 660
},
{
"epoch": 0.7650462962962963,
"grad_norm": 0.20935088396072388,
"learning_rate": 2.361111111111111e-06,
"loss": 1.0626,
"step": 661
},
{
"epoch": 0.7662037037037037,
"grad_norm": 0.2047765702009201,
"learning_rate": 2.349537037037037e-06,
"loss": 1.0108,
"step": 662
},
{
"epoch": 0.7673611111111112,
"grad_norm": 0.22284026443958282,
"learning_rate": 2.3379629629629634e-06,
"loss": 1.0378,
"step": 663
},
{
"epoch": 0.7685185185185185,
"grad_norm": 0.2159646898508072,
"learning_rate": 2.3263888888888893e-06,
"loss": 1.0571,
"step": 664
},
{
"epoch": 0.7696759259259259,
"grad_norm": 0.21102292835712433,
"learning_rate": 2.314814814814815e-06,
"loss": 1.0601,
"step": 665
},
{
"epoch": 0.7708333333333334,
"grad_norm": 0.24397940933704376,
"learning_rate": 2.3032407407407407e-06,
"loss": 1.0433,
"step": 666
},
{
"epoch": 0.7719907407407407,
"grad_norm": 0.2021220624446869,
"learning_rate": 2.2916666666666666e-06,
"loss": 1.0553,
"step": 667
},
{
"epoch": 0.7731481481481481,
"grad_norm": 0.20541134476661682,
"learning_rate": 2.280092592592593e-06,
"loss": 1.0218,
"step": 668
},
{
"epoch": 0.7743055555555556,
"grad_norm": 0.2077786922454834,
"learning_rate": 2.268518518518519e-06,
"loss": 1.0973,
"step": 669
},
{
"epoch": 0.7754629629629629,
"grad_norm": 0.21917060017585754,
"learning_rate": 2.2569444444444448e-06,
"loss": 1.0308,
"step": 670
},
{
"epoch": 0.7766203703703703,
"grad_norm": 0.2081642746925354,
"learning_rate": 2.2453703703703707e-06,
"loss": 1.0731,
"step": 671
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.20313401520252228,
"learning_rate": 2.2337962962962966e-06,
"loss": 1.1032,
"step": 672
},
{
"epoch": 0.7789351851851852,
"grad_norm": 0.2097117006778717,
"learning_rate": 2.222222222222222e-06,
"loss": 1.0215,
"step": 673
},
{
"epoch": 0.7800925925925926,
"grad_norm": 0.20481272041797638,
"learning_rate": 2.2106481481481484e-06,
"loss": 1.0594,
"step": 674
},
{
"epoch": 0.78125,
"grad_norm": 0.21563556790351868,
"learning_rate": 2.1990740740740743e-06,
"loss": 0.996,
"step": 675
},
{
"epoch": 0.7824074074074074,
"grad_norm": 0.21871314942836761,
"learning_rate": 2.1875000000000002e-06,
"loss": 1.0673,
"step": 676
},
{
"epoch": 0.7835648148148148,
"grad_norm": 0.21116910874843597,
"learning_rate": 2.175925925925926e-06,
"loss": 1.0941,
"step": 677
},
{
"epoch": 0.7847222222222222,
"grad_norm": 0.20714472234249115,
"learning_rate": 2.164351851851852e-06,
"loss": 1.0641,
"step": 678
},
{
"epoch": 0.7858796296296297,
"grad_norm": 0.217753604054451,
"learning_rate": 2.152777777777778e-06,
"loss": 1.0157,
"step": 679
},
{
"epoch": 0.7870370370370371,
"grad_norm": 0.20730328559875488,
"learning_rate": 2.141203703703704e-06,
"loss": 1.0018,
"step": 680
},
{
"epoch": 0.7881944444444444,
"grad_norm": 0.21109417080879211,
"learning_rate": 2.1296296296296298e-06,
"loss": 0.9815,
"step": 681
},
{
"epoch": 0.7893518518518519,
"grad_norm": 0.221344456076622,
"learning_rate": 2.1180555555555557e-06,
"loss": 1.0124,
"step": 682
},
{
"epoch": 0.7905092592592593,
"grad_norm": 0.22139620780944824,
"learning_rate": 2.1064814814814816e-06,
"loss": 1.011,
"step": 683
},
{
"epoch": 0.7916666666666666,
"grad_norm": 0.21301385760307312,
"learning_rate": 2.0949074074074075e-06,
"loss": 1.0383,
"step": 684
},
{
"epoch": 0.7928240740740741,
"grad_norm": 0.20398671925067902,
"learning_rate": 2.0833333333333334e-06,
"loss": 1.0404,
"step": 685
},
{
"epoch": 0.7939814814814815,
"grad_norm": 0.2034938633441925,
"learning_rate": 2.0717592592592593e-06,
"loss": 1.0328,
"step": 686
},
{
"epoch": 0.7951388888888888,
"grad_norm": 0.22071969509124756,
"learning_rate": 2.0601851851851853e-06,
"loss": 0.9852,
"step": 687
},
{
"epoch": 0.7962962962962963,
"grad_norm": 0.20521746575832367,
"learning_rate": 2.048611111111111e-06,
"loss": 0.9691,
"step": 688
},
{
"epoch": 0.7974537037037037,
"grad_norm": 0.20581713318824768,
"learning_rate": 2.037037037037037e-06,
"loss": 1.0328,
"step": 689
},
{
"epoch": 0.7986111111111112,
"grad_norm": 0.20280128717422485,
"learning_rate": 2.0254629629629634e-06,
"loss": 0.9418,
"step": 690
},
{
"epoch": 0.7997685185185185,
"grad_norm": 0.20858825743198395,
"learning_rate": 2.0138888888888893e-06,
"loss": 1.04,
"step": 691
},
{
"epoch": 0.8009259259259259,
"grad_norm": 0.21282228827476501,
"learning_rate": 2.002314814814815e-06,
"loss": 1.0959,
"step": 692
},
{
"epoch": 0.8020833333333334,
"grad_norm": 0.21245089173316956,
"learning_rate": 1.9907407407407407e-06,
"loss": 1.0266,
"step": 693
},
{
"epoch": 0.8032407407407407,
"grad_norm": 0.21784378588199615,
"learning_rate": 1.9791666666666666e-06,
"loss": 1.0592,
"step": 694
},
{
"epoch": 0.8043981481481481,
"grad_norm": 0.22031240165233612,
"learning_rate": 1.967592592592593e-06,
"loss": 1.0245,
"step": 695
},
{
"epoch": 0.8055555555555556,
"grad_norm": 0.20002427697181702,
"learning_rate": 1.956018518518519e-06,
"loss": 1.0165,
"step": 696
},
{
"epoch": 0.8067129629629629,
"grad_norm": 0.20793163776397705,
"learning_rate": 1.944444444444445e-06,
"loss": 1.0229,
"step": 697
},
{
"epoch": 0.8078703703703703,
"grad_norm": 0.20959703624248505,
"learning_rate": 1.9328703703703707e-06,
"loss": 1.0051,
"step": 698
},
{
"epoch": 0.8090277777777778,
"grad_norm": 0.20717735588550568,
"learning_rate": 1.921296296296296e-06,
"loss": 0.961,
"step": 699
},
{
"epoch": 0.8101851851851852,
"grad_norm": 0.20636354386806488,
"learning_rate": 1.909722222222222e-06,
"loss": 1.0221,
"step": 700
},
{
"epoch": 0.8113425925925926,
"grad_norm": 0.2062767595052719,
"learning_rate": 1.8981481481481484e-06,
"loss": 1.0733,
"step": 701
},
{
"epoch": 0.8125,
"grad_norm": 0.21050766110420227,
"learning_rate": 1.8865740740740743e-06,
"loss": 0.9686,
"step": 702
},
{
"epoch": 0.8136574074074074,
"grad_norm": 0.21685615181922913,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.9892,
"step": 703
},
{
"epoch": 0.8148148148148148,
"grad_norm": 0.24662281572818756,
"learning_rate": 1.863425925925926e-06,
"loss": 1.0709,
"step": 704
},
{
"epoch": 0.8159722222222222,
"grad_norm": 0.20373214781284332,
"learning_rate": 1.8518518518518519e-06,
"loss": 1.025,
"step": 705
},
{
"epoch": 0.8171296296296297,
"grad_norm": 0.212567999958992,
"learning_rate": 1.840277777777778e-06,
"loss": 1.003,
"step": 706
},
{
"epoch": 0.8182870370370371,
"grad_norm": 0.20584706962108612,
"learning_rate": 1.828703703703704e-06,
"loss": 1.0821,
"step": 707
},
{
"epoch": 0.8194444444444444,
"grad_norm": 0.21041174232959747,
"learning_rate": 1.8171296296296298e-06,
"loss": 0.9875,
"step": 708
},
{
"epoch": 0.8206018518518519,
"grad_norm": 0.2137048840522766,
"learning_rate": 1.8055555555555557e-06,
"loss": 1.0338,
"step": 709
},
{
"epoch": 0.8217592592592593,
"grad_norm": 0.2729160785675049,
"learning_rate": 1.7939814814814816e-06,
"loss": 1.0076,
"step": 710
},
{
"epoch": 0.8229166666666666,
"grad_norm": 0.2052346020936966,
"learning_rate": 1.7824074074074073e-06,
"loss": 1.0193,
"step": 711
},
{
"epoch": 0.8240740740740741,
"grad_norm": 0.22163906693458557,
"learning_rate": 1.7708333333333337e-06,
"loss": 1.0455,
"step": 712
},
{
"epoch": 0.8252314814814815,
"grad_norm": 0.21060538291931152,
"learning_rate": 1.7592592592592594e-06,
"loss": 1.051,
"step": 713
},
{
"epoch": 0.8263888888888888,
"grad_norm": 0.21227489411830902,
"learning_rate": 1.7476851851851853e-06,
"loss": 1.0054,
"step": 714
},
{
"epoch": 0.8275462962962963,
"grad_norm": 0.2057754248380661,
"learning_rate": 1.7361111111111112e-06,
"loss": 1.0488,
"step": 715
},
{
"epoch": 0.8287037037037037,
"grad_norm": 0.21524278819561005,
"learning_rate": 1.724537037037037e-06,
"loss": 1.0146,
"step": 716
},
{
"epoch": 0.8298611111111112,
"grad_norm": 0.2293296605348587,
"learning_rate": 1.7129629629629632e-06,
"loss": 1.0471,
"step": 717
},
{
"epoch": 0.8310185185185185,
"grad_norm": 0.20466309785842896,
"learning_rate": 1.7013888888888891e-06,
"loss": 0.9884,
"step": 718
},
{
"epoch": 0.8321759259259259,
"grad_norm": 0.215054452419281,
"learning_rate": 1.689814814814815e-06,
"loss": 1.1239,
"step": 719
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.21369078755378723,
"learning_rate": 1.6782407407407408e-06,
"loss": 1.0047,
"step": 720
},
{
"epoch": 0.8344907407407407,
"grad_norm": 0.23838470876216888,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.9937,
"step": 721
},
{
"epoch": 0.8356481481481481,
"grad_norm": 0.20185384154319763,
"learning_rate": 1.6550925925925928e-06,
"loss": 0.9759,
"step": 722
},
{
"epoch": 0.8368055555555556,
"grad_norm": 0.20911818742752075,
"learning_rate": 1.6435185185185187e-06,
"loss": 1.0018,
"step": 723
},
{
"epoch": 0.8379629629629629,
"grad_norm": 0.20076234638690948,
"learning_rate": 1.6319444444444446e-06,
"loss": 0.9895,
"step": 724
},
{
"epoch": 0.8391203703703703,
"grad_norm": 0.21198947727680206,
"learning_rate": 1.6203703703703705e-06,
"loss": 0.9969,
"step": 725
},
{
"epoch": 0.8402777777777778,
"grad_norm": 0.20863303542137146,
"learning_rate": 1.6087962962962964e-06,
"loss": 1.0044,
"step": 726
},
{
"epoch": 0.8414351851851852,
"grad_norm": 0.20545004308223724,
"learning_rate": 1.5972222222222221e-06,
"loss": 1.0147,
"step": 727
},
{
"epoch": 0.8425925925925926,
"grad_norm": 0.2052348256111145,
"learning_rate": 1.5856481481481485e-06,
"loss": 0.9703,
"step": 728
},
{
"epoch": 0.84375,
"grad_norm": 0.2435932457447052,
"learning_rate": 1.5740740740740742e-06,
"loss": 1.0181,
"step": 729
},
{
"epoch": 0.8449074074074074,
"grad_norm": 0.22731885313987732,
"learning_rate": 1.5625e-06,
"loss": 1.1076,
"step": 730
},
{
"epoch": 0.8460648148148148,
"grad_norm": 0.208094984292984,
"learning_rate": 1.550925925925926e-06,
"loss": 1.0641,
"step": 731
},
{
"epoch": 0.8472222222222222,
"grad_norm": 0.20480291545391083,
"learning_rate": 1.539351851851852e-06,
"loss": 1.0031,
"step": 732
},
{
"epoch": 0.8483796296296297,
"grad_norm": 0.21013250946998596,
"learning_rate": 1.527777777777778e-06,
"loss": 1.0306,
"step": 733
},
{
"epoch": 0.8495370370370371,
"grad_norm": 0.211978942155838,
"learning_rate": 1.516203703703704e-06,
"loss": 1.0357,
"step": 734
},
{
"epoch": 0.8506944444444444,
"grad_norm": 0.22962923347949982,
"learning_rate": 1.5046296296296298e-06,
"loss": 1.0954,
"step": 735
},
{
"epoch": 0.8518518518518519,
"grad_norm": 0.2159680426120758,
"learning_rate": 1.4930555555555555e-06,
"loss": 1.0309,
"step": 736
},
{
"epoch": 0.8530092592592593,
"grad_norm": 0.20775794982910156,
"learning_rate": 1.4814814814814815e-06,
"loss": 0.9869,
"step": 737
},
{
"epoch": 0.8541666666666666,
"grad_norm": 0.22548414766788483,
"learning_rate": 1.4699074074074074e-06,
"loss": 1.1086,
"step": 738
},
{
"epoch": 0.8553240740740741,
"grad_norm": 0.21975237131118774,
"learning_rate": 1.4583333333333335e-06,
"loss": 1.053,
"step": 739
},
{
"epoch": 0.8564814814814815,
"grad_norm": 0.20786741375923157,
"learning_rate": 1.4467592592592594e-06,
"loss": 1.0756,
"step": 740
},
{
"epoch": 0.8576388888888888,
"grad_norm": 0.20583467185497284,
"learning_rate": 1.4351851851851853e-06,
"loss": 1.0741,
"step": 741
},
{
"epoch": 0.8587962962962963,
"grad_norm": 0.2188231199979782,
"learning_rate": 1.4236111111111112e-06,
"loss": 0.9667,
"step": 742
},
{
"epoch": 0.8599537037037037,
"grad_norm": 0.20682111382484436,
"learning_rate": 1.4120370370370371e-06,
"loss": 1.0409,
"step": 743
},
{
"epoch": 0.8611111111111112,
"grad_norm": 0.21626965701580048,
"learning_rate": 1.4004629629629633e-06,
"loss": 1.0366,
"step": 744
},
{
"epoch": 0.8622685185185185,
"grad_norm": 0.21385551989078522,
"learning_rate": 1.3888888888888892e-06,
"loss": 1.091,
"step": 745
},
{
"epoch": 0.8634259259259259,
"grad_norm": 0.2148781716823578,
"learning_rate": 1.3773148148148149e-06,
"loss": 1.0033,
"step": 746
},
{
"epoch": 0.8645833333333334,
"grad_norm": 0.21703305840492249,
"learning_rate": 1.3657407407407408e-06,
"loss": 1.0228,
"step": 747
},
{
"epoch": 0.8657407407407407,
"grad_norm": 0.20812486112117767,
"learning_rate": 1.3541666666666667e-06,
"loss": 1.0227,
"step": 748
},
{
"epoch": 0.8668981481481481,
"grad_norm": 0.20546585321426392,
"learning_rate": 1.3425925925925928e-06,
"loss": 1.0391,
"step": 749
},
{
"epoch": 0.8680555555555556,
"grad_norm": 0.26289820671081543,
"learning_rate": 1.3310185185185187e-06,
"loss": 1.0451,
"step": 750
},
{
"epoch": 0.8692129629629629,
"grad_norm": 0.262515127658844,
"learning_rate": 1.3194444444444446e-06,
"loss": 1.0597,
"step": 751
},
{
"epoch": 0.8703703703703703,
"grad_norm": 0.21458126604557037,
"learning_rate": 1.3078703703703705e-06,
"loss": 1.0574,
"step": 752
},
{
"epoch": 0.8715277777777778,
"grad_norm": 0.20759519934654236,
"learning_rate": 1.2962962962962962e-06,
"loss": 1.0697,
"step": 753
},
{
"epoch": 0.8726851851851852,
"grad_norm": 0.21060067415237427,
"learning_rate": 1.2847222222222222e-06,
"loss": 0.956,
"step": 754
},
{
"epoch": 0.8738425925925926,
"grad_norm": 0.2437879592180252,
"learning_rate": 1.2731481481481483e-06,
"loss": 0.9785,
"step": 755
},
{
"epoch": 0.875,
"grad_norm": 0.22363758087158203,
"learning_rate": 1.2615740740740742e-06,
"loss": 1.0751,
"step": 756
},
{
"epoch": 0.8761574074074074,
"grad_norm": 0.2119099348783493,
"learning_rate": 1.25e-06,
"loss": 1.0775,
"step": 757
},
{
"epoch": 0.8773148148148148,
"grad_norm": 0.2066664844751358,
"learning_rate": 1.238425925925926e-06,
"loss": 0.9457,
"step": 758
},
{
"epoch": 0.8784722222222222,
"grad_norm": 0.21327026188373566,
"learning_rate": 1.226851851851852e-06,
"loss": 1.0065,
"step": 759
},
{
"epoch": 0.8796296296296297,
"grad_norm": 0.22070683538913727,
"learning_rate": 1.2152777777777778e-06,
"loss": 1.0822,
"step": 760
},
{
"epoch": 0.8807870370370371,
"grad_norm": 0.5572031140327454,
"learning_rate": 1.2037037037037037e-06,
"loss": 1.0996,
"step": 761
},
{
"epoch": 0.8819444444444444,
"grad_norm": 0.20942085981369019,
"learning_rate": 1.1921296296296297e-06,
"loss": 1.0156,
"step": 762
},
{
"epoch": 0.8831018518518519,
"grad_norm": 0.20444652438163757,
"learning_rate": 1.1805555555555556e-06,
"loss": 1.0382,
"step": 763
},
{
"epoch": 0.8842592592592593,
"grad_norm": 0.2016180455684662,
"learning_rate": 1.1689814814814817e-06,
"loss": 1.0174,
"step": 764
},
{
"epoch": 0.8854166666666666,
"grad_norm": 0.20109869539737701,
"learning_rate": 1.1574074074074076e-06,
"loss": 1.0037,
"step": 765
},
{
"epoch": 0.8865740740740741,
"grad_norm": 0.23390746116638184,
"learning_rate": 1.1458333333333333e-06,
"loss": 1.0423,
"step": 766
},
{
"epoch": 0.8877314814814815,
"grad_norm": 0.21662883460521698,
"learning_rate": 1.1342592592592594e-06,
"loss": 1.03,
"step": 767
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.21152052283287048,
"learning_rate": 1.1226851851851853e-06,
"loss": 1.0689,
"step": 768
},
{
"epoch": 0.8900462962962963,
"grad_norm": 0.2138381153345108,
"learning_rate": 1.111111111111111e-06,
"loss": 0.9821,
"step": 769
},
{
"epoch": 0.8912037037037037,
"grad_norm": 0.2141958475112915,
"learning_rate": 1.0995370370370372e-06,
"loss": 0.9939,
"step": 770
},
{
"epoch": 0.8923611111111112,
"grad_norm": 0.21247626841068268,
"learning_rate": 1.087962962962963e-06,
"loss": 1.0159,
"step": 771
},
{
"epoch": 0.8935185185185185,
"grad_norm": 0.20598827302455902,
"learning_rate": 1.076388888888889e-06,
"loss": 1.0364,
"step": 772
},
{
"epoch": 0.8946759259259259,
"grad_norm": 0.2089841365814209,
"learning_rate": 1.0648148148148149e-06,
"loss": 1.0458,
"step": 773
},
{
"epoch": 0.8958333333333334,
"grad_norm": 0.20895791053771973,
"learning_rate": 1.0532407407407408e-06,
"loss": 1.0413,
"step": 774
},
{
"epoch": 0.8969907407407407,
"grad_norm": 0.22381016612052917,
"learning_rate": 1.0416666666666667e-06,
"loss": 0.9736,
"step": 775
},
{
"epoch": 0.8981481481481481,
"grad_norm": 0.20567692816257477,
"learning_rate": 1.0300925925925926e-06,
"loss": 0.954,
"step": 776
},
{
"epoch": 0.8993055555555556,
"grad_norm": 0.2209719866514206,
"learning_rate": 1.0185185185185185e-06,
"loss": 1.0772,
"step": 777
},
{
"epoch": 0.9004629629629629,
"grad_norm": 0.2152528315782547,
"learning_rate": 1.0069444444444447e-06,
"loss": 1.0277,
"step": 778
},
{
"epoch": 0.9016203703703703,
"grad_norm": 0.2000735104084015,
"learning_rate": 9.953703703703704e-07,
"loss": 1.0367,
"step": 779
},
{
"epoch": 0.9027777777777778,
"grad_norm": 0.2128438502550125,
"learning_rate": 9.837962962962965e-07,
"loss": 0.9917,
"step": 780
},
{
"epoch": 0.9039351851851852,
"grad_norm": 0.21334069967269897,
"learning_rate": 9.722222222222224e-07,
"loss": 1.0571,
"step": 781
},
{
"epoch": 0.9050925925925926,
"grad_norm": 0.20610938966274261,
"learning_rate": 9.60648148148148e-07,
"loss": 1.0239,
"step": 782
},
{
"epoch": 0.90625,
"grad_norm": 0.21400775015354156,
"learning_rate": 9.490740740740742e-07,
"loss": 1.057,
"step": 783
},
{
"epoch": 0.9074074074074074,
"grad_norm": 0.22360017895698547,
"learning_rate": 9.375000000000001e-07,
"loss": 1.0245,
"step": 784
},
{
"epoch": 0.9085648148148148,
"grad_norm": 0.22137287259101868,
"learning_rate": 9.259259259259259e-07,
"loss": 0.9877,
"step": 785
},
{
"epoch": 0.9097222222222222,
"grad_norm": 0.2224033772945404,
"learning_rate": 9.14351851851852e-07,
"loss": 1.1311,
"step": 786
},
{
"epoch": 0.9108796296296297,
"grad_norm": 0.20913951098918915,
"learning_rate": 9.027777777777779e-07,
"loss": 0.9691,
"step": 787
},
{
"epoch": 0.9120370370370371,
"grad_norm": 0.21408195793628693,
"learning_rate": 8.912037037037037e-07,
"loss": 1.0495,
"step": 788
},
{
"epoch": 0.9131944444444444,
"grad_norm": 0.21985873579978943,
"learning_rate": 8.796296296296297e-07,
"loss": 1.0578,
"step": 789
},
{
"epoch": 0.9143518518518519,
"grad_norm": 0.21710622310638428,
"learning_rate": 8.680555555555556e-07,
"loss": 1.0115,
"step": 790
},
{
"epoch": 0.9155092592592593,
"grad_norm": 0.2131613790988922,
"learning_rate": 8.564814814814816e-07,
"loss": 0.9411,
"step": 791
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.2729133665561676,
"learning_rate": 8.449074074074075e-07,
"loss": 1.0428,
"step": 792
},
{
"epoch": 0.9178240740740741,
"grad_norm": 0.21577362716197968,
"learning_rate": 8.333333333333333e-07,
"loss": 1.0285,
"step": 793
},
{
"epoch": 0.9189814814814815,
"grad_norm": 0.21308249235153198,
"learning_rate": 8.217592592592593e-07,
"loss": 0.9985,
"step": 794
},
{
"epoch": 0.9201388888888888,
"grad_norm": 0.21895642578601837,
"learning_rate": 8.101851851851853e-07,
"loss": 1.1138,
"step": 795
},
{
"epoch": 0.9212962962962963,
"grad_norm": 0.2174258530139923,
"learning_rate": 7.986111111111111e-07,
"loss": 0.9322,
"step": 796
},
{
"epoch": 0.9224537037037037,
"grad_norm": 0.20960073173046112,
"learning_rate": 7.870370370370371e-07,
"loss": 1.0716,
"step": 797
},
{
"epoch": 0.9236111111111112,
"grad_norm": 0.2102329581975937,
"learning_rate": 7.75462962962963e-07,
"loss": 1.0035,
"step": 798
},
{
"epoch": 0.9247685185185185,
"grad_norm": 0.22140781581401825,
"learning_rate": 7.63888888888889e-07,
"loss": 1.0538,
"step": 799
},
{
"epoch": 0.9259259259259259,
"grad_norm": 0.2097562849521637,
"learning_rate": 7.523148148148149e-07,
"loss": 1.0062,
"step": 800
},
{
"epoch": 0.9270833333333334,
"grad_norm": 0.21233950555324554,
"learning_rate": 7.407407407407407e-07,
"loss": 0.9928,
"step": 801
},
{
"epoch": 0.9282407407407407,
"grad_norm": 0.223008394241333,
"learning_rate": 7.291666666666667e-07,
"loss": 1.0258,
"step": 802
},
{
"epoch": 0.9293981481481481,
"grad_norm": 0.21271130442619324,
"learning_rate": 7.175925925925927e-07,
"loss": 1.0305,
"step": 803
},
{
"epoch": 0.9305555555555556,
"grad_norm": 0.2107594758272171,
"learning_rate": 7.060185185185186e-07,
"loss": 0.9713,
"step": 804
},
{
"epoch": 0.9317129629629629,
"grad_norm": 0.21259775757789612,
"learning_rate": 6.944444444444446e-07,
"loss": 1.0238,
"step": 805
},
{
"epoch": 0.9328703703703703,
"grad_norm": 0.22627367079257965,
"learning_rate": 6.828703703703704e-07,
"loss": 1.0147,
"step": 806
},
{
"epoch": 0.9340277777777778,
"grad_norm": 0.22005265951156616,
"learning_rate": 6.712962962962964e-07,
"loss": 0.9688,
"step": 807
},
{
"epoch": 0.9351851851851852,
"grad_norm": 0.198886439204216,
"learning_rate": 6.597222222222223e-07,
"loss": 1.0142,
"step": 808
},
{
"epoch": 0.9363425925925926,
"grad_norm": 0.21684283018112183,
"learning_rate": 6.481481481481481e-07,
"loss": 0.9538,
"step": 809
},
{
"epoch": 0.9375,
"grad_norm": 0.2184874415397644,
"learning_rate": 6.365740740740741e-07,
"loss": 1.0073,
"step": 810
},
{
"epoch": 0.9386574074074074,
"grad_norm": 0.2754838764667511,
"learning_rate": 6.25e-07,
"loss": 0.9981,
"step": 811
},
{
"epoch": 0.9398148148148148,
"grad_norm": 0.21016815304756165,
"learning_rate": 6.13425925925926e-07,
"loss": 1.0259,
"step": 812
},
{
"epoch": 0.9409722222222222,
"grad_norm": 0.209848091006279,
"learning_rate": 6.018518518518519e-07,
"loss": 1.0265,
"step": 813
},
{
"epoch": 0.9421296296296297,
"grad_norm": 0.2134370058774948,
"learning_rate": 5.902777777777778e-07,
"loss": 1.0218,
"step": 814
},
{
"epoch": 0.9432870370370371,
"grad_norm": 0.21250751614570618,
"learning_rate": 5.787037037037038e-07,
"loss": 1.0203,
"step": 815
},
{
"epoch": 0.9444444444444444,
"grad_norm": 0.21034426987171173,
"learning_rate": 5.671296296296297e-07,
"loss": 1.0716,
"step": 816
},
{
"epoch": 0.9456018518518519,
"grad_norm": 0.20719684660434723,
"learning_rate": 5.555555555555555e-07,
"loss": 0.9431,
"step": 817
},
{
"epoch": 0.9467592592592593,
"grad_norm": 0.20388469099998474,
"learning_rate": 5.439814814814815e-07,
"loss": 1.0351,
"step": 818
},
{
"epoch": 0.9479166666666666,
"grad_norm": 0.20847779512405396,
"learning_rate": 5.324074074074074e-07,
"loss": 1.1189,
"step": 819
},
{
"epoch": 0.9490740740740741,
"grad_norm": 0.22145158052444458,
"learning_rate": 5.208333333333334e-07,
"loss": 1.067,
"step": 820
},
{
"epoch": 0.9502314814814815,
"grad_norm": 0.21092523634433746,
"learning_rate": 5.092592592592593e-07,
"loss": 1.0551,
"step": 821
},
{
"epoch": 0.9513888888888888,
"grad_norm": 0.20877982676029205,
"learning_rate": 4.976851851851852e-07,
"loss": 1.052,
"step": 822
},
{
"epoch": 0.9525462962962963,
"grad_norm": 0.21412430703639984,
"learning_rate": 4.861111111111112e-07,
"loss": 1.0205,
"step": 823
},
{
"epoch": 0.9537037037037037,
"grad_norm": 0.21050423383712769,
"learning_rate": 4.745370370370371e-07,
"loss": 0.9803,
"step": 824
},
{
"epoch": 0.9548611111111112,
"grad_norm": 0.21291252970695496,
"learning_rate": 4.6296296296296297e-07,
"loss": 1.0225,
"step": 825
},
{
"epoch": 0.9560185185185185,
"grad_norm": 0.20651710033416748,
"learning_rate": 4.5138888888888893e-07,
"loss": 1.0051,
"step": 826
},
{
"epoch": 0.9571759259259259,
"grad_norm": 0.31924715638160706,
"learning_rate": 4.3981481481481484e-07,
"loss": 0.9834,
"step": 827
},
{
"epoch": 0.9583333333333334,
"grad_norm": 0.21879231929779053,
"learning_rate": 4.282407407407408e-07,
"loss": 1.0335,
"step": 828
},
{
"epoch": 0.9594907407407407,
"grad_norm": 0.20607353746891022,
"learning_rate": 4.1666666666666667e-07,
"loss": 1.0727,
"step": 829
},
{
"epoch": 0.9606481481481481,
"grad_norm": 0.2167244255542755,
"learning_rate": 4.0509259259259263e-07,
"loss": 1.0587,
"step": 830
},
{
"epoch": 0.9618055555555556,
"grad_norm": 0.23660607635974884,
"learning_rate": 3.9351851851851854e-07,
"loss": 1.0287,
"step": 831
},
{
"epoch": 0.9629629629629629,
"grad_norm": 0.21072295308113098,
"learning_rate": 3.819444444444445e-07,
"loss": 1.0098,
"step": 832
},
{
"epoch": 0.9641203703703703,
"grad_norm": 0.2014254480600357,
"learning_rate": 3.7037037037037036e-07,
"loss": 0.9951,
"step": 833
},
{
"epoch": 0.9652777777777778,
"grad_norm": 0.20475035905838013,
"learning_rate": 3.5879629629629633e-07,
"loss": 0.953,
"step": 834
},
{
"epoch": 0.9664351851851852,
"grad_norm": 0.2787926495075226,
"learning_rate": 3.472222222222223e-07,
"loss": 1.0162,
"step": 835
},
{
"epoch": 0.9675925925925926,
"grad_norm": 0.21481136977672577,
"learning_rate": 3.356481481481482e-07,
"loss": 0.9762,
"step": 836
},
{
"epoch": 0.96875,
"grad_norm": 0.25820887088775635,
"learning_rate": 3.2407407407407406e-07,
"loss": 1.0382,
"step": 837
},
{
"epoch": 0.9699074074074074,
"grad_norm": 0.20729532837867737,
"learning_rate": 3.125e-07,
"loss": 1.041,
"step": 838
},
{
"epoch": 0.9710648148148148,
"grad_norm": 0.23142774403095245,
"learning_rate": 3.0092592592592594e-07,
"loss": 1.0654,
"step": 839
},
{
"epoch": 0.9722222222222222,
"grad_norm": 0.2782406210899353,
"learning_rate": 2.893518518518519e-07,
"loss": 0.9969,
"step": 840
},
{
"epoch": 0.9733796296296297,
"grad_norm": 0.2658650279045105,
"learning_rate": 2.7777777777777776e-07,
"loss": 1.122,
"step": 841
},
{
"epoch": 0.9745370370370371,
"grad_norm": 0.21585489809513092,
"learning_rate": 2.662037037037037e-07,
"loss": 0.9342,
"step": 842
},
{
"epoch": 0.9756944444444444,
"grad_norm": 0.20729321241378784,
"learning_rate": 2.5462962962962963e-07,
"loss": 0.9905,
"step": 843
},
{
"epoch": 0.9768518518518519,
"grad_norm": 0.20832544565200806,
"learning_rate": 2.430555555555556e-07,
"loss": 1.0455,
"step": 844
},
{
"epoch": 0.9780092592592593,
"grad_norm": 0.20782917737960815,
"learning_rate": 2.3148148148148148e-07,
"loss": 1.0264,
"step": 845
},
{
"epoch": 0.9791666666666666,
"grad_norm": 0.2169673889875412,
"learning_rate": 2.1990740740740742e-07,
"loss": 1.0178,
"step": 846
},
{
"epoch": 0.9803240740740741,
"grad_norm": 0.20999179780483246,
"learning_rate": 2.0833333333333333e-07,
"loss": 0.9974,
"step": 847
},
{
"epoch": 0.9814814814814815,
"grad_norm": 0.21232061088085175,
"learning_rate": 1.9675925925925927e-07,
"loss": 1.0486,
"step": 848
},
{
"epoch": 0.9826388888888888,
"grad_norm": 0.21031907200813293,
"learning_rate": 1.8518518518518518e-07,
"loss": 0.9641,
"step": 849
},
{
"epoch": 0.9837962962962963,
"grad_norm": 0.22616001963615417,
"learning_rate": 1.7361111111111115e-07,
"loss": 0.9608,
"step": 850
},
{
"epoch": 0.9849537037037037,
"grad_norm": 0.21996359527111053,
"learning_rate": 1.6203703703703703e-07,
"loss": 1.0732,
"step": 851
},
{
"epoch": 0.9861111111111112,
"grad_norm": 0.24073836207389832,
"learning_rate": 1.5046296296296297e-07,
"loss": 1.0632,
"step": 852
},
{
"epoch": 0.9872685185185185,
"grad_norm": 0.21955423057079315,
"learning_rate": 1.3888888888888888e-07,
"loss": 1.0028,
"step": 853
},
{
"epoch": 0.9884259259259259,
"grad_norm": 0.211786687374115,
"learning_rate": 1.2731481481481482e-07,
"loss": 1.0355,
"step": 854
},
{
"epoch": 0.9895833333333334,
"grad_norm": 0.21731968224048615,
"learning_rate": 1.1574074074074074e-07,
"loss": 1.041,
"step": 855
},
{
"epoch": 0.9907407407407407,
"grad_norm": 0.27528902888298035,
"learning_rate": 1.0416666666666667e-07,
"loss": 0.9799,
"step": 856
},
{
"epoch": 0.9918981481481481,
"grad_norm": 0.20832359790802002,
"learning_rate": 9.259259259259259e-08,
"loss": 0.9987,
"step": 857
},
{
"epoch": 0.9930555555555556,
"grad_norm": 0.22494304180145264,
"learning_rate": 8.101851851851852e-08,
"loss": 1.02,
"step": 858
},
{
"epoch": 0.9942129629629629,
"grad_norm": 0.2141268104314804,
"learning_rate": 6.944444444444444e-08,
"loss": 1.0446,
"step": 859
},
{
"epoch": 0.9953703703703703,
"grad_norm": 0.21849359571933746,
"learning_rate": 5.787037037037037e-08,
"loss": 1.1139,
"step": 860
},
{
"epoch": 0.9965277777777778,
"grad_norm": 0.21971233189105988,
"learning_rate": 4.6296296296296295e-08,
"loss": 1.021,
"step": 861
},
{
"epoch": 0.9976851851851852,
"grad_norm": 0.21371498703956604,
"learning_rate": 3.472222222222222e-08,
"loss": 1.0028,
"step": 862
},
{
"epoch": 0.9988425925925926,
"grad_norm": 0.21930085122585297,
"learning_rate": 2.3148148148148148e-08,
"loss": 1.0663,
"step": 863
},
{
"epoch": 1.0,
"grad_norm": 0.2178175002336502,
"learning_rate": 1.1574074074074074e-08,
"loss": 0.9601,
"step": 864
}
],
"logging_steps": 1.0,
"max_steps": 864,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2938073541516984e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}