{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 170,
"global_step": 170,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0058823529411764705,
"grad_norm": 0.740064799785614,
"learning_rate": 1e-05,
"loss": 2.4395,
"step": 1
},
{
"epoch": 0.011764705882352941,
"grad_norm": 0.7219232320785522,
"learning_rate": 9.941176470588236e-06,
"loss": 2.3902,
"step": 2
},
{
"epoch": 0.01764705882352941,
"grad_norm": 0.77315753698349,
"learning_rate": 9.882352941176472e-06,
"loss": 2.4516,
"step": 3
},
{
"epoch": 0.023529411764705882,
"grad_norm": 0.7578166127204895,
"learning_rate": 9.823529411764706e-06,
"loss": 2.4148,
"step": 4
},
{
"epoch": 0.029411764705882353,
"grad_norm": 0.7205833196640015,
"learning_rate": 9.764705882352942e-06,
"loss": 2.3372,
"step": 5
},
{
"epoch": 0.03529411764705882,
"grad_norm": 0.7160035967826843,
"learning_rate": 9.705882352941177e-06,
"loss": 2.2849,
"step": 6
},
{
"epoch": 0.041176470588235294,
"grad_norm": 0.8269237875938416,
"learning_rate": 9.647058823529412e-06,
"loss": 2.3719,
"step": 7
},
{
"epoch": 0.047058823529411764,
"grad_norm": 0.7316713333129883,
"learning_rate": 9.588235294117649e-06,
"loss": 2.2175,
"step": 8
},
{
"epoch": 0.052941176470588235,
"grad_norm": 0.7852907776832581,
"learning_rate": 9.529411764705882e-06,
"loss": 2.2489,
"step": 9
},
{
"epoch": 0.058823529411764705,
"grad_norm": 0.7100040316581726,
"learning_rate": 9.470588235294119e-06,
"loss": 2.1828,
"step": 10
},
{
"epoch": 0.06470588235294118,
"grad_norm": 0.6905198693275452,
"learning_rate": 9.411764705882354e-06,
"loss": 2.1709,
"step": 11
},
{
"epoch": 0.07058823529411765,
"grad_norm": 0.6189457774162292,
"learning_rate": 9.352941176470589e-06,
"loss": 2.1152,
"step": 12
},
{
"epoch": 0.07647058823529412,
"grad_norm": 0.5859349370002747,
"learning_rate": 9.294117647058824e-06,
"loss": 2.0362,
"step": 13
},
{
"epoch": 0.08235294117647059,
"grad_norm": 0.6242568492889404,
"learning_rate": 9.23529411764706e-06,
"loss": 2.0808,
"step": 14
},
{
"epoch": 0.08823529411764706,
"grad_norm": 0.6139904856681824,
"learning_rate": 9.176470588235294e-06,
"loss": 2.017,
"step": 15
},
{
"epoch": 0.09411764705882353,
"grad_norm": 0.6155012249946594,
"learning_rate": 9.11764705882353e-06,
"loss": 2.0315,
"step": 16
},
{
"epoch": 0.1,
"grad_norm": 0.6213613152503967,
"learning_rate": 9.058823529411765e-06,
"loss": 1.9902,
"step": 17
},
{
"epoch": 0.10588235294117647,
"grad_norm": 0.584740936756134,
"learning_rate": 9e-06,
"loss": 1.9679,
"step": 18
},
{
"epoch": 0.11176470588235295,
"grad_norm": 0.5694301128387451,
"learning_rate": 8.941176470588237e-06,
"loss": 1.9416,
"step": 19
},
{
"epoch": 0.11764705882352941,
"grad_norm": 0.5494748950004578,
"learning_rate": 8.88235294117647e-06,
"loss": 1.9129,
"step": 20
},
{
"epoch": 0.12352941176470589,
"grad_norm": 0.5430072546005249,
"learning_rate": 8.823529411764707e-06,
"loss": 1.89,
"step": 21
},
{
"epoch": 0.12941176470588237,
"grad_norm": 0.5303496718406677,
"learning_rate": 8.764705882352942e-06,
"loss": 1.8751,
"step": 22
},
{
"epoch": 0.13529411764705881,
"grad_norm": 0.5339208841323853,
"learning_rate": 8.705882352941177e-06,
"loss": 1.8598,
"step": 23
},
{
"epoch": 0.1411764705882353,
"grad_norm": 0.5348221659660339,
"learning_rate": 8.647058823529413e-06,
"loss": 1.8426,
"step": 24
},
{
"epoch": 0.14705882352941177,
"grad_norm": 0.4850575923919678,
"learning_rate": 8.588235294117647e-06,
"loss": 1.8126,
"step": 25
},
{
"epoch": 0.15294117647058825,
"grad_norm": 0.5005661845207214,
"learning_rate": 8.529411764705883e-06,
"loss": 1.8054,
"step": 26
},
{
"epoch": 0.1588235294117647,
"grad_norm": 0.47416189312934875,
"learning_rate": 8.470588235294118e-06,
"loss": 1.7775,
"step": 27
},
{
"epoch": 0.16470588235294117,
"grad_norm": 0.49917134642601013,
"learning_rate": 8.411764705882353e-06,
"loss": 1.7834,
"step": 28
},
{
"epoch": 0.17058823529411765,
"grad_norm": 0.4690726101398468,
"learning_rate": 8.35294117647059e-06,
"loss": 1.769,
"step": 29
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.4899074137210846,
"learning_rate": 8.294117647058825e-06,
"loss": 1.7534,
"step": 30
},
{
"epoch": 0.18235294117647058,
"grad_norm": 0.4322926104068756,
"learning_rate": 8.23529411764706e-06,
"loss": 1.7127,
"step": 31
},
{
"epoch": 0.18823529411764706,
"grad_norm": 0.4963333010673523,
"learning_rate": 8.176470588235295e-06,
"loss": 1.7316,
"step": 32
},
{
"epoch": 0.19411764705882353,
"grad_norm": 0.4416678547859192,
"learning_rate": 8.11764705882353e-06,
"loss": 1.6911,
"step": 33
},
{
"epoch": 0.2,
"grad_norm": 0.44732019305229187,
"learning_rate": 8.058823529411766e-06,
"loss": 1.6832,
"step": 34
},
{
"epoch": 0.20588235294117646,
"grad_norm": 0.4325319528579712,
"learning_rate": 8.000000000000001e-06,
"loss": 1.6849,
"step": 35
},
{
"epoch": 0.21176470588235294,
"grad_norm": 0.4243956506252289,
"learning_rate": 7.941176470588236e-06,
"loss": 1.6471,
"step": 36
},
{
"epoch": 0.21764705882352942,
"grad_norm": 0.41187071800231934,
"learning_rate": 7.882352941176471e-06,
"loss": 1.654,
"step": 37
},
{
"epoch": 0.2235294117647059,
"grad_norm": 0.40401241183280945,
"learning_rate": 7.823529411764706e-06,
"loss": 1.644,
"step": 38
},
{
"epoch": 0.22941176470588234,
"grad_norm": 0.4079605042934418,
"learning_rate": 7.764705882352941e-06,
"loss": 1.6209,
"step": 39
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.37295785546302795,
"learning_rate": 7.705882352941178e-06,
"loss": 1.6111,
"step": 40
},
{
"epoch": 0.2411764705882353,
"grad_norm": 0.37890729308128357,
"learning_rate": 7.647058823529411e-06,
"loss": 1.6122,
"step": 41
},
{
"epoch": 0.24705882352941178,
"grad_norm": 0.3897000849246979,
"learning_rate": 7.588235294117648e-06,
"loss": 1.594,
"step": 42
},
{
"epoch": 0.2529411764705882,
"grad_norm": 0.37150734663009644,
"learning_rate": 7.529411764705883e-06,
"loss": 1.5683,
"step": 43
},
{
"epoch": 0.25882352941176473,
"grad_norm": 0.3686462342739105,
"learning_rate": 7.4705882352941185e-06,
"loss": 1.5578,
"step": 44
},
{
"epoch": 0.2647058823529412,
"grad_norm": 0.3615223467350006,
"learning_rate": 7.4117647058823535e-06,
"loss": 1.5553,
"step": 45
},
{
"epoch": 0.27058823529411763,
"grad_norm": 0.341239333152771,
"learning_rate": 7.352941176470589e-06,
"loss": 1.5504,
"step": 46
},
{
"epoch": 0.27647058823529413,
"grad_norm": 0.32972443103790283,
"learning_rate": 7.294117647058823e-06,
"loss": 1.5523,
"step": 47
},
{
"epoch": 0.2823529411764706,
"grad_norm": 0.3313795328140259,
"learning_rate": 7.235294117647059e-06,
"loss": 1.5367,
"step": 48
},
{
"epoch": 0.28823529411764703,
"grad_norm": 0.3319094479084015,
"learning_rate": 7.176470588235295e-06,
"loss": 1.5233,
"step": 49
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.3231871426105499,
"learning_rate": 7.11764705882353e-06,
"loss": 1.5064,
"step": 50
},
{
"epoch": 0.3,
"grad_norm": 0.3074081838130951,
"learning_rate": 7.058823529411766e-06,
"loss": 1.4804,
"step": 51
},
{
"epoch": 0.3058823529411765,
"grad_norm": 0.329453706741333,
"learning_rate": 7e-06,
"loss": 1.5033,
"step": 52
},
{
"epoch": 0.31176470588235294,
"grad_norm": 0.3119613826274872,
"learning_rate": 6.941176470588236e-06,
"loss": 1.4898,
"step": 53
},
{
"epoch": 0.3176470588235294,
"grad_norm": 0.31654036045074463,
"learning_rate": 6.8823529411764715e-06,
"loss": 1.4599,
"step": 54
},
{
"epoch": 0.3235294117647059,
"grad_norm": 0.29753053188323975,
"learning_rate": 6.8235294117647065e-06,
"loss": 1.4625,
"step": 55
},
{
"epoch": 0.32941176470588235,
"grad_norm": 0.30820533633232117,
"learning_rate": 6.764705882352942e-06,
"loss": 1.4759,
"step": 56
},
{
"epoch": 0.3352941176470588,
"grad_norm": 0.29135259985923767,
"learning_rate": 6.705882352941176e-06,
"loss": 1.4699,
"step": 57
},
{
"epoch": 0.3411764705882353,
"grad_norm": 0.2927163243293762,
"learning_rate": 6.647058823529412e-06,
"loss": 1.4428,
"step": 58
},
{
"epoch": 0.34705882352941175,
"grad_norm": 0.3006676137447357,
"learning_rate": 6.588235294117647e-06,
"loss": 1.451,
"step": 59
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.29078030586242676,
"learning_rate": 6.529411764705883e-06,
"loss": 1.4352,
"step": 60
},
{
"epoch": 0.3588235294117647,
"grad_norm": 0.28280261158943176,
"learning_rate": 6.470588235294119e-06,
"loss": 1.4295,
"step": 61
},
{
"epoch": 0.36470588235294116,
"grad_norm": 0.3001053035259247,
"learning_rate": 6.411764705882354e-06,
"loss": 1.4375,
"step": 62
},
{
"epoch": 0.37058823529411766,
"grad_norm": 0.28294065594673157,
"learning_rate": 6.352941176470589e-06,
"loss": 1.4144,
"step": 63
},
{
"epoch": 0.3764705882352941,
"grad_norm": 0.2832286059856415,
"learning_rate": 6.294117647058824e-06,
"loss": 1.4207,
"step": 64
},
{
"epoch": 0.38235294117647056,
"grad_norm": 0.2754327952861786,
"learning_rate": 6.2352941176470595e-06,
"loss": 1.4362,
"step": 65
},
{
"epoch": 0.38823529411764707,
"grad_norm": 0.28400981426239014,
"learning_rate": 6.176470588235295e-06,
"loss": 1.382,
"step": 66
},
{
"epoch": 0.3941176470588235,
"grad_norm": 0.2783932387828827,
"learning_rate": 6.11764705882353e-06,
"loss": 1.4018,
"step": 67
},
{
"epoch": 0.4,
"grad_norm": 0.270181268453598,
"learning_rate": 6.058823529411765e-06,
"loss": 1.4002,
"step": 68
},
{
"epoch": 0.40588235294117647,
"grad_norm": 0.28010931611061096,
"learning_rate": 6e-06,
"loss": 1.3927,
"step": 69
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.28210070729255676,
"learning_rate": 5.941176470588236e-06,
"loss": 1.3775,
"step": 70
},
{
"epoch": 0.4176470588235294,
"grad_norm": 0.26174265146255493,
"learning_rate": 5.882352941176471e-06,
"loss": 1.3791,
"step": 71
},
{
"epoch": 0.4235294117647059,
"grad_norm": 0.2730426788330078,
"learning_rate": 5.823529411764707e-06,
"loss": 1.3865,
"step": 72
},
{
"epoch": 0.4294117647058823,
"grad_norm": 0.25816625356674194,
"learning_rate": 5.764705882352941e-06,
"loss": 1.357,
"step": 73
},
{
"epoch": 0.43529411764705883,
"grad_norm": 0.25862398743629456,
"learning_rate": 5.705882352941177e-06,
"loss": 1.3597,
"step": 74
},
{
"epoch": 0.4411764705882353,
"grad_norm": 0.2514458894729614,
"learning_rate": 5.6470588235294125e-06,
"loss": 1.3971,
"step": 75
},
{
"epoch": 0.4470588235294118,
"grad_norm": 0.2639279067516327,
"learning_rate": 5.588235294117647e-06,
"loss": 1.3693,
"step": 76
},
{
"epoch": 0.45294117647058824,
"grad_norm": 0.26090630888938904,
"learning_rate": 5.529411764705883e-06,
"loss": 1.3681,
"step": 77
},
{
"epoch": 0.4588235294117647,
"grad_norm": 0.2618473470211029,
"learning_rate": 5.470588235294119e-06,
"loss": 1.3568,
"step": 78
},
{
"epoch": 0.4647058823529412,
"grad_norm": 0.25189754366874695,
"learning_rate": 5.411764705882353e-06,
"loss": 1.3628,
"step": 79
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.2481844574213028,
"learning_rate": 5.352941176470589e-06,
"loss": 1.3382,
"step": 80
},
{
"epoch": 0.4764705882352941,
"grad_norm": 0.24728593230247498,
"learning_rate": 5.294117647058824e-06,
"loss": 1.3288,
"step": 81
},
{
"epoch": 0.4823529411764706,
"grad_norm": 0.25381624698638916,
"learning_rate": 5.23529411764706e-06,
"loss": 1.3215,
"step": 82
},
{
"epoch": 0.48823529411764705,
"grad_norm": 0.2516557276248932,
"learning_rate": 5.176470588235295e-06,
"loss": 1.3264,
"step": 83
},
{
"epoch": 0.49411764705882355,
"grad_norm": 0.24683943390846252,
"learning_rate": 5.11764705882353e-06,
"loss": 1.3244,
"step": 84
},
{
"epoch": 0.5,
"grad_norm": 0.24650059640407562,
"learning_rate": 5.058823529411765e-06,
"loss": 1.3259,
"step": 85
},
{
"epoch": 0.5058823529411764,
"grad_norm": 0.2529411017894745,
"learning_rate": 5e-06,
"loss": 1.3313,
"step": 86
},
{
"epoch": 0.5117647058823529,
"grad_norm": 0.2540332078933716,
"learning_rate": 4.941176470588236e-06,
"loss": 1.33,
"step": 87
},
{
"epoch": 0.5176470588235295,
"grad_norm": 0.25214681029319763,
"learning_rate": 4.882352941176471e-06,
"loss": 1.2992,
"step": 88
},
{
"epoch": 0.5235294117647059,
"grad_norm": 0.27215129137039185,
"learning_rate": 4.823529411764706e-06,
"loss": 1.3119,
"step": 89
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.2611463665962219,
"learning_rate": 4.764705882352941e-06,
"loss": 1.3265,
"step": 90
},
{
"epoch": 0.5352941176470588,
"grad_norm": 0.2502508759498596,
"learning_rate": 4.705882352941177e-06,
"loss": 1.2926,
"step": 91
},
{
"epoch": 0.5411764705882353,
"grad_norm": 0.26345929503440857,
"learning_rate": 4.647058823529412e-06,
"loss": 1.2975,
"step": 92
},
{
"epoch": 0.5470588235294118,
"grad_norm": 0.2609890401363373,
"learning_rate": 4.588235294117647e-06,
"loss": 1.2921,
"step": 93
},
{
"epoch": 0.5529411764705883,
"grad_norm": 0.2622078061103821,
"learning_rate": 4.529411764705883e-06,
"loss": 1.3016,
"step": 94
},
{
"epoch": 0.5588235294117647,
"grad_norm": 0.2562355101108551,
"learning_rate": 4.4705882352941184e-06,
"loss": 1.2908,
"step": 95
},
{
"epoch": 0.5647058823529412,
"grad_norm": 0.25484997034072876,
"learning_rate": 4.411764705882353e-06,
"loss": 1.3199,
"step": 96
},
{
"epoch": 0.5705882352941176,
"grad_norm": 0.25862494111061096,
"learning_rate": 4.352941176470588e-06,
"loss": 1.2855,
"step": 97
},
{
"epoch": 0.5764705882352941,
"grad_norm": 0.27047714591026306,
"learning_rate": 4.294117647058823e-06,
"loss": 1.3165,
"step": 98
},
{
"epoch": 0.5823529411764706,
"grad_norm": 0.2632170021533966,
"learning_rate": 4.235294117647059e-06,
"loss": 1.2912,
"step": 99
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.25326305627822876,
"learning_rate": 4.176470588235295e-06,
"loss": 1.3053,
"step": 100
},
{
"epoch": 0.5941176470588235,
"grad_norm": 0.26147395372390747,
"learning_rate": 4.11764705882353e-06,
"loss": 1.2973,
"step": 101
},
{
"epoch": 0.6,
"grad_norm": 0.26799634099006653,
"learning_rate": 4.058823529411765e-06,
"loss": 1.2794,
"step": 102
},
{
"epoch": 0.6058823529411764,
"grad_norm": 0.2632071077823639,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2867,
"step": 103
},
{
"epoch": 0.611764705882353,
"grad_norm": 0.27080872654914856,
"learning_rate": 3.941176470588236e-06,
"loss": 1.277,
"step": 104
},
{
"epoch": 0.6176470588235294,
"grad_norm": 0.2697356939315796,
"learning_rate": 3.882352941176471e-06,
"loss": 1.2697,
"step": 105
},
{
"epoch": 0.6235294117647059,
"grad_norm": 0.27979159355163574,
"learning_rate": 3.8235294117647055e-06,
"loss": 1.2746,
"step": 106
},
{
"epoch": 0.6294117647058823,
"grad_norm": 0.2690213620662689,
"learning_rate": 3.7647058823529414e-06,
"loss": 1.2734,
"step": 107
},
{
"epoch": 0.6352941176470588,
"grad_norm": 0.27870768308639526,
"learning_rate": 3.7058823529411767e-06,
"loss": 1.2707,
"step": 108
},
{
"epoch": 0.6411764705882353,
"grad_norm": 0.29579660296440125,
"learning_rate": 3.6470588235294117e-06,
"loss": 1.2616,
"step": 109
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.2851077914237976,
"learning_rate": 3.5882352941176475e-06,
"loss": 1.2591,
"step": 110
},
{
"epoch": 0.6529411764705882,
"grad_norm": 0.307041198015213,
"learning_rate": 3.529411764705883e-06,
"loss": 1.2522,
"step": 111
},
{
"epoch": 0.6588235294117647,
"grad_norm": 0.29607197642326355,
"learning_rate": 3.470588235294118e-06,
"loss": 1.2831,
"step": 112
},
{
"epoch": 0.6647058823529411,
"grad_norm": 0.29029569029808044,
"learning_rate": 3.4117647058823532e-06,
"loss": 1.2539,
"step": 113
},
{
"epoch": 0.6705882352941176,
"grad_norm": 0.28268927335739136,
"learning_rate": 3.352941176470588e-06,
"loss": 1.2652,
"step": 114
},
{
"epoch": 0.6764705882352942,
"grad_norm": 0.28747496008872986,
"learning_rate": 3.2941176470588236e-06,
"loss": 1.2394,
"step": 115
},
{
"epoch": 0.6823529411764706,
"grad_norm": 0.2939983904361725,
"learning_rate": 3.2352941176470594e-06,
"loss": 1.2639,
"step": 116
},
{
"epoch": 0.6882352941176471,
"grad_norm": 0.2975703179836273,
"learning_rate": 3.1764705882352943e-06,
"loss": 1.2762,
"step": 117
},
{
"epoch": 0.6941176470588235,
"grad_norm": 0.2900603413581848,
"learning_rate": 3.1176470588235297e-06,
"loss": 1.2623,
"step": 118
},
{
"epoch": 0.7,
"grad_norm": 0.2925064265727997,
"learning_rate": 3.058823529411765e-06,
"loss": 1.27,
"step": 119
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.2913402318954468,
"learning_rate": 3e-06,
"loss": 1.2558,
"step": 120
},
{
"epoch": 0.711764705882353,
"grad_norm": 0.3211301863193512,
"learning_rate": 2.9411764705882355e-06,
"loss": 1.2397,
"step": 121
},
{
"epoch": 0.7176470588235294,
"grad_norm": 0.3004200756549835,
"learning_rate": 2.8823529411764704e-06,
"loss": 1.2627,
"step": 122
},
{
"epoch": 0.7235294117647059,
"grad_norm": 0.3165768086910248,
"learning_rate": 2.8235294117647062e-06,
"loss": 1.2388,
"step": 123
},
{
"epoch": 0.7294117647058823,
"grad_norm": 0.29654860496520996,
"learning_rate": 2.7647058823529416e-06,
"loss": 1.2332,
"step": 124
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.3117150068283081,
"learning_rate": 2.7058823529411766e-06,
"loss": 1.2588,
"step": 125
},
{
"epoch": 0.7411764705882353,
"grad_norm": 0.33643701672554016,
"learning_rate": 2.647058823529412e-06,
"loss": 1.2289,
"step": 126
},
{
"epoch": 0.7470588235294118,
"grad_norm": 0.3130914270877838,
"learning_rate": 2.5882352941176473e-06,
"loss": 1.263,
"step": 127
},
{
"epoch": 0.7529411764705882,
"grad_norm": 0.3396664559841156,
"learning_rate": 2.5294117647058823e-06,
"loss": 1.2592,
"step": 128
},
{
"epoch": 0.7588235294117647,
"grad_norm": 0.30291828513145447,
"learning_rate": 2.470588235294118e-06,
"loss": 1.2577,
"step": 129
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.32175707817077637,
"learning_rate": 2.411764705882353e-06,
"loss": 1.247,
"step": 130
},
{
"epoch": 0.7705882352941177,
"grad_norm": 0.346138596534729,
"learning_rate": 2.3529411764705885e-06,
"loss": 1.2476,
"step": 131
},
{
"epoch": 0.7764705882352941,
"grad_norm": 0.3127652406692505,
"learning_rate": 2.2941176470588234e-06,
"loss": 1.2392,
"step": 132
},
{
"epoch": 0.7823529411764706,
"grad_norm": 0.349590927362442,
"learning_rate": 2.2352941176470592e-06,
"loss": 1.2377,
"step": 133
},
{
"epoch": 0.788235294117647,
"grad_norm": 0.3107239305973053,
"learning_rate": 2.176470588235294e-06,
"loss": 1.239,
"step": 134
},
{
"epoch": 0.7941176470588235,
"grad_norm": 0.33791080117225647,
"learning_rate": 2.1176470588235296e-06,
"loss": 1.246,
"step": 135
},
{
"epoch": 0.8,
"grad_norm": 0.3314568102359772,
"learning_rate": 2.058823529411765e-06,
"loss": 1.2402,
"step": 136
},
{
"epoch": 0.8058823529411765,
"grad_norm": 0.3275523781776428,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.2348,
"step": 137
},
{
"epoch": 0.8117647058823529,
"grad_norm": 0.33062854409217834,
"learning_rate": 1.9411764705882353e-06,
"loss": 1.2427,
"step": 138
},
{
"epoch": 0.8176470588235294,
"grad_norm": 0.35148942470550537,
"learning_rate": 1.8823529411764707e-06,
"loss": 1.2261,
"step": 139
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.3389197289943695,
"learning_rate": 1.8235294117647058e-06,
"loss": 1.2362,
"step": 140
},
{
"epoch": 0.8294117647058824,
"grad_norm": 0.3360951244831085,
"learning_rate": 1.7647058823529414e-06,
"loss": 1.2302,
"step": 141
},
{
"epoch": 0.8352941176470589,
"grad_norm": 0.34131404757499695,
"learning_rate": 1.7058823529411766e-06,
"loss": 1.2266,
"step": 142
},
{
"epoch": 0.8411764705882353,
"grad_norm": 0.328914076089859,
"learning_rate": 1.6470588235294118e-06,
"loss": 1.2308,
"step": 143
},
{
"epoch": 0.8470588235294118,
"grad_norm": 0.34804269671440125,
"learning_rate": 1.5882352941176472e-06,
"loss": 1.2212,
"step": 144
},
{
"epoch": 0.8529411764705882,
"grad_norm": 0.35386762022972107,
"learning_rate": 1.5294117647058826e-06,
"loss": 1.229,
"step": 145
},
{
"epoch": 0.8588235294117647,
"grad_norm": 0.33942756056785583,
"learning_rate": 1.4705882352941177e-06,
"loss": 1.2434,
"step": 146
},
{
"epoch": 0.8647058823529412,
"grad_norm": 0.32963618636131287,
"learning_rate": 1.4117647058823531e-06,
"loss": 1.2385,
"step": 147
},
{
"epoch": 0.8705882352941177,
"grad_norm": 0.3417942225933075,
"learning_rate": 1.3529411764705883e-06,
"loss": 1.2242,
"step": 148
},
{
"epoch": 0.8764705882352941,
"grad_norm": 0.33753451704978943,
"learning_rate": 1.2941176470588237e-06,
"loss": 1.2234,
"step": 149
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.3514120876789093,
"learning_rate": 1.235294117647059e-06,
"loss": 1.2241,
"step": 150
},
{
"epoch": 0.888235294117647,
"grad_norm": 0.35951969027519226,
"learning_rate": 1.1764705882352942e-06,
"loss": 1.2347,
"step": 151
},
{
"epoch": 0.8941176470588236,
"grad_norm": 0.3717687726020813,
"learning_rate": 1.1176470588235296e-06,
"loss": 1.2224,
"step": 152
},
{
"epoch": 0.9,
"grad_norm": 0.3542497754096985,
"learning_rate": 1.0588235294117648e-06,
"loss": 1.2236,
"step": 153
},
{
"epoch": 0.9058823529411765,
"grad_norm": 0.3436025083065033,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.2294,
"step": 154
},
{
"epoch": 0.9117647058823529,
"grad_norm": 0.37331488728523254,
"learning_rate": 9.411764705882353e-07,
"loss": 1.2046,
"step": 155
},
{
"epoch": 0.9176470588235294,
"grad_norm": 0.34907183051109314,
"learning_rate": 8.823529411764707e-07,
"loss": 1.2213,
"step": 156
},
{
"epoch": 0.9235294117647059,
"grad_norm": 0.36500322818756104,
"learning_rate": 8.235294117647059e-07,
"loss": 1.2091,
"step": 157
},
{
"epoch": 0.9294117647058824,
"grad_norm": 0.38440433144569397,
"learning_rate": 7.647058823529413e-07,
"loss": 1.2249,
"step": 158
},
{
"epoch": 0.9352941176470588,
"grad_norm": 0.3387817144393921,
"learning_rate": 7.058823529411766e-07,
"loss": 1.2154,
"step": 159
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.34928762912750244,
"learning_rate": 6.470588235294118e-07,
"loss": 1.2227,
"step": 160
},
{
"epoch": 0.9470588235294117,
"grad_norm": 0.36257097125053406,
"learning_rate": 5.882352941176471e-07,
"loss": 1.2211,
"step": 161
},
{
"epoch": 0.9529411764705882,
"grad_norm": 0.3723115026950836,
"learning_rate": 5.294117647058824e-07,
"loss": 1.2283,
"step": 162
},
{
"epoch": 0.9588235294117647,
"grad_norm": 0.3423607349395752,
"learning_rate": 4.7058823529411767e-07,
"loss": 1.2295,
"step": 163
},
{
"epoch": 0.9647058823529412,
"grad_norm": 0.3787173628807068,
"learning_rate": 4.1176470588235295e-07,
"loss": 1.2201,
"step": 164
},
{
"epoch": 0.9705882352941176,
"grad_norm": 0.36642688512802124,
"learning_rate": 3.529411764705883e-07,
"loss": 1.2313,
"step": 165
},
{
"epoch": 0.9764705882352941,
"grad_norm": 0.3594622313976288,
"learning_rate": 2.9411764705882356e-07,
"loss": 1.2128,
"step": 166
},
{
"epoch": 0.9823529411764705,
"grad_norm": 0.3701726496219635,
"learning_rate": 2.3529411764705883e-07,
"loss": 1.2324,
"step": 167
},
{
"epoch": 0.9882352941176471,
"grad_norm": 0.34158623218536377,
"learning_rate": 1.7647058823529414e-07,
"loss": 1.2364,
"step": 168
},
{
"epoch": 0.9941176470588236,
"grad_norm": 0.3631001114845276,
"learning_rate": 1.1764705882352942e-07,
"loss": 1.2191,
"step": 169
},
{
"epoch": 1.0,
"grad_norm": 0.40616530179977417,
"learning_rate": 5.882352941176471e-08,
"loss": 1.2029,
"step": 170
},
{
"epoch": 1.0,
"eval_loss": 1.2080979347229004,
"eval_runtime": 4.1103,
"eval_samples_per_second": 4.379,
"eval_steps_per_second": 0.73,
"step": 170
}
],
"logging_steps": 1.0,
"max_steps": 170,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.7061227320088986e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}