init commit

Browse files

Files changed (4) hide show

README.md +14 -0
attn_gate_weights.pth +3 -0
config.json +33 -0
trainer_state.json +3542 -0

README.md CHANGED Viewed

@@ -1,3 +1,17 @@
 ---
 license: mit
 ---

 ---
 license: mit
+library_name: transformers
+base_model:
+  - deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+base_model_relation: adapter
 ---
+## SeerAttention-DeepSeek-R1-Distill-Qwen-32B-AttnGates
+This repo contains only the AttnGate weights for deepseek-ai/DeepSeek-R1-Distill-Qwen-32B; the base model weights are not included.
+[SeerAttention](https://arxiv.org/abs/2410.13276) introduces learnable AttnGate modules to accelerate the computationally intensive prefill stage of long-context large language models (LLMs) via dynamic block-level sparsity. The AttnGates are trained in a parameter-efficient self-distillation framework, where they learn to mimic the 2D max-pooled attention patterns of the original frozen model, preserving its integrity while avoiding costly retraining. During inference, these gates generate block-sparse binary masks by applying threshold/TopK to their learned soft scores, enabling efficient computation through a custom block-sparse FlashAttention kernel.
+Original GitHub repo:
+https://github.com/microsoft/SeerAttention

attn_gate_weights.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91a77643b3dfdcc14e50160f1f2ba6bd493fd456babd8d02df06d9991d5a9ab3
+size 251668686

config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "base_model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 27648,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 64,
+  "model_type": "qwen2",
+  "num_attention_heads": 40,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "seerattn_gate_block_size": 64,
+  "seerattn_gate_force_double": false,
+  "seerattn_gate_hidden_size": 128,
+  "seerattn_gate_type": "Qavg_Kmaxminavg",
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.2",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,3542 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.4284490145672665,
+  "eval_steps": 500,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.000856898029134533,
+      "grad_norm": 0.2508656084537506,
+      "learning_rate": 5e-05,
+      "loss": 2.5931,
+      "step": 1
+    },
+    {
+      "epoch": 0.001713796058269066,
+      "grad_norm": 0.24156232178211212,
+      "learning_rate": 0.0001,
+      "loss": 2.6137,
+      "step": 2
+    },
+    {
+      "epoch": 0.002570694087403599,
+      "grad_norm": 0.25357234477996826,
+      "learning_rate": 0.00015,
+      "loss": 2.5697,
+      "step": 3
+    },
+    {
+      "epoch": 0.003427592116538132,
+      "grad_norm": 0.23629046976566315,
+      "learning_rate": 0.0002,
+      "loss": 2.3679,
+      "step": 4
+    },
+    {
+      "epoch": 0.004284490145672665,
+      "grad_norm": 0.23719146847724915,
+      "learning_rate": 0.00025,
+      "loss": 2.3205,
+      "step": 5
+    },
+    {
+      "epoch": 0.005141388174807198,
+      "grad_norm": 0.22218209505081177,
+      "learning_rate": 0.0003,
+      "loss": 2.1263,
+      "step": 6
+    },
+    {
+      "epoch": 0.005998286203941731,
+      "grad_norm": 0.20407630503177643,
+      "learning_rate": 0.00035,
+      "loss": 1.9481,
+      "step": 7
+    },
+    {
+      "epoch": 0.006855184233076264,
+      "grad_norm": 0.1823188215494156,
+      "learning_rate": 0.0004,
+      "loss": 1.7466,
+      "step": 8
+    },
+    {
+      "epoch": 0.007712082262210797,
+      "grad_norm": 0.15546730160713196,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 1.5499,
+      "step": 9
+    },
+    {
+      "epoch": 0.00856898029134533,
+      "grad_norm": 0.14485396444797516,
+      "learning_rate": 0.0005,
+      "loss": 1.4354,
+      "step": 10
+    },
+    {
+      "epoch": 0.009425878320479864,
+      "grad_norm": 0.16174261271953583,
+      "learning_rate": 0.00055,
+      "loss": 1.4151,
+      "step": 11
+    },
+    {
+      "epoch": 0.010282776349614395,
+      "grad_norm": 0.1819111704826355,
+      "learning_rate": 0.0006,
+      "loss": 1.3921,
+      "step": 12
+    },
+    {
+      "epoch": 0.011139674378748929,
+      "grad_norm": 0.18399009108543396,
+      "learning_rate": 0.0006500000000000001,
+      "loss": 1.3712,
+      "step": 13
+    },
+    {
+      "epoch": 0.011996572407883462,
+      "grad_norm": 0.17457200586795807,
+      "learning_rate": 0.0007,
+      "loss": 1.329,
+      "step": 14
+    },
+    {
+      "epoch": 0.012853470437017995,
+      "grad_norm": 0.1598544865846634,
+      "learning_rate": 0.00075,
+      "loss": 1.2791,
+      "step": 15
+    },
+    {
+      "epoch": 0.013710368466152529,
+      "grad_norm": 0.1446290910243988,
+      "learning_rate": 0.0008,
+      "loss": 1.1637,
+      "step": 16
+    },
+    {
+      "epoch": 0.01456726649528706,
+      "grad_norm": 0.130220428109169,
+      "learning_rate": 0.00085,
+      "loss": 1.1213,
+      "step": 17
+    },
+    {
+      "epoch": 0.015424164524421594,
+      "grad_norm": 0.13525986671447754,
+      "learning_rate": 0.0009000000000000001,
+      "loss": 1.0648,
+      "step": 18
+    },
+    {
+      "epoch": 0.016281062553556127,
+      "grad_norm": 0.13430772721767426,
+      "learning_rate": 0.00095,
+      "loss": 1.0397,
+      "step": 19
+    },
+    {
+      "epoch": 0.01713796058269066,
+      "grad_norm": 0.12910906970500946,
+      "learning_rate": 0.001,
+      "loss": 0.9857,
+      "step": 20
+    },
+    {
+      "epoch": 0.017994858611825194,
+      "grad_norm": 0.13353696465492249,
+      "learning_rate": 0.0009999892908320648,
+      "loss": 0.9221,
+      "step": 21
+    },
+    {
+      "epoch": 0.018851756640959727,
+      "grad_norm": 0.1093161553144455,
+      "learning_rate": 0.0009999571637870036,
+      "loss": 0.8759,
+      "step": 22
+    },
+    {
+      "epoch": 0.01970865467009426,
+      "grad_norm": 0.11288490891456604,
+      "learning_rate": 0.0009999036202410325,
+      "loss": 0.804,
+      "step": 23
+    },
+    {
+      "epoch": 0.02056555269922879,
+      "grad_norm": 0.104631707072258,
+      "learning_rate": 0.0009998286624877785,
+      "loss": 0.7662,
+      "step": 24
+    },
+    {
+      "epoch": 0.021422450728363324,
+      "grad_norm": 0.0997966080904007,
+      "learning_rate": 0.0009997322937381828,
+      "loss": 0.7223,
+      "step": 25
+    },
+    {
+      "epoch": 0.022279348757497857,
+      "grad_norm": 0.09641406685113907,
+      "learning_rate": 0.0009996145181203615,
+      "loss": 0.6928,
+      "step": 26
+    },
+    {
+      "epoch": 0.02313624678663239,
+      "grad_norm": 0.08281794935464859,
+      "learning_rate": 0.00099947534067943,
+      "loss": 0.6405,
+      "step": 27
+    },
+    {
+      "epoch": 0.023993144815766924,
+      "grad_norm": 0.08452638983726501,
+      "learning_rate": 0.0009993147673772868,
+      "loss": 0.6556,
+      "step": 28
+    },
+    {
+      "epoch": 0.024850042844901457,
+      "grad_norm": 0.08429277688264847,
+      "learning_rate": 0.000999132805092358,
+      "loss": 0.594,
+      "step": 29
+    },
+    {
+      "epoch": 0.02570694087403599,
+      "grad_norm": 0.0761115625500679,
+      "learning_rate": 0.0009989294616193018,
+      "loss": 0.5755,
+      "step": 30
+    },
+    {
+      "epoch": 0.026563838903170524,
+      "grad_norm": 0.07302208989858627,
+      "learning_rate": 0.000998704745668676,
+      "loss": 0.5689,
+      "step": 31
+    },
+    {
+      "epoch": 0.027420736932305057,
+      "grad_norm": 0.07030151039361954,
+      "learning_rate": 0.000998458666866564,
+      "loss": 0.5309,
+      "step": 32
+    },
+    {
+      "epoch": 0.028277634961439587,
+      "grad_norm": 0.06726360321044922,
+      "learning_rate": 0.0009981912357541628,
+      "loss": 0.522,
+      "step": 33
+    },
+    {
+      "epoch": 0.02913453299057412,
+      "grad_norm": 0.06689871847629547,
+      "learning_rate": 0.0009979024637873308,
+      "loss": 0.5076,
+      "step": 34
+    },
+    {
+      "epoch": 0.029991431019708654,
+      "grad_norm": 0.06583524495363235,
+      "learning_rate": 0.0009975923633360985,
+      "loss": 0.4853,
+      "step": 35
+    },
+    {
+      "epoch": 0.030848329048843187,
+      "grad_norm": 0.056236833333969116,
+      "learning_rate": 0.0009972609476841367,
+      "loss": 0.4725,
+      "step": 36
+    },
+    {
+      "epoch": 0.031705227077977724,
+      "grad_norm": 0.060708239674568176,
+      "learning_rate": 0.0009969082310281891,
+      "loss": 0.4617,
+      "step": 37
+    },
+    {
+      "epoch": 0.032562125107112254,
+      "grad_norm": 0.05682970583438873,
+      "learning_rate": 0.0009965342284774632,
+      "loss": 0.4696,
+      "step": 38
+    },
+    {
+      "epoch": 0.033419023136246784,
+      "grad_norm": 0.05259707570075989,
+      "learning_rate": 0.0009961389560529835,
+      "loss": 0.4548,
+      "step": 39
+    },
+    {
+      "epoch": 0.03427592116538132,
+      "grad_norm": 0.053453583270311356,
+      "learning_rate": 0.0009957224306869053,
+      "loss": 0.4514,
+      "step": 40
+    },
+    {
+      "epoch": 0.03513281919451585,
+      "grad_norm": 0.05121961236000061,
+      "learning_rate": 0.0009952846702217886,
+      "loss": 0.4327,
+      "step": 41
+    },
+    {
+      "epoch": 0.03598971722365039,
+      "grad_norm": 0.04787424951791763,
+      "learning_rate": 0.0009948256934098352,
+      "loss": 0.4184,
+      "step": 42
+    },
+    {
+      "epoch": 0.03684661525278492,
+      "grad_norm": 0.052464596927165985,
+      "learning_rate": 0.0009943455199120836,
+      "loss": 0.4039,
+      "step": 43
+    },
+    {
+      "epoch": 0.037703513281919454,
+      "grad_norm": 0.04593312367796898,
+      "learning_rate": 0.0009938441702975688,
+      "loss": 0.4027,
+      "step": 44
+    },
+    {
+      "epoch": 0.038560411311053984,
+      "grad_norm": 0.04613376036286354,
+      "learning_rate": 0.0009933216660424397,
+      "loss": 0.3833,
+      "step": 45
+    },
+    {
+      "epoch": 0.03941730934018852,
+      "grad_norm": 0.0438942089676857,
+      "learning_rate": 0.0009927780295290389,
+      "loss": 0.3943,
+      "step": 46
+    },
+    {
+      "epoch": 0.04027420736932305,
+      "grad_norm": 0.03944185748696327,
+      "learning_rate": 0.0009922132840449458,
+      "loss": 0.3751,
+      "step": 47
+    },
+    {
+      "epoch": 0.04113110539845758,
+      "grad_norm": 0.0458449125289917,
+      "learning_rate": 0.0009916274537819774,
+      "loss": 0.3691,
+      "step": 48
+    },
+    {
+      "epoch": 0.04198800342759212,
+      "grad_norm": 0.040158357471227646,
+      "learning_rate": 0.000991020563835152,
+      "loss": 0.3793,
+      "step": 49
+    },
+    {
+      "epoch": 0.04284490145672665,
+      "grad_norm": 0.044870492070913315,
+      "learning_rate": 0.0009903926402016153,
+      "loss": 0.3533,
+      "step": 50
+    },
+    {
+      "epoch": 0.043701799485861184,
+      "grad_norm": 0.037956662476062775,
+      "learning_rate": 0.0009897437097795257,
+      "loss": 0.3773,
+      "step": 51
+    },
+    {
+      "epoch": 0.044558697514995714,
+      "grad_norm": 0.03750976547598839,
+      "learning_rate": 0.0009890738003669028,
+      "loss": 0.3547,
+      "step": 52
+    },
+    {
+      "epoch": 0.04541559554413025,
+      "grad_norm": 0.03590930253267288,
+      "learning_rate": 0.0009883829406604362,
+      "loss": 0.3447,
+      "step": 53
+    },
+    {
+      "epoch": 0.04627249357326478,
+      "grad_norm": 0.03637269139289856,
+      "learning_rate": 0.0009876711602542563,
+      "loss": 0.3557,
+      "step": 54
+    },
+    {
+      "epoch": 0.04712939160239932,
+      "grad_norm": 0.0436246357858181,
+      "learning_rate": 0.0009869384896386668,
+      "loss": 0.3361,
+      "step": 55
+    },
+    {
+      "epoch": 0.04798628963153385,
+      "grad_norm": 0.03389494866132736,
+      "learning_rate": 0.0009861849601988384,
+      "loss": 0.3229,
+      "step": 56
+    },
+    {
+      "epoch": 0.04884318766066838,
+      "grad_norm": 0.03282163292169571,
+      "learning_rate": 0.0009854106042134641,
+      "loss": 0.2967,
+      "step": 57
+    },
+    {
+      "epoch": 0.049700085689802914,
+      "grad_norm": 0.03293761610984802,
+      "learning_rate": 0.0009846154548533773,
+      "loss": 0.3236,
+      "step": 58
+    },
+    {
+      "epoch": 0.050556983718937444,
+      "grad_norm": 0.03896806761622429,
+      "learning_rate": 0.0009837995461801298,
+      "loss": 0.3117,
+      "step": 59
+    },
+    {
+      "epoch": 0.05141388174807198,
+      "grad_norm": 0.03278586268424988,
+      "learning_rate": 0.0009829629131445341,
+      "loss": 0.3068,
+      "step": 60
+    },
+    {
+      "epoch": 0.05227077977720651,
+      "grad_norm": 0.0321279913187027,
+      "learning_rate": 0.0009821055915851646,
+      "loss": 0.3118,
+      "step": 61
+    },
+    {
+      "epoch": 0.05312767780634105,
+      "grad_norm": 0.03264940157532692,
+      "learning_rate": 0.0009812276182268236,
+      "loss": 0.3125,
+      "step": 62
+    },
+    {
+      "epoch": 0.05398457583547558,
+      "grad_norm": 0.03603900223970413,
+      "learning_rate": 0.0009803290306789677,
+      "loss": 0.3,
+      "step": 63
+    },
+    {
+      "epoch": 0.054841473864610114,
+      "grad_norm": 0.029965505003929138,
+      "learning_rate": 0.0009794098674340967,
+      "loss": 0.3096,
+      "step": 64
+    },
+    {
+      "epoch": 0.055698371893744644,
+      "grad_norm": 0.036456797271966934,
+      "learning_rate": 0.0009784701678661044,
+      "loss": 0.2901,
+      "step": 65
+    },
+    {
+      "epoch": 0.056555269922879174,
+      "grad_norm": 0.032042618840932846,
+      "learning_rate": 0.0009775099722285933,
+      "loss": 0.2864,
+      "step": 66
+    },
+    {
+      "epoch": 0.05741216795201371,
+      "grad_norm": 0.03345092013478279,
+      "learning_rate": 0.0009765293216531485,
+      "loss": 0.2854,
+      "step": 67
+    },
+    {
+      "epoch": 0.05826906598114824,
+      "grad_norm": 0.029210660606622696,
+      "learning_rate": 0.0009755282581475768,
+      "loss": 0.2862,
+      "step": 68
+    },
+    {
+      "epoch": 0.05912596401028278,
+      "grad_norm": 0.02906818874180317,
+      "learning_rate": 0.000974506824594107,
+      "loss": 0.2756,
+      "step": 69
+    },
+    {
+      "epoch": 0.05998286203941731,
+      "grad_norm": 0.032076746225357056,
+      "learning_rate": 0.0009734650647475529,
+      "loss": 0.274,
+      "step": 70
+    },
+    {
+      "epoch": 0.060839760068551844,
+      "grad_norm": 0.02805478870868683,
+      "learning_rate": 0.0009724030232334391,
+      "loss": 0.2807,
+      "step": 71
+    },
+    {
+      "epoch": 0.061696658097686374,
+      "grad_norm": 0.027483096346259117,
+      "learning_rate": 0.0009713207455460893,
+      "loss": 0.2657,
+      "step": 72
+    },
+    {
+      "epoch": 0.06255355612682091,
+      "grad_norm": 0.026773786172270775,
+      "learning_rate": 0.0009702182780466775,
+      "loss": 0.2638,
+      "step": 73
+    },
+    {
+      "epoch": 0.06341045415595545,
+      "grad_norm": 0.025541041046380997,
+      "learning_rate": 0.0009690956679612422,
+      "loss": 0.2621,
+      "step": 74
+    },
+    {
+      "epoch": 0.06426735218508997,
+      "grad_norm": 0.028647374361753464,
+      "learning_rate": 0.0009679529633786629,
+      "loss": 0.268,
+      "step": 75
+    },
+    {
+      "epoch": 0.06512425021422451,
+      "grad_norm": 0.025853468105196953,
+      "learning_rate": 0.0009667902132486009,
+      "loss": 0.2665,
+      "step": 76
+    },
+    {
+      "epoch": 0.06598114824335904,
+      "grad_norm": 0.02473859116435051,
+      "learning_rate": 0.0009656074673794017,
+      "loss": 0.2633,
+      "step": 77
+    },
+    {
+      "epoch": 0.06683804627249357,
+      "grad_norm": 0.027110164985060692,
+      "learning_rate": 0.0009644047764359622,
+      "loss": 0.2591,
+      "step": 78
+    },
+    {
+      "epoch": 0.0676949443016281,
+      "grad_norm": 0.02725694142282009,
+      "learning_rate": 0.0009631821919375591,
+      "loss": 0.2486,
+      "step": 79
+    },
+    {
+      "epoch": 0.06855184233076264,
+      "grad_norm": 0.023004408925771713,
+      "learning_rate": 0.0009619397662556434,
+      "loss": 0.2477,
+      "step": 80
+    },
+    {
+      "epoch": 0.06940874035989718,
+      "grad_norm": 0.03091372549533844,
+      "learning_rate": 0.0009606775526115963,
+      "loss": 0.2625,
+      "step": 81
+    },
+    {
+      "epoch": 0.0702656383890317,
+      "grad_norm": 0.02504062093794346,
+      "learning_rate": 0.0009593956050744492,
+      "loss": 0.2603,
+      "step": 82
+    },
+    {
+      "epoch": 0.07112253641816624,
+      "grad_norm": 0.028378132730722427,
+      "learning_rate": 0.0009580939785585681,
+      "loss": 0.2501,
+      "step": 83
+    },
+    {
+      "epoch": 0.07197943444730077,
+      "grad_norm": 0.031088434159755707,
+      "learning_rate": 0.0009567727288213005,
+      "loss": 0.2463,
+      "step": 84
+    },
+    {
+      "epoch": 0.0728363324764353,
+      "grad_norm": 0.02509693056344986,
+      "learning_rate": 0.000955431912460588,
+      "loss": 0.2619,
+      "step": 85
+    },
+    {
+      "epoch": 0.07369323050556983,
+      "grad_norm": 0.024940533563494682,
+      "learning_rate": 0.0009540715869125407,
+      "loss": 0.2565,
+      "step": 86
+    },
+    {
+      "epoch": 0.07455012853470437,
+      "grad_norm": 0.026020990684628487,
+      "learning_rate": 0.0009526918104489777,
+      "loss": 0.2436,
+      "step": 87
+    },
+    {
+      "epoch": 0.07540702656383891,
+      "grad_norm": 0.023458324372768402,
+      "learning_rate": 0.0009512926421749304,
+      "loss": 0.2362,
+      "step": 88
+    },
+    {
+      "epoch": 0.07626392459297343,
+      "grad_norm": 0.0243577491492033,
+      "learning_rate": 0.0009498741420261108,
+      "loss": 0.243,
+      "step": 89
+    },
+    {
+      "epoch": 0.07712082262210797,
+      "grad_norm": 0.0244216900318861,
+      "learning_rate": 0.0009484363707663442,
+      "loss": 0.2378,
+      "step": 90
+    },
+    {
+      "epoch": 0.0779777206512425,
+      "grad_norm": 0.02734869159758091,
+      "learning_rate": 0.0009469793899849661,
+      "loss": 0.2344,
+      "step": 91
+    },
+    {
+      "epoch": 0.07883461868037704,
+      "grad_norm": 0.022375451400876045,
+      "learning_rate": 0.0009455032620941839,
+      "loss": 0.239,
+      "step": 92
+    },
+    {
+      "epoch": 0.07969151670951156,
+      "grad_norm": 0.026033056899905205,
+      "learning_rate": 0.0009440080503264037,
+      "loss": 0.2334,
+      "step": 93
+    },
+    {
+      "epoch": 0.0805484147386461,
+      "grad_norm": 0.022317685186862946,
+      "learning_rate": 0.0009424938187315209,
+      "loss": 0.2327,
+      "step": 94
+    },
+    {
+      "epoch": 0.08140531276778064,
+      "grad_norm": 0.02240647003054619,
+      "learning_rate": 0.0009409606321741775,
+      "loss": 0.2335,
+      "step": 95
+    },
+    {
+      "epoch": 0.08226221079691516,
+      "grad_norm": 0.022393332794308662,
+      "learning_rate": 0.0009394085563309827,
+      "loss": 0.2237,
+      "step": 96
+    },
+    {
+      "epoch": 0.0831191088260497,
+      "grad_norm": 0.02859143353998661,
+      "learning_rate": 0.0009378376576876999,
+      "loss": 0.2243,
+      "step": 97
+    },
+    {
+      "epoch": 0.08397600685518423,
+      "grad_norm": 0.02249022200703621,
+      "learning_rate": 0.0009362480035363986,
+      "loss": 0.2348,
+      "step": 98
+    },
+    {
+      "epoch": 0.08483290488431877,
+      "grad_norm": 0.031574804335832596,
+      "learning_rate": 0.0009346396619725719,
+      "loss": 0.2209,
+      "step": 99
+    },
+    {
+      "epoch": 0.0856898029134533,
+      "grad_norm": 0.024664921686053276,
+      "learning_rate": 0.0009330127018922195,
+      "loss": 0.2437,
+      "step": 100
+    },
+    {
+      "epoch": 0.08654670094258783,
+      "grad_norm": 0.023365097120404243,
+      "learning_rate": 0.0009313671929888959,
+      "loss": 0.2271,
+      "step": 101
+    },
+    {
+      "epoch": 0.08740359897172237,
+      "grad_norm": 0.02179112657904625,
+      "learning_rate": 0.0009297032057507264,
+      "loss": 0.2251,
+      "step": 102
+    },
+    {
+      "epoch": 0.08826049700085689,
+      "grad_norm": 0.02501068077981472,
+      "learning_rate": 0.0009280208114573858,
+      "loss": 0.2177,
+      "step": 103
+    },
+    {
+      "epoch": 0.08911739502999143,
+      "grad_norm": 0.022900037467479706,
+      "learning_rate": 0.0009263200821770461,
+      "loss": 0.2317,
+      "step": 104
+    },
+    {
+      "epoch": 0.08997429305912596,
+      "grad_norm": 0.02185271680355072,
+      "learning_rate": 0.0009246010907632895,
+      "loss": 0.2209,
+      "step": 105
+    },
+    {
+      "epoch": 0.0908311910882605,
+      "grad_norm": 0.021373869851231575,
+      "learning_rate": 0.0009228639108519867,
+      "loss": 0.2261,
+      "step": 106
+    },
+    {
+      "epoch": 0.09168808911739502,
+      "grad_norm": 0.02603291906416416,
+      "learning_rate": 0.0009211086168581433,
+      "loss": 0.2212,
+      "step": 107
+    },
+    {
+      "epoch": 0.09254498714652956,
+      "grad_norm": 0.021860316395759583,
+      "learning_rate": 0.0009193352839727121,
+      "loss": 0.2163,
+      "step": 108
+    },
+    {
+      "epoch": 0.0934018851756641,
+      "grad_norm": 0.021575644612312317,
+      "learning_rate": 0.0009175439881593715,
+      "loss": 0.2252,
+      "step": 109
+    },
+    {
+      "epoch": 0.09425878320479864,
+      "grad_norm": 0.02140972390770912,
+      "learning_rate": 0.0009157348061512727,
+      "loss": 0.2076,
+      "step": 110
+    },
+    {
+      "epoch": 0.09511568123393316,
+      "grad_norm": 0.023880021646618843,
+      "learning_rate": 0.0009139078154477511,
+      "loss": 0.2147,
+      "step": 111
+    },
+    {
+      "epoch": 0.0959725792630677,
+      "grad_norm": 0.02249889075756073,
+      "learning_rate": 0.0009120630943110077,
+      "loss": 0.2047,
+      "step": 112
+    },
+    {
+      "epoch": 0.09682947729220223,
+      "grad_norm": 0.020879626274108887,
+      "learning_rate": 0.0009102007217627568,
+      "loss": 0.2231,
+      "step": 113
+    },
+    {
+      "epoch": 0.09768637532133675,
+      "grad_norm": 0.021978724747896194,
+      "learning_rate": 0.0009083207775808396,
+      "loss": 0.2145,
+      "step": 114
+    },
+    {
+      "epoch": 0.09854327335047129,
+      "grad_norm": 0.023305930197238922,
+      "learning_rate": 0.0009064233422958076,
+      "loss": 0.2156,
+      "step": 115
+    },
+    {
+      "epoch": 0.09940017137960583,
+      "grad_norm": 0.022691812366247177,
+      "learning_rate": 0.0009045084971874737,
+      "loss": 0.2142,
+      "step": 116
+    },
+    {
+      "epoch": 0.10025706940874037,
+      "grad_norm": 0.019997967407107353,
+      "learning_rate": 0.0009025763242814291,
+      "loss": 0.2141,
+      "step": 117
+    },
+    {
+      "epoch": 0.10111396743787489,
+      "grad_norm": 0.021491916850209236,
+      "learning_rate": 0.0009006269063455304,
+      "loss": 0.2095,
+      "step": 118
+    },
+    {
+      "epoch": 0.10197086546700942,
+      "grad_norm": 0.019741587340831757,
+      "learning_rate": 0.0008986603268863536,
+      "loss": 0.2213,
+      "step": 119
+    },
+    {
+      "epoch": 0.10282776349614396,
+      "grad_norm": 0.02245885692536831,
+      "learning_rate": 0.0008966766701456176,
+      "loss": 0.2041,
+      "step": 120
+    },
+    {
+      "epoch": 0.1036846615252785,
+      "grad_norm": 0.020367003977298737,
+      "learning_rate": 0.000894676021096575,
+      "loss": 0.2052,
+      "step": 121
+    },
+    {
+      "epoch": 0.10454155955441302,
+      "grad_norm": 0.021478816866874695,
+      "learning_rate": 0.0008926584654403724,
+      "loss": 0.1999,
+      "step": 122
+    },
+    {
+      "epoch": 0.10539845758354756,
+      "grad_norm": 0.022485392168164253,
+      "learning_rate": 0.0008906240896023794,
+      "loss": 0.2066,
+      "step": 123
+    },
+    {
+      "epoch": 0.1062553556126821,
+      "grad_norm": 0.02136247418820858,
+      "learning_rate": 0.0008885729807284854,
+      "loss": 0.1972,
+      "step": 124
+    },
+    {
+      "epoch": 0.10711225364181662,
+      "grad_norm": 0.020799005404114723,
+      "learning_rate": 0.0008865052266813684,
+      "loss": 0.2062,
+      "step": 125
+    },
+    {
+      "epoch": 0.10796915167095116,
+      "grad_norm": 0.02335492894053459,
+      "learning_rate": 0.0008844209160367298,
+      "loss": 0.2004,
+      "step": 126
+    },
+    {
+      "epoch": 0.10882604970008569,
+      "grad_norm": 0.025448890402913094,
+      "learning_rate": 0.0008823201380795002,
+      "loss": 0.2053,
+      "step": 127
+    },
+    {
+      "epoch": 0.10968294772922023,
+      "grad_norm": 0.024005141109228134,
+      "learning_rate": 0.0008802029828000156,
+      "loss": 0.1926,
+      "step": 128
+    },
+    {
+      "epoch": 0.11053984575835475,
+      "grad_norm": 0.021053491160273552,
+      "learning_rate": 0.0008780695408901613,
+      "loss": 0.1938,
+      "step": 129
+    },
+    {
+      "epoch": 0.11139674378748929,
+      "grad_norm": 0.023733297362923622,
+      "learning_rate": 0.0008759199037394887,
+      "loss": 0.2013,
+      "step": 130
+    },
+    {
+      "epoch": 0.11225364181662383,
+      "grad_norm": 0.021928993985056877,
+      "learning_rate": 0.0008737541634312985,
+      "loss": 0.1939,
+      "step": 131
+    },
+    {
+      "epoch": 0.11311053984575835,
+      "grad_norm": 0.02457588165998459,
+      "learning_rate": 0.0008715724127386971,
+      "loss": 0.2005,
+      "step": 132
+    },
+    {
+      "epoch": 0.11396743787489289,
+      "grad_norm": 0.021438535302877426,
+      "learning_rate": 0.0008693747451206231,
+      "loss": 0.1984,
+      "step": 133
+    },
+    {
+      "epoch": 0.11482433590402742,
+      "grad_norm": 0.025149062275886536,
+      "learning_rate": 0.0008671612547178428,
+      "loss": 0.2042,
+      "step": 134
+    },
+    {
+      "epoch": 0.11568123393316196,
+      "grad_norm": 0.029300540685653687,
+      "learning_rate": 0.0008649320363489178,
+      "loss": 0.2056,
+      "step": 135
+    },
+    {
+      "epoch": 0.11653813196229648,
+      "grad_norm": 0.022236980497837067,
+      "learning_rate": 0.0008626871855061438,
+      "loss": 0.2056,
+      "step": 136
+    },
+    {
+      "epoch": 0.11739502999143102,
+      "grad_norm": 0.02103867195546627,
+      "learning_rate": 0.0008604267983514594,
+      "loss": 0.1949,
+      "step": 137
+    },
+    {
+      "epoch": 0.11825192802056556,
+      "grad_norm": 0.025057192891836166,
+      "learning_rate": 0.0008581509717123273,
+      "loss": 0.1993,
+      "step": 138
+    },
+    {
+      "epoch": 0.11910882604970009,
+      "grad_norm": 0.021641414612531662,
+      "learning_rate": 0.0008558598030775857,
+      "loss": 0.1969,
+      "step": 139
+    },
+    {
+      "epoch": 0.11996572407883462,
+      "grad_norm": 0.022903475910425186,
+      "learning_rate": 0.0008535533905932737,
+      "loss": 0.1971,
+      "step": 140
+    },
+    {
+      "epoch": 0.12082262210796915,
+      "grad_norm": 0.021558206528425217,
+      "learning_rate": 0.0008512318330584259,
+      "loss": 0.1932,
+      "step": 141
+    },
+    {
+      "epoch": 0.12167952013710369,
+      "grad_norm": 0.024407191202044487,
+      "learning_rate": 0.0008488952299208401,
+      "loss": 0.1872,
+      "step": 142
+    },
+    {
+      "epoch": 0.12253641816623821,
+      "grad_norm": 0.02087639644742012,
+      "learning_rate": 0.000846543681272818,
+      "loss": 0.2032,
+      "step": 143
+    },
+    {
+      "epoch": 0.12339331619537275,
+      "grad_norm": 0.02548481896519661,
+      "learning_rate": 0.000844177287846877,
+      "loss": 0.1924,
+      "step": 144
+    },
+    {
+      "epoch": 0.12425021422450729,
+      "grad_norm": 0.02311697043478489,
+      "learning_rate": 0.0008417961510114356,
+      "loss": 0.1925,
+      "step": 145
+    },
+    {
+      "epoch": 0.12510711225364182,
+      "grad_norm": 0.018741579726338387,
+      "learning_rate": 0.0008394003727664709,
+      "loss": 0.1812,
+      "step": 146
+    },
+    {
+      "epoch": 0.12596401028277635,
+      "grad_norm": 0.025283221155405045,
+      "learning_rate": 0.000836990055739149,
+      "loss": 0.1888,
+      "step": 147
+    },
+    {
+      "epoch": 0.1268209083119109,
+      "grad_norm": 0.025587448850274086,
+      "learning_rate": 0.0008345653031794292,
+      "loss": 0.1942,
+      "step": 148
+    },
+    {
+      "epoch": 0.12767780634104542,
+      "grad_norm": 0.020198052749037743,
+      "learning_rate": 0.0008321262189556409,
+      "loss": 0.1863,
+      "step": 149
+    },
+    {
+      "epoch": 0.12853470437017994,
+      "grad_norm": 0.02162170223891735,
+      "learning_rate": 0.0008296729075500344,
+      "loss": 0.1891,
+      "step": 150
+    },
+    {
+      "epoch": 0.1293916023993145,
+      "grad_norm": 0.022257110103964806,
+      "learning_rate": 0.0008272054740543053,
+      "loss": 0.194,
+      "step": 151
+    },
+    {
+      "epoch": 0.13024850042844902,
+      "grad_norm": 0.022947294637560844,
+      "learning_rate": 0.0008247240241650918,
+      "loss": 0.1836,
+      "step": 152
+    },
+    {
+      "epoch": 0.13110539845758354,
+      "grad_norm": 0.021060343831777573,
+      "learning_rate": 0.0008222286641794488,
+      "loss": 0.1886,
+      "step": 153
+    },
+    {
+      "epoch": 0.1319622964867181,
+      "grad_norm": 0.025356875732541084,
+      "learning_rate": 0.0008197195009902923,
+      "loss": 0.1875,
+      "step": 154
+    },
+    {
+      "epoch": 0.1328191945158526,
+      "grad_norm": 0.022868521511554718,
+      "learning_rate": 0.0008171966420818228,
+      "loss": 0.1904,
+      "step": 155
+    },
+    {
+      "epoch": 0.13367609254498714,
+      "grad_norm": 0.023869860917329788,
+      "learning_rate": 0.0008146601955249188,
+      "loss": 0.1832,
+      "step": 156
+    },
+    {
+      "epoch": 0.13453299057412169,
+      "grad_norm": 0.022344481199979782,
+      "learning_rate": 0.0008121102699725089,
+      "loss": 0.1814,
+      "step": 157
+    },
+    {
+      "epoch": 0.1353898886032562,
+      "grad_norm": 0.0211899783462286,
+      "learning_rate": 0.0008095469746549171,
+      "loss": 0.1929,
+      "step": 158
+    },
+    {
+      "epoch": 0.13624678663239073,
+      "grad_norm": 0.022394029423594475,
+      "learning_rate": 0.0008069704193751832,
+      "loss": 0.1879,
+      "step": 159
+    },
+    {
+      "epoch": 0.13710368466152528,
+      "grad_norm": 0.019436044618487358,
+      "learning_rate": 0.0008043807145043603,
+      "loss": 0.187,
+      "step": 160
+    },
+    {
+      "epoch": 0.1379605826906598,
+      "grad_norm": 0.028875339776277542,
+      "learning_rate": 0.0008017779709767858,
+      "loss": 0.1853,
+      "step": 161
+    },
+    {
+      "epoch": 0.13881748071979436,
+      "grad_norm": 0.024842459708452225,
+      "learning_rate": 0.0007991623002853296,
+      "loss": 0.1858,
+      "step": 162
+    },
+    {
+      "epoch": 0.13967437874892888,
+      "grad_norm": 0.021923067048192024,
+      "learning_rate": 0.0007965338144766185,
+      "loss": 0.1879,
+      "step": 163
+    },
+    {
+      "epoch": 0.1405312767780634,
+      "grad_norm": 0.020001094788312912,
+      "learning_rate": 0.0007938926261462366,
+      "loss": 0.1764,
+      "step": 164
+    },
+    {
+      "epoch": 0.14138817480719795,
+      "grad_norm": 0.02206343039870262,
+      "learning_rate": 0.0007912388484339011,
+      "loss": 0.1851,
+      "step": 165
+    },
+    {
+      "epoch": 0.14224507283633248,
+      "grad_norm": 0.018592318519949913,
+      "learning_rate": 0.0007885725950186169,
+      "loss": 0.1807,
+      "step": 166
+    },
+    {
+      "epoch": 0.143101970865467,
+      "grad_norm": 0.022569900378584862,
+      "learning_rate": 0.000785893980113806,
+      "loss": 0.1782,
+      "step": 167
+    },
+    {
+      "epoch": 0.14395886889460155,
+      "grad_norm": 0.02649298682808876,
+      "learning_rate": 0.0007832031184624164,
+      "loss": 0.1796,
+      "step": 168
+    },
+    {
+      "epoch": 0.14481576692373607,
+      "grad_norm": 0.018016191199421883,
+      "learning_rate": 0.000780500125332005,
+      "loss": 0.1773,
+      "step": 169
+    },
+    {
+      "epoch": 0.1456726649528706,
+      "grad_norm": 0.02014097571372986,
+      "learning_rate": 0.0007777851165098011,
+      "loss": 0.1746,
+      "step": 170
+    },
+    {
+      "epoch": 0.14652956298200515,
+      "grad_norm": 0.01795336976647377,
+      "learning_rate": 0.0007750582082977468,
+      "loss": 0.1794,
+      "step": 171
+    },
+    {
+      "epoch": 0.14738646101113967,
+      "grad_norm": 0.02241736464202404,
+      "learning_rate": 0.0007723195175075137,
+      "loss": 0.1697,
+      "step": 172
+    },
+    {
+      "epoch": 0.14824335904027422,
+      "grad_norm": 0.021646304056048393,
+      "learning_rate": 0.0007695691614555002,
+      "loss": 0.1767,
+      "step": 173
+    },
+    {
+      "epoch": 0.14910025706940874,
+      "grad_norm": 0.01908080466091633,
+      "learning_rate": 0.0007668072579578058,
+      "loss": 0.1808,
+      "step": 174
+    },
+    {
+      "epoch": 0.14995715509854327,
+      "grad_norm": 0.019076567143201828,
+      "learning_rate": 0.000764033925325184,
+      "loss": 0.1753,
+      "step": 175
+    },
+    {
+      "epoch": 0.15081405312767782,
+      "grad_norm": 0.01973322220146656,
+      "learning_rate": 0.0007612492823579744,
+      "loss": 0.1747,
+      "step": 176
+    },
+    {
+      "epoch": 0.15167095115681234,
+      "grad_norm": 0.017605643719434738,
+      "learning_rate": 0.0007584534483410137,
+      "loss": 0.1746,
+      "step": 177
+    },
+    {
+      "epoch": 0.15252784918594686,
+      "grad_norm": 0.017287466675043106,
+      "learning_rate": 0.0007556465430385259,
+      "loss": 0.1778,
+      "step": 178
+    },
+    {
+      "epoch": 0.1533847472150814,
+      "grad_norm": 0.019033176824450493,
+      "learning_rate": 0.0007528286866889924,
+      "loss": 0.1795,
+      "step": 179
+    },
+    {
+      "epoch": 0.15424164524421594,
+      "grad_norm": 0.019875552505254745,
+      "learning_rate": 0.00075,
+      "loss": 0.1637,
+      "step": 180
+    },
+    {
+      "epoch": 0.15509854327335046,
+      "grad_norm": 0.018225200474262238,
+      "learning_rate": 0.0007471606041430723,
+      "loss": 0.167,
+      "step": 181
+    },
+    {
+      "epoch": 0.155955441302485,
+      "grad_norm": 0.019243160262703896,
+      "learning_rate": 0.0007443106207484776,
+      "loss": 0.1704,
+      "step": 182
+    },
+    {
+      "epoch": 0.15681233933161953,
+      "grad_norm": 0.019462725147604942,
+      "learning_rate": 0.0007414501719000186,
+      "loss": 0.1764,
+      "step": 183
+    },
+    {
+      "epoch": 0.15766923736075408,
+      "grad_norm": 0.0178945641964674,
+      "learning_rate": 0.0007385793801298042,
+      "loss": 0.1771,
+      "step": 184
+    },
+    {
+      "epoch": 0.1585261353898886,
+      "grad_norm": 0.01750069111585617,
+      "learning_rate": 0.000735698368412999,
+      "loss": 0.1773,
+      "step": 185
+    },
+    {
+      "epoch": 0.15938303341902313,
+      "grad_norm": 0.023665515705943108,
+      "learning_rate": 0.0007328072601625557,
+      "loss": 0.1842,
+      "step": 186
+    },
+    {
+      "epoch": 0.16023993144815768,
+      "grad_norm": 0.021700644865632057,
+      "learning_rate": 0.00072990617922393,
+      "loss": 0.1668,
+      "step": 187
+    },
+    {
+      "epoch": 0.1610968294772922,
+      "grad_norm": 0.022405751049518585,
+      "learning_rate": 0.0007269952498697733,
+      "loss": 0.1831,
+      "step": 188
+    },
+    {
+      "epoch": 0.16195372750642673,
+      "grad_norm": 0.02437387965619564,
+      "learning_rate": 0.0007240745967946113,
+      "loss": 0.1805,
+      "step": 189
+    },
+    {
+      "epoch": 0.16281062553556128,
+      "grad_norm": 0.019709205254912376,
+      "learning_rate": 0.0007211443451095007,
+      "loss": 0.1691,
+      "step": 190
+    },
+    {
+      "epoch": 0.1636675235646958,
+      "grad_norm": 0.02336045354604721,
+      "learning_rate": 0.000718204620336671,
+      "loss": 0.1751,
+      "step": 191
+    },
+    {
+      "epoch": 0.16452442159383032,
+      "grad_norm": 0.0230086762458086,
+      "learning_rate": 0.0007152555484041476,
+      "loss": 0.1697,
+      "step": 192
+    },
+    {
+      "epoch": 0.16538131962296487,
+      "grad_norm": 0.01794915646314621,
+      "learning_rate": 0.0007122972556403566,
+      "loss": 0.1704,
+      "step": 193
+    },
+    {
+      "epoch": 0.1662382176520994,
+      "grad_norm": 0.02200886234641075,
+      "learning_rate": 0.0007093298687687141,
+      "loss": 0.1741,
+      "step": 194
+    },
+    {
+      "epoch": 0.16709511568123395,
+      "grad_norm": 0.02043880894780159,
+      "learning_rate": 0.0007063535149021973,
+      "loss": 0.1753,
+      "step": 195
+    },
+    {
+      "epoch": 0.16795201371036847,
+      "grad_norm": 0.018617108464241028,
+      "learning_rate": 0.0007033683215379002,
+      "loss": 0.1727,
+      "step": 196
+    },
+    {
+      "epoch": 0.168808911739503,
+      "grad_norm": 0.023497916758060455,
+      "learning_rate": 0.0007003744165515704,
+      "loss": 0.1647,
+      "step": 197
+    },
+    {
+      "epoch": 0.16966580976863754,
+      "grad_norm": 0.021086974069476128,
+      "learning_rate": 0.0006973719281921336,
+      "loss": 0.1675,
+      "step": 198
+    },
+    {
+      "epoch": 0.17052270779777207,
+      "grad_norm": 0.018384108319878578,
+      "learning_rate": 0.0006943609850761978,
+      "loss": 0.1738,
+      "step": 199
+    },
+    {
+      "epoch": 0.1713796058269066,
+      "grad_norm": 0.019381960853934288,
+      "learning_rate": 0.000691341716182545,
+      "loss": 0.1711,
+      "step": 200
+    },
+    {
+      "epoch": 0.17223650385604114,
+      "grad_norm": 0.023446090519428253,
+      "learning_rate": 0.0006883142508466054,
+      "loss": 0.17,
+      "step": 201
+    },
+    {
+      "epoch": 0.17309340188517566,
+      "grad_norm": 0.018001163378357887,
+      "learning_rate": 0.0006852787187549182,
+      "loss": 0.1675,
+      "step": 202
+    },
+    {
+      "epoch": 0.17395029991431019,
+      "grad_norm": 0.018448330461978912,
+      "learning_rate": 0.000682235249939575,
+      "loss": 0.1732,
+      "step": 203
+    },
+    {
+      "epoch": 0.17480719794344474,
+      "grad_norm": 0.022160930559039116,
+      "learning_rate": 0.0006791839747726501,
+      "loss": 0.1737,
+      "step": 204
+    },
+    {
+      "epoch": 0.17566409597257926,
+      "grad_norm": 0.021703101694583893,
+      "learning_rate": 0.0006761250239606168,
+      "loss": 0.1716,
+      "step": 205
+    },
+    {
+      "epoch": 0.17652099400171378,
+      "grad_norm": 0.03577428311109543,
+      "learning_rate": 0.0006730585285387465,
+      "loss": 0.2503,
+      "step": 206
+    },
+    {
+      "epoch": 0.17737789203084833,
+      "grad_norm": 0.021015428006649017,
+      "learning_rate": 0.000669984619865497,
+      "loss": 0.1683,
+      "step": 207
+    },
+    {
+      "epoch": 0.17823479005998286,
+      "grad_norm": 0.027599437162280083,
+      "learning_rate": 0.0006669034296168854,
+      "loss": 0.1679,
+      "step": 208
+    },
+    {
+      "epoch": 0.1790916880891174,
+      "grad_norm": 0.02527066133916378,
+      "learning_rate": 0.0006638150897808468,
+      "loss": 0.1711,
+      "step": 209
+    },
+    {
+      "epoch": 0.17994858611825193,
+      "grad_norm": 0.021573588252067566,
+      "learning_rate": 0.0006607197326515808,
+      "loss": 0.1709,
+      "step": 210
+    },
+    {
+      "epoch": 0.18080548414738645,
+      "grad_norm": 0.022143971174955368,
+      "learning_rate": 0.0006576174908238849,
+      "loss": 0.1695,
+      "step": 211
+    },
+    {
+      "epoch": 0.181662382176521,
+      "grad_norm": 0.019489064812660217,
+      "learning_rate": 0.0006545084971874737,
+      "loss": 0.1713,
+      "step": 212
+    },
+    {
+      "epoch": 0.18251928020565553,
+      "grad_norm": 0.016801459714770317,
+      "learning_rate": 0.0006513928849212874,
+      "loss": 0.1628,
+      "step": 213
+    },
+    {
+      "epoch": 0.18337617823479005,
+      "grad_norm": 0.02975154109299183,
+      "learning_rate": 0.0006482707874877854,
+      "loss": 0.1671,
+      "step": 214
+    },
+    {
+      "epoch": 0.1842330762639246,
+      "grad_norm": 0.016304058954119682,
+      "learning_rate": 0.0006451423386272311,
+      "loss": 0.1681,
+      "step": 215
+    },
+    {
+      "epoch": 0.18508997429305912,
+      "grad_norm": 0.018919240683317184,
+      "learning_rate": 0.0006420076723519614,
+      "loss": 0.1632,
+      "step": 216
+    },
+    {
+      "epoch": 0.18594687232219365,
+      "grad_norm": 0.017546426504850388,
+      "learning_rate": 0.0006388669229406462,
+      "loss": 0.1626,
+      "step": 217
+    },
+    {
+      "epoch": 0.1868037703513282,
+      "grad_norm": 0.018862828612327576,
+      "learning_rate": 0.0006357202249325371,
+      "loss": 0.1779,
+      "step": 218
+    },
+    {
+      "epoch": 0.18766066838046272,
+      "grad_norm": 0.01685691997408867,
+      "learning_rate": 0.000632567713121704,
+      "loss": 0.1736,
+      "step": 219
+    },
+    {
+      "epoch": 0.18851756640959727,
+      "grad_norm": 0.017843585461378098,
+      "learning_rate": 0.0006294095225512603,
+      "loss": 0.1685,
+      "step": 220
+    },
+    {
+      "epoch": 0.1893744644387318,
+      "grad_norm": 0.019433507695794106,
+      "learning_rate": 0.000626245788507579,
+      "loss": 0.1601,
+      "step": 221
+    },
+    {
+      "epoch": 0.19023136246786632,
+      "grad_norm": 0.017471810802817345,
+      "learning_rate": 0.0006230766465144965,
+      "loss": 0.1626,
+      "step": 222
+    },
+    {
+      "epoch": 0.19108826049700087,
+      "grad_norm": 0.021109052002429962,
+      "learning_rate": 0.0006199022323275083,
+      "loss": 0.1608,
+      "step": 223
+    },
+    {
+      "epoch": 0.1919451585261354,
+      "grad_norm": 0.017889205366373062,
+      "learning_rate": 0.0006167226819279528,
+      "loss": 0.1666,
+      "step": 224
+    },
+    {
+      "epoch": 0.1928020565552699,
+      "grad_norm": 0.01807485520839691,
+      "learning_rate": 0.0006135381315171866,
+      "loss": 0.1717,
+      "step": 225
+    },
+    {
+      "epoch": 0.19365895458440446,
+      "grad_norm": 0.01634569652378559,
+      "learning_rate": 0.0006103487175107507,
+      "loss": 0.1698,
+      "step": 226
+    },
+    {
+      "epoch": 0.194515852613539,
+      "grad_norm": 0.019904734566807747,
+      "learning_rate": 0.0006071545765325253,
+      "loss": 0.1638,
+      "step": 227
+    },
+    {
+      "epoch": 0.1953727506426735,
+      "grad_norm": 0.020387910306453705,
+      "learning_rate": 0.0006039558454088796,
+      "loss": 0.1689,
+      "step": 228
+    },
+    {
+      "epoch": 0.19622964867180806,
+      "grad_norm": 0.018753819167613983,
+      "learning_rate": 0.0006007526611628086,
+      "loss": 0.1617,
+      "step": 229
+    },
+    {
+      "epoch": 0.19708654670094258,
+      "grad_norm": 0.015613901428878307,
+      "learning_rate": 0.0005975451610080642,
+      "loss": 0.1563,
+      "step": 230
+    },
+    {
+      "epoch": 0.19794344473007713,
+      "grad_norm": 0.018909510225057602,
+      "learning_rate": 0.0005943334823432777,
+      "loss": 0.1633,
+      "step": 231
+    },
+    {
+      "epoch": 0.19880034275921166,
+      "grad_norm": 0.01886546052992344,
+      "learning_rate": 0.0005911177627460738,
+      "loss": 0.1579,
+      "step": 232
+    },
+    {
+      "epoch": 0.19965724078834618,
+      "grad_norm": 0.014775911346077919,
+      "learning_rate": 0.0005878981399671774,
+      "loss": 0.1632,
+      "step": 233
+    },
+    {
+      "epoch": 0.20051413881748073,
+      "grad_norm": 0.01664350926876068,
+      "learning_rate": 0.0005846747519245122,
+      "loss": 0.1647,
+      "step": 234
+    },
+    {
+      "epoch": 0.20137103684661525,
+      "grad_norm": 0.015615535899996758,
+      "learning_rate": 0.0005814477366972944,
+      "loss": 0.1492,
+      "step": 235
+    },
+    {
+      "epoch": 0.20222793487574978,
+      "grad_norm": 0.020716892555356026,
+      "learning_rate": 0.0005782172325201155,
+      "loss": 0.166,
+      "step": 236
+    },
+    {
+      "epoch": 0.20308483290488433,
+      "grad_norm": 0.016642222180962563,
+      "learning_rate": 0.0005749833777770225,
+      "loss": 0.1598,
+      "step": 237
+    },
+    {
+      "epoch": 0.20394173093401885,
+      "grad_norm": 0.02147907204926014,
+      "learning_rate": 0.0005717463109955896,
+      "loss": 0.1604,
+      "step": 238
+    },
+    {
+      "epoch": 0.20479862896315337,
+      "grad_norm": 0.019267620518803596,
+      "learning_rate": 0.0005685061708409841,
+      "loss": 0.1558,
+      "step": 239
+    },
+    {
+      "epoch": 0.20565552699228792,
+      "grad_norm": 0.016083069145679474,
+      "learning_rate": 0.000565263096110026,
+      "loss": 0.1593,
+      "step": 240
+    },
+    {
+      "epoch": 0.20651242502142245,
+      "grad_norm": 0.023486295714974403,
+      "learning_rate": 0.0005620172257252427,
+      "loss": 0.1461,
+      "step": 241
+    },
+    {
+      "epoch": 0.207369323050557,
+      "grad_norm": 0.018065959215164185,
+      "learning_rate": 0.0005587686987289189,
+      "loss": 0.1673,
+      "step": 242
+    },
+    {
+      "epoch": 0.20822622107969152,
+      "grad_norm": 0.02077527344226837,
+      "learning_rate": 0.0005555176542771388,
+      "loss": 0.1617,
+      "step": 243
+    },
+    {
+      "epoch": 0.20908311910882604,
+      "grad_norm": 0.01682773232460022,
+      "learning_rate": 0.0005522642316338268,
+      "loss": 0.1626,
+      "step": 244
+    },
+    {
+      "epoch": 0.2099400171379606,
+      "grad_norm": 0.01779726706445217,
+      "learning_rate": 0.0005490085701647804,
+      "loss": 0.1585,
+      "step": 245
+    },
+    {
+      "epoch": 0.21079691516709512,
+      "grad_norm": 0.01758912205696106,
+      "learning_rate": 0.0005457508093317013,
+      "loss": 0.1642,
+      "step": 246
+    },
+    {
+      "epoch": 0.21165381319622964,
+      "grad_norm": 0.01853191666305065,
+      "learning_rate": 0.0005424910886862209,
+      "loss": 0.1729,
+      "step": 247
+    },
+    {
+      "epoch": 0.2125107112253642,
+      "grad_norm": 0.015318380668759346,
+      "learning_rate": 0.0005392295478639225,
+      "loss": 0.1519,
+      "step": 248
+    },
+    {
+      "epoch": 0.2133676092544987,
+      "grad_norm": 0.016783015802502632,
+      "learning_rate": 0.0005359663265783598,
+      "loss": 0.1534,
+      "step": 249
+    },
+    {
+      "epoch": 0.21422450728363324,
+      "grad_norm": 0.019508186727762222,
+      "learning_rate": 0.0005327015646150716,
+      "loss": 0.1573,
+      "step": 250
+    },
+    {
+      "epoch": 0.2150814053127678,
+      "grad_norm": 0.017221014946699142,
+      "learning_rate": 0.0005294354018255945,
+      "loss": 0.168,
+      "step": 251
+    },
+    {
+      "epoch": 0.2159383033419023,
+      "grad_norm": 0.015044581145048141,
+      "learning_rate": 0.000526167978121472,
+      "loss": 0.168,
+      "step": 252
+    },
+    {
+      "epoch": 0.21679520137103683,
+      "grad_norm": 0.016873784363269806,
+      "learning_rate": 0.0005228994334682604,
+      "loss": 0.1585,
+      "step": 253
+    },
+    {
+      "epoch": 0.21765209940017138,
+      "grad_norm": 0.017317088320851326,
+      "learning_rate": 0.0005196299078795343,
+      "loss": 0.1556,
+      "step": 254
+    },
+    {
+      "epoch": 0.2185089974293059,
+      "grad_norm": 0.017208045348525047,
+      "learning_rate": 0.0005163595414108881,
+      "loss": 0.1551,
+      "step": 255
+    },
+    {
+      "epoch": 0.21936589545844046,
+      "grad_norm": 0.015319808386266232,
+      "learning_rate": 0.0005130884741539367,
+      "loss": 0.1491,
+      "step": 256
+    },
+    {
+      "epoch": 0.22022279348757498,
+      "grad_norm": 0.020643778145313263,
+      "learning_rate": 0.0005098168462303141,
+      "loss": 0.1629,
+      "step": 257
+    },
+    {
+      "epoch": 0.2210796915167095,
+      "grad_norm": 0.01764957793056965,
+      "learning_rate": 0.0005065447977856722,
+      "loss": 0.16,
+      "step": 258
+    },
+    {
+      "epoch": 0.22193658954584405,
+      "grad_norm": 0.014267503283917904,
+      "learning_rate": 0.0005032724689836759,
+      "loss": 0.1524,
+      "step": 259
+    },
+    {
+      "epoch": 0.22279348757497858,
+      "grad_norm": 0.016303062438964844,
+      "learning_rate": 0.0005,
+      "loss": 0.1544,
+      "step": 260
+    },
+    {
+      "epoch": 0.2236503856041131,
+      "grad_norm": 0.021528156474232674,
+      "learning_rate": 0.0004967275310163241,
+      "loss": 0.155,
+      "step": 261
+    },
+    {
+      "epoch": 0.22450728363324765,
+      "grad_norm": 0.017993303015828133,
+      "learning_rate": 0.0004934552022143279,
+      "loss": 0.1646,
+      "step": 262
+    },
+    {
+      "epoch": 0.22536418166238217,
+      "grad_norm": 0.018955664709210396,
+      "learning_rate": 0.0004901831537696859,
+      "loss": 0.1564,
+      "step": 263
+    },
+    {
+      "epoch": 0.2262210796915167,
+      "grad_norm": 0.01549836527556181,
+      "learning_rate": 0.0004869115258460635,
+      "loss": 0.1605,
+      "step": 264
+    },
+    {
+      "epoch": 0.22707797772065125,
+      "grad_norm": 0.019947407767176628,
+      "learning_rate": 0.00048364045858911197,
+      "loss": 0.1586,
+      "step": 265
+    },
+    {
+      "epoch": 0.22793487574978577,
+      "grad_norm": 0.022805072367191315,
+      "learning_rate": 0.00048037009212046586,
+      "loss": 0.1585,
+      "step": 266
+    },
+    {
+      "epoch": 0.22879177377892032,
+      "grad_norm": 0.014930406585335732,
+      "learning_rate": 0.0004771005665317397,
+      "loss": 0.1541,
+      "step": 267
+    },
+    {
+      "epoch": 0.22964867180805484,
+      "grad_norm": 0.017478667199611664,
+      "learning_rate": 0.0004738320218785281,
+      "loss": 0.1652,
+      "step": 268
+    },
+    {
+      "epoch": 0.23050556983718937,
+      "grad_norm": 0.016168439760804176,
+      "learning_rate": 0.00047056459817440544,
+      "loss": 0.1558,
+      "step": 269
+    },
+    {
+      "epoch": 0.23136246786632392,
+      "grad_norm": 0.018146967515349388,
+      "learning_rate": 0.00046729843538492847,
+      "loss": 0.1589,
+      "step": 270
+    },
+    {
+      "epoch": 0.23221936589545844,
+      "grad_norm": 0.021017901599407196,
+      "learning_rate": 0.00046403367342164026,
+      "loss": 0.158,
+      "step": 271
+    },
+    {
+      "epoch": 0.23307626392459296,
+      "grad_norm": 0.0199353639036417,
+      "learning_rate": 0.0004607704521360776,
+      "loss": 0.159,
+      "step": 272
+    },
+    {
+      "epoch": 0.23393316195372751,
+      "grad_norm": 0.0214347206056118,
+      "learning_rate": 0.0004575089113137792,
+      "loss": 0.1594,
+      "step": 273
+    },
+    {
+      "epoch": 0.23479005998286204,
+      "grad_norm": 0.01558151189237833,
+      "learning_rate": 0.00045424919066829885,
+      "loss": 0.1595,
+      "step": 274
+    },
+    {
+      "epoch": 0.23564695801199656,
+      "grad_norm": 0.01718144491314888,
+      "learning_rate": 0.0004509914298352197,
+      "loss": 0.1636,
+      "step": 275
+    },
+    {
+      "epoch": 0.2365038560411311,
+      "grad_norm": 0.0169569943100214,
+      "learning_rate": 0.00044773576836617336,
+      "loss": 0.1581,
+      "step": 276
+    },
+    {
+      "epoch": 0.23736075407026563,
+      "grad_norm": 0.01732802391052246,
+      "learning_rate": 0.0004444823457228612,
+      "loss": 0.1544,
+      "step": 277
+    },
+    {
+      "epoch": 0.23821765209940018,
+      "grad_norm": 0.01665407046675682,
+      "learning_rate": 0.00044123130127108126,
+      "loss": 0.1607,
+      "step": 278
+    },
+    {
+      "epoch": 0.2390745501285347,
+      "grad_norm": 0.0163556020706892,
+      "learning_rate": 0.0004379827742747575,
+      "loss": 0.1544,
+      "step": 279
+    },
+    {
+      "epoch": 0.23993144815766923,
+      "grad_norm": 0.01844378188252449,
+      "learning_rate": 0.00043473690388997434,
+      "loss": 0.1552,
+      "step": 280
+    },
+    {
+      "epoch": 0.24078834618680378,
+      "grad_norm": 0.01735353097319603,
+      "learning_rate": 0.0004314938291590161,
+      "loss": 0.1592,
+      "step": 281
+    },
+    {
+      "epoch": 0.2416452442159383,
+      "grad_norm": 0.01745842583477497,
+      "learning_rate": 0.0004282536890044104,
+      "loss": 0.1548,
+      "step": 282
+    },
+    {
+      "epoch": 0.24250214224507283,
+      "grad_norm": 0.015916811302304268,
+      "learning_rate": 0.0004250166222229774,
+      "loss": 0.1512,
+      "step": 283
+    },
+    {
+      "epoch": 0.24335904027420738,
+      "grad_norm": 0.015339579433202744,
+      "learning_rate": 0.0004217827674798845,
+      "loss": 0.1553,
+      "step": 284
+    },
+    {
+      "epoch": 0.2442159383033419,
+      "grad_norm": 0.01986338384449482,
+      "learning_rate": 0.0004185522633027057,
+      "loss": 0.163,
+      "step": 285
+    },
+    {
+      "epoch": 0.24507283633247642,
+      "grad_norm": 0.018619602546095848,
+      "learning_rate": 0.0004153252480754877,
+      "loss": 0.1526,
+      "step": 286
+    },
+    {
+      "epoch": 0.24592973436161097,
+      "grad_norm": 0.016342243179678917,
+      "learning_rate": 0.00041210186003282274,
+      "loss": 0.157,
+      "step": 287
+    },
+    {
+      "epoch": 0.2467866323907455,
+      "grad_norm": 0.016473161056637764,
+      "learning_rate": 0.00040888223725392626,
+      "loss": 0.1638,
+      "step": 288
+    },
+    {
+      "epoch": 0.24764353041988005,
+      "grad_norm": 0.015475657768547535,
+      "learning_rate": 0.00040566651765672245,
+      "loss": 0.1566,
+      "step": 289
+    },
+    {
+      "epoch": 0.24850042844901457,
+      "grad_norm": 0.019614532589912415,
+      "learning_rate": 0.00040245483899193594,
+      "loss": 0.1563,
+      "step": 290
+    },
+    {
+      "epoch": 0.2493573264781491,
+      "grad_norm": 0.01654512993991375,
+      "learning_rate": 0.00039924733883719147,
+      "loss": 0.155,
+      "step": 291
+    },
+    {
+      "epoch": 0.25021422450728364,
+      "grad_norm": 0.01950278878211975,
+      "learning_rate": 0.0003960441545911204,
+      "loss": 0.1562,
+      "step": 292
+    },
+    {
+      "epoch": 0.25107112253641817,
+      "grad_norm": 0.014652718789875507,
+      "learning_rate": 0.0003928454234674747,
+      "loss": 0.1524,
+      "step": 293
+    },
+    {
+      "epoch": 0.2519280205655527,
+      "grad_norm": 0.014939884655177593,
+      "learning_rate": 0.0003896512824892495,
+      "loss": 0.1541,
+      "step": 294
+    },
+    {
+      "epoch": 0.2527849185946872,
+      "grad_norm": 0.0260702446103096,
+      "learning_rate": 0.00038646186848281344,
+      "loss": 0.1603,
+      "step": 295
+    },
+    {
+      "epoch": 0.2536418166238218,
+      "grad_norm": 0.022277580574154854,
+      "learning_rate": 0.00038327731807204744,
+      "loss": 0.1559,
+      "step": 296
+    },
+    {
+      "epoch": 0.2544987146529563,
+      "grad_norm": 0.017295408993959427,
+      "learning_rate": 0.0003800977676724919,
+      "loss": 0.1641,
+      "step": 297
+    },
+    {
+      "epoch": 0.25535561268209084,
+      "grad_norm": 0.015900392085313797,
+      "learning_rate": 0.0003769233534855035,
+      "loss": 0.1569,
+      "step": 298
+    },
+    {
+      "epoch": 0.25621251071122536,
+      "grad_norm": 0.02040684036910534,
+      "learning_rate": 0.00037375421149242103,
+      "loss": 0.1529,
+      "step": 299
+    },
+    {
+      "epoch": 0.2570694087403599,
+      "grad_norm": 0.020803892984986305,
+      "learning_rate": 0.0003705904774487396,
+      "loss": 0.1546,
+      "step": 300
+    },
+    {
+      "epoch": 0.2579263067694944,
+      "grad_norm": 0.017124850302934647,
+      "learning_rate": 0.0003674322868782959,
+      "loss": 0.1499,
+      "step": 301
+    },
+    {
+      "epoch": 0.258783204798629,
+      "grad_norm": 0.015450037084519863,
+      "learning_rate": 0.0003642797750674629,
+      "loss": 0.1485,
+      "step": 302
+    },
+    {
+      "epoch": 0.2596401028277635,
+      "grad_norm": 0.017722809687256813,
+      "learning_rate": 0.00036113307705935393,
+      "loss": 0.158,
+      "step": 303
+    },
+    {
+      "epoch": 0.26049700085689803,
+      "grad_norm": 0.016334377229213715,
+      "learning_rate": 0.0003579923276480387,
+      "loss": 0.1583,
+      "step": 304
+    },
+    {
+      "epoch": 0.26135389888603255,
+      "grad_norm": 0.014489492401480675,
+      "learning_rate": 0.0003548576613727689,
+      "loss": 0.1493,
+      "step": 305
+    },
+    {
+      "epoch": 0.2622107969151671,
+      "grad_norm": 0.022003574296832085,
+      "learning_rate": 0.0003517292125122146,
+      "loss": 0.159,
+      "step": 306
+    },
+    {
+      "epoch": 0.26306769494430166,
+      "grad_norm": 0.01759640872478485,
+      "learning_rate": 0.0003486071150787128,
+      "loss": 0.1485,
+      "step": 307
+    },
+    {
+      "epoch": 0.2639245929734362,
+      "grad_norm": 0.017299624159932137,
+      "learning_rate": 0.00034549150281252633,
+      "loss": 0.1552,
+      "step": 308
+    },
+    {
+      "epoch": 0.2647814910025707,
+      "grad_norm": 0.015600617974996567,
+      "learning_rate": 0.0003423825091761153,
+      "loss": 0.1535,
+      "step": 309
+    },
+    {
+      "epoch": 0.2656383890317052,
+      "grad_norm": 0.016550280153751373,
+      "learning_rate": 0.0003392802673484193,
+      "loss": 0.1513,
+      "step": 310
+    },
+    {
+      "epoch": 0.26649528706083975,
+      "grad_norm": 0.014733058400452137,
+      "learning_rate": 0.0003361849102191533,
+      "loss": 0.1542,
+      "step": 311
+    },
+    {
+      "epoch": 0.26735218508997427,
+      "grad_norm": 0.015453443862497807,
+      "learning_rate": 0.00033309657038311456,
+      "loss": 0.1567,
+      "step": 312
+    },
+    {
+      "epoch": 0.26820908311910885,
+      "grad_norm": 0.01371778268367052,
+      "learning_rate": 0.00033001538013450283,
+      "loss": 0.1545,
+      "step": 313
+    },
+    {
+      "epoch": 0.26906598114824337,
+      "grad_norm": 0.014121933840215206,
+      "learning_rate": 0.0003269414714612534,
+      "loss": 0.1579,
+      "step": 314
+    },
+    {
+      "epoch": 0.2699228791773779,
+      "grad_norm": 0.012276604771614075,
+      "learning_rate": 0.00032387497603938325,
+      "loss": 0.1493,
+      "step": 315
+    },
+    {
+      "epoch": 0.2707797772065124,
+      "grad_norm": 0.017444469034671783,
+      "learning_rate": 0.00032081602522734986,
+      "loss": 0.1535,
+      "step": 316
+    },
+    {
+      "epoch": 0.27163667523564694,
+      "grad_norm": 0.01802118867635727,
+      "learning_rate": 0.0003177647500604252,
+      "loss": 0.1556,
+      "step": 317
+    },
+    {
+      "epoch": 0.27249357326478146,
+      "grad_norm": 0.014814218506217003,
+      "learning_rate": 0.00031472128124508187,
+      "loss": 0.1534,
+      "step": 318
+    },
+    {
+      "epoch": 0.27335047129391604,
+      "grad_norm": 0.02104412205517292,
+      "learning_rate": 0.00031168574915339467,
+      "loss": 0.1572,
+      "step": 319
+    },
+    {
+      "epoch": 0.27420736932305056,
+      "grad_norm": 0.014852729625999928,
+      "learning_rate": 0.0003086582838174551,
+      "loss": 0.1509,
+      "step": 320
+    },
+    {
+      "epoch": 0.2750642673521851,
+      "grad_norm": 0.013851814903318882,
+      "learning_rate": 0.0003056390149238022,
+      "loss": 0.1557,
+      "step": 321
+    },
+    {
+      "epoch": 0.2759211653813196,
+      "grad_norm": 0.014435027725994587,
+      "learning_rate": 0.00030262807180786645,
+      "loss": 0.1516,
+      "step": 322
+    },
+    {
+      "epoch": 0.27677806341045413,
+      "grad_norm": 0.014334925450384617,
+      "learning_rate": 0.00029962558344842963,
+      "loss": 0.1542,
+      "step": 323
+    },
+    {
+      "epoch": 0.2776349614395887,
+      "grad_norm": 0.017674414440989494,
+      "learning_rate": 0.0002966316784621,
+      "loss": 0.1525,
+      "step": 324
+    },
+    {
+      "epoch": 0.27849185946872324,
+      "grad_norm": 0.019711866974830627,
+      "learning_rate": 0.0002936464850978027,
+      "loss": 0.1486,
+      "step": 325
+    },
+    {
+      "epoch": 0.27934875749785776,
+      "grad_norm": 0.015335663221776485,
+      "learning_rate": 0.0002906701312312861,
+      "loss": 0.1508,
+      "step": 326
+    },
+    {
+      "epoch": 0.2802056555269923,
+      "grad_norm": 0.013025953434407711,
+      "learning_rate": 0.00028770274435964356,
+      "loss": 0.1499,
+      "step": 327
+    },
+    {
+      "epoch": 0.2810625535561268,
+      "grad_norm": 0.01768515445291996,
+      "learning_rate": 0.0002847444515958523,
+      "loss": 0.1527,
+      "step": 328
+    },
+    {
+      "epoch": 0.2819194515852613,
+      "grad_norm": 0.015223097056150436,
+      "learning_rate": 0.0002817953796633289,
+      "loss": 0.1549,
+      "step": 329
+    },
+    {
+      "epoch": 0.2827763496143959,
+      "grad_norm": 0.013024591840803623,
+      "learning_rate": 0.00027885565489049947,
+      "loss": 0.1509,
+      "step": 330
+    },
+    {
+      "epoch": 0.28363324764353043,
+      "grad_norm": 0.014075031504034996,
+      "learning_rate": 0.0002759254032053888,
+      "loss": 0.1558,
+      "step": 331
+    },
+    {
+      "epoch": 0.28449014567266495,
+      "grad_norm": 0.013003628700971603,
+      "learning_rate": 0.00027300475013022663,
+      "loss": 0.15,
+      "step": 332
+    },
+    {
+      "epoch": 0.2853470437017995,
+      "grad_norm": 0.014489670284092426,
+      "learning_rate": 0.0002700938207760701,
+      "loss": 0.1476,
+      "step": 333
+    },
+    {
+      "epoch": 0.286203941730934,
+      "grad_norm": 0.012941529043018818,
+      "learning_rate": 0.0002671927398374443,
+      "loss": 0.1408,
+      "step": 334
+    },
+    {
+      "epoch": 0.2870608397600686,
+      "grad_norm": 0.012417233549058437,
+      "learning_rate": 0.00026430163158700117,
+      "loss": 0.1485,
+      "step": 335
+    },
+    {
+      "epoch": 0.2879177377892031,
+      "grad_norm": 0.01331823505461216,
+      "learning_rate": 0.00026142061987019576,
+      "loss": 0.1578,
+      "step": 336
+    },
+    {
+      "epoch": 0.2887746358183376,
+      "grad_norm": 0.021479349583387375,
+      "learning_rate": 0.0002585498280999815,
+      "loss": 0.1544,
+      "step": 337
+    },
+    {
+      "epoch": 0.28963153384747214,
+      "grad_norm": 0.014903098344802856,
+      "learning_rate": 0.0002556893792515227,
+      "loss": 0.1561,
+      "step": 338
+    },
+    {
+      "epoch": 0.29048843187660667,
+      "grad_norm": 0.013264741748571396,
+      "learning_rate": 0.00025283939585692784,
+      "loss": 0.1599,
+      "step": 339
+    },
+    {
+      "epoch": 0.2913453299057412,
+      "grad_norm": 0.0137154096737504,
+      "learning_rate": 0.0002500000000000001,
+      "loss": 0.1526,
+      "step": 340
+    },
+    {
+      "epoch": 0.29220222793487577,
+      "grad_norm": 0.01252024993300438,
+      "learning_rate": 0.0002471713133110078,
+      "loss": 0.1421,
+      "step": 341
+    },
+    {
+      "epoch": 0.2930591259640103,
+      "grad_norm": 0.01306652370840311,
+      "learning_rate": 0.00024435345696147403,
+      "loss": 0.1373,
+      "step": 342
+    },
+    {
+      "epoch": 0.2939160239931448,
+      "grad_norm": 0.015782173722982407,
+      "learning_rate": 0.00024154655165898627,
+      "loss": 0.149,
+      "step": 343
+    },
+    {
+      "epoch": 0.29477292202227934,
+      "grad_norm": 0.012606021016836166,
+      "learning_rate": 0.00023875071764202561,
+      "loss": 0.153,
+      "step": 344
+    },
+    {
+      "epoch": 0.29562982005141386,
+      "grad_norm": 0.013155002146959305,
+      "learning_rate": 0.00023596607467481602,
+      "loss": 0.1493,
+      "step": 345
+    },
+    {
+      "epoch": 0.29648671808054844,
+      "grad_norm": 0.01664326898753643,
+      "learning_rate": 0.00023319274204219425,
+      "loss": 0.1566,
+      "step": 346
+    },
+    {
+      "epoch": 0.29734361610968296,
+      "grad_norm": 0.01268248911947012,
+      "learning_rate": 0.00023043083854449987,
+      "loss": 0.1467,
+      "step": 347
+    },
+    {
+      "epoch": 0.2982005141388175,
+      "grad_norm": 0.012720319442451,
+      "learning_rate": 0.00022768048249248646,
+      "loss": 0.1528,
+      "step": 348
+    },
+    {
+      "epoch": 0.299057412167952,
+      "grad_norm": 0.012999819591641426,
+      "learning_rate": 0.00022494179170225333,
+      "loss": 0.1549,
+      "step": 349
+    },
+    {
+      "epoch": 0.29991431019708653,
+      "grad_norm": 0.013431290164589882,
+      "learning_rate": 0.00022221488349019903,
+      "loss": 0.1525,
+      "step": 350
+    },
+    {
+      "epoch": 0.30077120822622105,
+      "grad_norm": 0.01783970557153225,
+      "learning_rate": 0.0002194998746679952,
+      "loss": 0.1527,
+      "step": 351
+    },
+    {
+      "epoch": 0.30162810625535563,
+      "grad_norm": 0.021057991310954094,
+      "learning_rate": 0.0002167968815375837,
+      "loss": 0.1538,
+      "step": 352
+    },
+    {
+      "epoch": 0.30248500428449016,
+      "grad_norm": 0.01601223647594452,
+      "learning_rate": 0.00021410601988619394,
+      "loss": 0.1525,
+      "step": 353
+    },
+    {
+      "epoch": 0.3033419023136247,
+      "grad_norm": 0.012890568003058434,
+      "learning_rate": 0.00021142740498138323,
+      "loss": 0.1488,
+      "step": 354
+    },
+    {
+      "epoch": 0.3041988003427592,
+      "grad_norm": 0.013061465695500374,
+      "learning_rate": 0.000208761151566099,
+      "loss": 0.1498,
+      "step": 355
+    },
+    {
+      "epoch": 0.3050556983718937,
+      "grad_norm": 0.013169737532734871,
+      "learning_rate": 0.00020610737385376348,
+      "loss": 0.15,
+      "step": 356
+    },
+    {
+      "epoch": 0.3059125964010283,
+      "grad_norm": 0.012198954820632935,
+      "learning_rate": 0.00020346618552338148,
+      "loss": 0.1567,
+      "step": 357
+    },
+    {
+      "epoch": 0.3067694944301628,
+      "grad_norm": 0.01644357666373253,
+      "learning_rate": 0.00020083769971467047,
+      "loss": 0.1532,
+      "step": 358
+    },
+    {
+      "epoch": 0.30762639245929735,
+      "grad_norm": 0.014105524867773056,
+      "learning_rate": 0.0001982220290232143,
+      "loss": 0.1585,
+      "step": 359
+    },
+    {
+      "epoch": 0.30848329048843187,
+      "grad_norm": 0.012292543426156044,
+      "learning_rate": 0.00019561928549563967,
+      "loss": 0.1525,
+      "step": 360
+    },
+    {
+      "epoch": 0.3093401885175664,
+      "grad_norm": 0.01830083131790161,
+      "learning_rate": 0.00019302958062481672,
+      "loss": 0.1506,
+      "step": 361
+    },
+    {
+      "epoch": 0.3101970865467009,
+      "grad_norm": 0.013068013824522495,
+      "learning_rate": 0.00019045302534508295,
+      "loss": 0.1563,
+      "step": 362
+    },
+    {
+      "epoch": 0.3110539845758355,
+      "grad_norm": 0.012616029009222984,
+      "learning_rate": 0.0001878897300274911,
+      "loss": 0.1477,
+      "step": 363
+    },
+    {
+      "epoch": 0.31191088260497,
+      "grad_norm": 0.015404236502945423,
+      "learning_rate": 0.00018533980447508135,
+      "loss": 0.1501,
+      "step": 364
+    },
+    {
+      "epoch": 0.31276778063410454,
+      "grad_norm": 0.017586475238204002,
+      "learning_rate": 0.00018280335791817732,
+      "loss": 0.1493,
+      "step": 365
+    },
+    {
+      "epoch": 0.31362467866323906,
+      "grad_norm": 0.01738077774643898,
+      "learning_rate": 0.00018028049900970766,
+      "loss": 0.1571,
+      "step": 366
+    },
+    {
+      "epoch": 0.3144815766923736,
+      "grad_norm": 0.01235333550721407,
+      "learning_rate": 0.0001777713358205514,
+      "loss": 0.1528,
+      "step": 367
+    },
+    {
+      "epoch": 0.31533847472150817,
+      "grad_norm": 0.01204688660800457,
+      "learning_rate": 0.00017527597583490823,
+      "loss": 0.1483,
+      "step": 368
+    },
+    {
+      "epoch": 0.3161953727506427,
+      "grad_norm": 0.012409962713718414,
+      "learning_rate": 0.00017279452594569483,
+      "loss": 0.1495,
+      "step": 369
+    },
+    {
+      "epoch": 0.3170522707797772,
+      "grad_norm": 0.010938968509435654,
+      "learning_rate": 0.00017032709244996558,
+      "loss": 0.149,
+      "step": 370
+    },
+    {
+      "epoch": 0.31790916880891174,
+      "grad_norm": 0.012267996557056904,
+      "learning_rate": 0.00016787378104435928,
+      "loss": 0.154,
+      "step": 371
+    },
+    {
+      "epoch": 0.31876606683804626,
+      "grad_norm": 0.012454659678041935,
+      "learning_rate": 0.00016543469682057105,
+      "loss": 0.1509,
+      "step": 372
+    },
+    {
+      "epoch": 0.3196229648671808,
+      "grad_norm": 0.01519513875246048,
+      "learning_rate": 0.00016300994426085103,
+      "loss": 0.1539,
+      "step": 373
+    },
+    {
+      "epoch": 0.32047986289631536,
+      "grad_norm": 0.013357303105294704,
+      "learning_rate": 0.0001605996272335291,
+      "loss": 0.1573,
+      "step": 374
+    },
+    {
+      "epoch": 0.3213367609254499,
+      "grad_norm": 0.012079097330570221,
+      "learning_rate": 0.00015820384898856434,
+      "loss": 0.1555,
+      "step": 375
+    },
+    {
+      "epoch": 0.3221936589545844,
+      "grad_norm": 0.012022243812680244,
+      "learning_rate": 0.00015582271215312294,
+      "loss": 0.1414,
+      "step": 376
+    },
+    {
+      "epoch": 0.32305055698371893,
+      "grad_norm": 0.012276671826839447,
+      "learning_rate": 0.00015345631872718213,
+      "loss": 0.1457,
+      "step": 377
+    },
+    {
+      "epoch": 0.32390745501285345,
+      "grad_norm": 0.015837060287594795,
+      "learning_rate": 0.00015110477007916002,
+      "loss": 0.1492,
+      "step": 378
+    },
+    {
+      "epoch": 0.32476435304198803,
+      "grad_norm": 0.011251943185925484,
+      "learning_rate": 0.0001487681669415742,
+      "loss": 0.1505,
+      "step": 379
+    },
+    {
+      "epoch": 0.32562125107112255,
+      "grad_norm": 0.011603351682424545,
+      "learning_rate": 0.00014644660940672628,
+      "loss": 0.1513,
+      "step": 380
+    },
+    {
+      "epoch": 0.3264781491002571,
+      "grad_norm": 0.01359818410128355,
+      "learning_rate": 0.00014414019692241437,
+      "loss": 0.1459,
+      "step": 381
+    },
+    {
+      "epoch": 0.3273350471293916,
+      "grad_norm": 0.012471762485802174,
+      "learning_rate": 0.00014184902828767287,
+      "loss": 0.1606,
+      "step": 382
+    },
+    {
+      "epoch": 0.3281919451585261,
+      "grad_norm": 0.01537733431905508,
+      "learning_rate": 0.0001395732016485406,
+      "loss": 0.1446,
+      "step": 383
+    },
+    {
+      "epoch": 0.32904884318766064,
+      "grad_norm": 0.01161841582506895,
+      "learning_rate": 0.0001373128144938563,
+      "loss": 0.1504,
+      "step": 384
+    },
+    {
+      "epoch": 0.3299057412167952,
+      "grad_norm": 0.011465324088931084,
+      "learning_rate": 0.00013506796365108232,
+      "loss": 0.145,
+      "step": 385
+    },
+    {
+      "epoch": 0.33076263924592975,
+      "grad_norm": 0.011288580484688282,
+      "learning_rate": 0.00013283874528215734,
+      "loss": 0.1494,
+      "step": 386
+    },
+    {
+      "epoch": 0.33161953727506427,
+      "grad_norm": 0.012868880294263363,
+      "learning_rate": 0.00013062525487937698,
+      "loss": 0.1548,
+      "step": 387
+    },
+    {
+      "epoch": 0.3324764353041988,
+      "grad_norm": 0.012357845902442932,
+      "learning_rate": 0.00012842758726130281,
+      "loss": 0.1504,
+      "step": 388
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.011807232163846493,
+      "learning_rate": 0.00012624583656870153,
+      "loss": 0.1592,
+      "step": 389
+    },
+    {
+      "epoch": 0.3341902313624679,
+      "grad_norm": 0.01140469592064619,
+      "learning_rate": 0.00012408009626051135,
+      "loss": 0.1577,
+      "step": 390
+    },
+    {
+      "epoch": 0.3350471293916024,
+      "grad_norm": 0.009883826598525047,
+      "learning_rate": 0.00012193045910983863,
+      "loss": 0.1451,
+      "step": 391
+    },
+    {
+      "epoch": 0.33590402742073694,
+      "grad_norm": 0.01367274671792984,
+      "learning_rate": 0.00011979701719998454,
+      "loss": 0.1574,
+      "step": 392
+    },
+    {
+      "epoch": 0.33676092544987146,
+      "grad_norm": 0.011320062913000584,
+      "learning_rate": 0.00011767986192049984,
+      "loss": 0.1507,
+      "step": 393
+    },
+    {
+      "epoch": 0.337617823479006,
+      "grad_norm": 0.011477210558950901,
+      "learning_rate": 0.00011557908396327027,
+      "loss": 0.1533,
+      "step": 394
+    },
+    {
+      "epoch": 0.3384747215081405,
+      "grad_norm": 0.012028141878545284,
+      "learning_rate": 0.00011349477331863151,
+      "loss": 0.1592,
+      "step": 395
+    },
+    {
+      "epoch": 0.3393316195372751,
+      "grad_norm": 0.01126360334455967,
+      "learning_rate": 0.00011142701927151455,
+      "loss": 0.1588,
+      "step": 396
+    },
+    {
+      "epoch": 0.3401885175664096,
+      "grad_norm": 0.010625923052430153,
+      "learning_rate": 0.00010937591039762085,
+      "loss": 0.1443,
+      "step": 397
+    },
+    {
+      "epoch": 0.34104541559554413,
+      "grad_norm": 0.01095986645668745,
+      "learning_rate": 0.00010734153455962764,
+      "loss": 0.1533,
+      "step": 398
+    },
+    {
+      "epoch": 0.34190231362467866,
+      "grad_norm": 0.01167625468224287,
+      "learning_rate": 0.00010532397890342504,
+      "loss": 0.1465,
+      "step": 399
+    },
+    {
+      "epoch": 0.3427592116538132,
+      "grad_norm": 0.010631192475557327,
+      "learning_rate": 0.00010332332985438247,
+      "loss": 0.1505,
+      "step": 400
+    },
+    {
+      "epoch": 0.34361610968294776,
+      "grad_norm": 0.013422117568552494,
+      "learning_rate": 0.0001013396731136465,
+      "loss": 0.1475,
+      "step": 401
+    },
+    {
+      "epoch": 0.3444730077120823,
+      "grad_norm": 0.015502565540373325,
+      "learning_rate": 9.937309365446973e-05,
+      "loss": 0.1564,
+      "step": 402
+    },
+    {
+      "epoch": 0.3453299057412168,
+      "grad_norm": 0.012849084101617336,
+      "learning_rate": 9.742367571857092e-05,
+      "loss": 0.1492,
+      "step": 403
+    },
+    {
+      "epoch": 0.3461868037703513,
+      "grad_norm": 0.011048342101275921,
+      "learning_rate": 9.549150281252633e-05,
+      "loss": 0.1497,
+      "step": 404
+    },
+    {
+      "epoch": 0.34704370179948585,
+      "grad_norm": 0.00987666193395853,
+      "learning_rate": 9.357665770419243e-05,
+      "loss": 0.1449,
+      "step": 405
+    },
+    {
+      "epoch": 0.34790059982862037,
+      "grad_norm": 0.009926311671733856,
+      "learning_rate": 9.167922241916055e-05,
+      "loss": 0.16,
+      "step": 406
+    },
+    {
+      "epoch": 0.34875749785775495,
+      "grad_norm": 0.010267515666782856,
+      "learning_rate": 8.979927823724321e-05,
+      "loss": 0.1487,
+      "step": 407
+    },
+    {
+      "epoch": 0.3496143958868895,
+      "grad_norm": 0.0110161192715168,
+      "learning_rate": 8.793690568899215e-05,
+      "loss": 0.1473,
+      "step": 408
+    },
+    {
+      "epoch": 0.350471293916024,
+      "grad_norm": 0.009565568529069424,
+      "learning_rate": 8.609218455224893e-05,
+      "loss": 0.1462,
+      "step": 409
+    },
+    {
+      "epoch": 0.3513281919451585,
+      "grad_norm": 0.00971953570842743,
+      "learning_rate": 8.426519384872733e-05,
+      "loss": 0.1435,
+      "step": 410
+    },
+    {
+      "epoch": 0.35218508997429304,
+      "grad_norm": 0.00972969550639391,
+      "learning_rate": 8.24560118406285e-05,
+      "loss": 0.1482,
+      "step": 411
+    },
+    {
+      "epoch": 0.35304198800342756,
+      "grad_norm": 0.012054841965436935,
+      "learning_rate": 8.066471602728804e-05,
+      "loss": 0.1463,
+      "step": 412
+    },
+    {
+      "epoch": 0.35389888603256214,
+      "grad_norm": 0.011393013410270214,
+      "learning_rate": 7.889138314185678e-05,
+      "loss": 0.1485,
+      "step": 413
+    },
+    {
+      "epoch": 0.35475578406169667,
+      "grad_norm": 0.011225726455450058,
+      "learning_rate": 7.71360891480134e-05,
+      "loss": 0.1452,
+      "step": 414
+    },
+    {
+      "epoch": 0.3556126820908312,
+      "grad_norm": 0.01023333054035902,
+      "learning_rate": 7.53989092367106e-05,
+      "loss": 0.1358,
+      "step": 415
+    },
+    {
+      "epoch": 0.3564695801199657,
+      "grad_norm": 0.010747382417321205,
+      "learning_rate": 7.367991782295391e-05,
+      "loss": 0.1427,
+      "step": 416
+    },
+    {
+      "epoch": 0.35732647814910024,
+      "grad_norm": 0.01004976499825716,
+      "learning_rate": 7.197918854261431e-05,
+      "loss": 0.1473,
+      "step": 417
+    },
+    {
+      "epoch": 0.3581833761782348,
+      "grad_norm": 0.009513266384601593,
+      "learning_rate": 7.029679424927366e-05,
+      "loss": 0.1493,
+      "step": 418
+    },
+    {
+      "epoch": 0.35904027420736934,
+      "grad_norm": 0.015416925773024559,
+      "learning_rate": 6.863280701110408e-05,
+      "loss": 0.14,
+      "step": 419
+    },
+    {
+      "epoch": 0.35989717223650386,
+      "grad_norm": 0.010445120744407177,
+      "learning_rate": 6.698729810778065e-05,
+      "loss": 0.1534,
+      "step": 420
+    },
+    {
+      "epoch": 0.3607540702656384,
+      "grad_norm": 0.011475526727735996,
+      "learning_rate": 6.536033802742814e-05,
+      "loss": 0.1501,
+      "step": 421
+    },
+    {
+      "epoch": 0.3616109682947729,
+      "grad_norm": 0.011237034574151039,
+      "learning_rate": 6.375199646360142e-05,
+      "loss": 0.1542,
+      "step": 422
+    },
+    {
+      "epoch": 0.36246786632390743,
+      "grad_norm": 0.03411533311009407,
+      "learning_rate": 6.21623423123001e-05,
+      "loss": 0.2254,
+      "step": 423
+    },
+    {
+      "epoch": 0.363324764353042,
+      "grad_norm": 0.009747683070600033,
+      "learning_rate": 6.059144366901737e-05,
+      "loss": 0.1454,
+      "step": 424
+    },
+    {
+      "epoch": 0.36418166238217653,
+      "grad_norm": 0.010333815589547157,
+      "learning_rate": 5.903936782582253e-05,
+      "loss": 0.1479,
+      "step": 425
+    },
+    {
+      "epoch": 0.36503856041131105,
+      "grad_norm": 0.010389740578830242,
+      "learning_rate": 5.750618126847912e-05,
+      "loss": 0.1504,
+      "step": 426
+    },
+    {
+      "epoch": 0.3658954584404456,
+      "grad_norm": 0.010068557225167751,
+      "learning_rate": 5.599194967359639e-05,
+      "loss": 0.146,
+      "step": 427
+    },
+    {
+      "epoch": 0.3667523564695801,
+      "grad_norm": 0.012611080892384052,
+      "learning_rate": 5.449673790581611e-05,
+      "loss": 0.1587,
+      "step": 428
+    },
+    {
+      "epoch": 0.3676092544987147,
+      "grad_norm": 0.010398144833743572,
+      "learning_rate": 5.3020610015033946e-05,
+      "loss": 0.1455,
+      "step": 429
+    },
+    {
+      "epoch": 0.3684661525278492,
+      "grad_norm": 0.012510698288679123,
+      "learning_rate": 5.1563629233655876e-05,
+      "loss": 0.1544,
+      "step": 430
+    },
+    {
+      "epoch": 0.3693230505569837,
+      "grad_norm": 0.009702283889055252,
+      "learning_rate": 5.0125857973889355e-05,
+      "loss": 0.1376,
+      "step": 431
+    },
+    {
+      "epoch": 0.37017994858611825,
+      "grad_norm": 0.015176232904195786,
+      "learning_rate": 4.87073578250698e-05,
+      "loss": 0.1505,
+      "step": 432
+    },
+    {
+      "epoch": 0.37103684661525277,
+      "grad_norm": 0.009702375158667564,
+      "learning_rate": 4.730818955102234e-05,
+      "loss": 0.1465,
+      "step": 433
+    },
+    {
+      "epoch": 0.3718937446443873,
+      "grad_norm": 0.01697813719511032,
+      "learning_rate": 4.592841308745932e-05,
+      "loss": 0.1528,
+      "step": 434
+    },
+    {
+      "epoch": 0.37275064267352187,
+      "grad_norm": 0.01560263428837061,
+      "learning_rate": 4.456808753941205e-05,
+      "loss": 0.1437,
+      "step": 435
+    },
+    {
+      "epoch": 0.3736075407026564,
+      "grad_norm": 0.00954125914722681,
+      "learning_rate": 4.322727117869951e-05,
+      "loss": 0.1501,
+      "step": 436
+    },
+    {
+      "epoch": 0.3744644387317909,
+      "grad_norm": 0.01739114709198475,
+      "learning_rate": 4.190602144143207e-05,
+      "loss": 0.1568,
+      "step": 437
+    },
+    {
+      "epoch": 0.37532133676092544,
+      "grad_norm": 0.011187477968633175,
+      "learning_rate": 4.06043949255509e-05,
+      "loss": 0.1558,
+      "step": 438
+    },
+    {
+      "epoch": 0.37617823479005996,
+      "grad_norm": 0.01066075824201107,
+      "learning_rate": 3.932244738840379e-05,
+      "loss": 0.1458,
+      "step": 439
+    },
+    {
+      "epoch": 0.37703513281919454,
+      "grad_norm": 0.009765625,
+      "learning_rate": 3.806023374435663e-05,
+      "loss": 0.1492,
+      "step": 440
+    },
+    {
+      "epoch": 0.37789203084832906,
+      "grad_norm": 0.009504380635917187,
+      "learning_rate": 3.681780806244095e-05,
+      "loss": 0.1413,
+      "step": 441
+    },
+    {
+      "epoch": 0.3787489288774636,
+      "grad_norm": 0.010201869532465935,
+      "learning_rate": 3.559522356403788e-05,
+      "loss": 0.1486,
+      "step": 442
+    },
+    {
+      "epoch": 0.3796058269065981,
+      "grad_norm": 0.010059897787868977,
+      "learning_rate": 3.439253262059822e-05,
+      "loss": 0.1314,
+      "step": 443
+    },
+    {
+      "epoch": 0.38046272493573263,
+      "grad_norm": 0.011777847073972225,
+      "learning_rate": 3.3209786751399184e-05,
+      "loss": 0.1499,
+      "step": 444
+    },
+    {
+      "epoch": 0.38131962296486716,
+      "grad_norm": 0.00938204862177372,
+      "learning_rate": 3.2047036621337236e-05,
+      "loss": 0.1453,
+      "step": 445
+    },
+    {
+      "epoch": 0.38217652099400173,
+      "grad_norm": 0.009476981125772,
+      "learning_rate": 3.0904332038757974e-05,
+      "loss": 0.1446,
+      "step": 446
+    },
+    {
+      "epoch": 0.38303341902313626,
+      "grad_norm": 0.009422726929187775,
+      "learning_rate": 2.9781721953322627e-05,
+      "loss": 0.1488,
+      "step": 447
+    },
+    {
+      "epoch": 0.3838903170522708,
+      "grad_norm": 0.00957945454865694,
+      "learning_rate": 2.8679254453910786e-05,
+      "loss": 0.1458,
+      "step": 448
+    },
+    {
+      "epoch": 0.3847472150814053,
+      "grad_norm": 0.009238988161087036,
+      "learning_rate": 2.7596976766560976e-05,
+      "loss": 0.1468,
+      "step": 449
+    },
+    {
+      "epoch": 0.3856041131105398,
+      "grad_norm": 0.010409279726445675,
+      "learning_rate": 2.653493525244721e-05,
+      "loss": 0.1488,
+      "step": 450
+    },
+    {
+      "epoch": 0.3864610111396744,
+      "grad_norm": 0.010871903970837593,
+      "learning_rate": 2.5493175405893076e-05,
+      "loss": 0.1448,
+      "step": 451
+    },
+    {
+      "epoch": 0.3873179091688089,
+      "grad_norm": 0.01018882729113102,
+      "learning_rate": 2.4471741852423235e-05,
+      "loss": 0.149,
+      "step": 452
+    },
+    {
+      "epoch": 0.38817480719794345,
+      "grad_norm": 0.00999706145375967,
+      "learning_rate": 2.3470678346851513e-05,
+      "loss": 0.1476,
+      "step": 453
+    },
+    {
+      "epoch": 0.389031705227078,
+      "grad_norm": 0.008825534954667091,
+      "learning_rate": 2.2490027771406685e-05,
+      "loss": 0.1522,
+      "step": 454
+    },
+    {
+      "epoch": 0.3898886032562125,
+      "grad_norm": 0.010562725365161896,
+      "learning_rate": 2.152983213389559e-05,
+      "loss": 0.1477,
+      "step": 455
+    },
+    {
+      "epoch": 0.390745501285347,
+      "grad_norm": 0.01029855664819479,
+      "learning_rate": 2.0590132565903473e-05,
+      "loss": 0.1603,
+      "step": 456
+    },
+    {
+      "epoch": 0.3916023993144816,
+      "grad_norm": 0.009447697550058365,
+      "learning_rate": 1.9670969321032406e-05,
+      "loss": 0.1507,
+      "step": 457
+    },
+    {
+      "epoch": 0.3924592973436161,
+      "grad_norm": 0.009440843015909195,
+      "learning_rate": 1.8772381773176416e-05,
+      "loss": 0.1487,
+      "step": 458
+    },
+    {
+      "epoch": 0.39331619537275064,
+      "grad_norm": 0.01008307933807373,
+      "learning_rate": 1.7894408414835363e-05,
+      "loss": 0.1482,
+      "step": 459
+    },
+    {
+      "epoch": 0.39417309340188517,
+      "grad_norm": 0.010164221748709679,
+      "learning_rate": 1.70370868554659e-05,
+      "loss": 0.1514,
+      "step": 460
+    },
+    {
+      "epoch": 0.3950299914310197,
+      "grad_norm": 0.009882348589599133,
+      "learning_rate": 1.620045381987012e-05,
+      "loss": 0.1459,
+      "step": 461
+    },
+    {
+      "epoch": 0.39588688946015427,
+      "grad_norm": 0.01052442193031311,
+      "learning_rate": 1.538454514662285e-05,
+      "loss": 0.1565,
+      "step": 462
+    },
+    {
+      "epoch": 0.3967437874892888,
+      "grad_norm": 0.010268788784742355,
+      "learning_rate": 1.4589395786535953e-05,
+      "loss": 0.1491,
+      "step": 463
+    },
+    {
+      "epoch": 0.3976006855184233,
+      "grad_norm": 0.00896680261939764,
+      "learning_rate": 1.3815039801161721e-05,
+      "loss": 0.1434,
+      "step": 464
+    },
+    {
+      "epoch": 0.39845758354755784,
+      "grad_norm": 0.009579429402947426,
+      "learning_rate": 1.3061510361333184e-05,
+      "loss": 0.147,
+      "step": 465
+    },
+    {
+      "epoch": 0.39931448157669236,
+      "grad_norm": 0.009059751406311989,
+      "learning_rate": 1.232883974574367e-05,
+      "loss": 0.1454,
+      "step": 466
+    },
+    {
+      "epoch": 0.4001713796058269,
+      "grad_norm": 0.00879402831196785,
+      "learning_rate": 1.1617059339563806e-05,
+      "loss": 0.1465,
+      "step": 467
+    },
+    {
+      "epoch": 0.40102827763496146,
+      "grad_norm": 0.008100686594843864,
+      "learning_rate": 1.0926199633097156e-05,
+      "loss": 0.1453,
+      "step": 468
+    },
+    {
+      "epoch": 0.401885175664096,
+      "grad_norm": 0.011180667206645012,
+      "learning_rate": 1.0256290220474307e-05,
+      "loss": 0.155,
+      "step": 469
+    },
+    {
+      "epoch": 0.4027420736932305,
+      "grad_norm": 0.009214532561600208,
+      "learning_rate": 9.607359798384786e-06,
+      "loss": 0.152,
+      "step": 470
+    },
+    {
+      "epoch": 0.40359897172236503,
+      "grad_norm": 0.011793004348874092,
+      "learning_rate": 8.979436164848088e-06,
+      "loss": 0.1503,
+      "step": 471
+    },
+    {
+      "epoch": 0.40445586975149955,
+      "grad_norm": 0.008904839865863323,
+      "learning_rate": 8.372546218022748e-06,
+      "loss": 0.1372,
+      "step": 472
+    },
+    {
+      "epoch": 0.40531276778063413,
+      "grad_norm": 0.010819431394338608,
+      "learning_rate": 7.786715955054202e-06,
+      "loss": 0.1517,
+      "step": 473
+    },
+    {
+      "epoch": 0.40616966580976865,
+      "grad_norm": 0.008758115582168102,
+      "learning_rate": 7.221970470961125e-06,
+      "loss": 0.1415,
+      "step": 474
+    },
+    {
+      "epoch": 0.4070265638389032,
+      "grad_norm": 0.009194256737828255,
+      "learning_rate": 6.678333957560512e-06,
+      "loss": 0.1497,
+      "step": 475
+    },
+    {
+      "epoch": 0.4078834618680377,
+      "grad_norm": 0.010541984811425209,
+      "learning_rate": 6.15582970243117e-06,
+      "loss": 0.155,
+      "step": 476
+    },
+    {
+      "epoch": 0.4087403598971722,
+      "grad_norm": 0.009917198680341244,
+      "learning_rate": 5.6544800879163026e-06,
+      "loss": 0.1483,
+      "step": 477
+    },
+    {
+      "epoch": 0.40959725792630675,
+      "grad_norm": 0.01124248094856739,
+      "learning_rate": 5.174306590164879e-06,
+      "loss": 0.1506,
+      "step": 478
+    },
+    {
+      "epoch": 0.4104541559554413,
+      "grad_norm": 0.009757012128829956,
+      "learning_rate": 4.715329778211374e-06,
+      "loss": 0.1448,
+      "step": 479
+    },
+    {
+      "epoch": 0.41131105398457585,
+      "grad_norm": 0.009212766773998737,
+      "learning_rate": 4.277569313094809e-06,
+      "loss": 0.1483,
+      "step": 480
+    },
+    {
+      "epoch": 0.41216795201371037,
+      "grad_norm": 0.01022613886743784,
+      "learning_rate": 3.861043947016474e-06,
+      "loss": 0.1514,
+      "step": 481
+    },
+    {
+      "epoch": 0.4130248500428449,
+      "grad_norm": 0.012525631114840508,
+      "learning_rate": 3.4657715225368535e-06,
+      "loss": 0.1523,
+      "step": 482
+    },
+    {
+      "epoch": 0.4138817480719794,
+      "grad_norm": 0.013477517291903496,
+      "learning_rate": 3.09176897181096e-06,
+      "loss": 0.1508,
+      "step": 483
+    },
+    {
+      "epoch": 0.414738646101114,
+      "grad_norm": 0.009571454487740993,
+      "learning_rate": 2.739052315863355e-06,
+      "loss": 0.1472,
+      "step": 484
+    },
+    {
+      "epoch": 0.4155955441302485,
+      "grad_norm": 0.008747117593884468,
+      "learning_rate": 2.4076366639015913e-06,
+      "loss": 0.149,
+      "step": 485
+    },
+    {
+      "epoch": 0.41645244215938304,
+      "grad_norm": 0.008356280624866486,
+      "learning_rate": 2.097536212669171e-06,
+      "loss": 0.1475,
+      "step": 486
+    },
+    {
+      "epoch": 0.41730934018851756,
+      "grad_norm": 0.008765033446252346,
+      "learning_rate": 1.8087642458373132e-06,
+      "loss": 0.1476,
+      "step": 487
+    },
+    {
+      "epoch": 0.4181662382176521,
+      "grad_norm": 0.009499343112111092,
+      "learning_rate": 1.541333133436018e-06,
+      "loss": 0.1429,
+      "step": 488
+    },
+    {
+      "epoch": 0.4190231362467866,
+      "grad_norm": 0.009461235255002975,
+      "learning_rate": 1.2952543313240472e-06,
+      "loss": 0.1467,
+      "step": 489
+    },
+    {
+      "epoch": 0.4198800342759212,
+      "grad_norm": 0.008559729903936386,
+      "learning_rate": 1.0705383806982606e-06,
+      "loss": 0.1529,
+      "step": 490
+    },
+    {
+      "epoch": 0.4207369323050557,
+      "grad_norm": 0.010164659470319748,
+      "learning_rate": 8.671949076420882e-07,
+      "loss": 0.1567,
+      "step": 491
+    },
+    {
+      "epoch": 0.42159383033419023,
+      "grad_norm": 0.01175006665289402,
+      "learning_rate": 6.852326227130834e-07,
+      "loss": 0.1579,
+      "step": 492
+    },
+    {
+      "epoch": 0.42245072836332476,
+      "grad_norm": 0.013031876645982265,
+      "learning_rate": 5.246593205699424e-07,
+      "loss": 0.1423,
+      "step": 493
+    },
+    {
+      "epoch": 0.4233076263924593,
+      "grad_norm": 0.009586134925484657,
+      "learning_rate": 3.854818796385495e-07,
+      "loss": 0.1539,
+      "step": 494
+    },
+    {
+      "epoch": 0.4241645244215938,
+      "grad_norm": 0.009407884441316128,
+      "learning_rate": 2.677062618171577e-07,
+      "loss": 0.1446,
+      "step": 495
+    },
+    {
+      "epoch": 0.4250214224507284,
+      "grad_norm": 0.00942971371114254,
+      "learning_rate": 1.7133751222137007e-07,
+      "loss": 0.1533,
+      "step": 496
+    },
+    {
+      "epoch": 0.4258783204798629,
+      "grad_norm": 0.009059898555278778,
+      "learning_rate": 9.637975896759077e-08,
+      "loss": 0.1403,
+      "step": 497
+    },
+    {
+      "epoch": 0.4267352185089974,
+      "grad_norm": 0.010631782002747059,
+      "learning_rate": 4.283621299649987e-08,
+      "loss": 0.159,
+      "step": 498
+    },
+    {
+      "epoch": 0.42759211653813195,
+      "grad_norm": 0.009222394786775112,
+      "learning_rate": 1.0709167935385456e-08,
+      "loss": 0.1512,
+      "step": 499
+    },
+    {
+      "epoch": 0.4284490145672665,
+      "grad_norm": 0.008665120229125023,
+      "learning_rate": 0.0,
+      "loss": 0.1478,
+      "step": 500
+    },
+    {
+      "epoch": 0.4284490145672665,
+      "step": 500,
+      "total_flos": 1.0100648106681958e+20,
+      "train_loss": 0.25743724408745766,
+      "train_runtime": 32795.7548,
+      "train_samples_per_second": 0.244,
+      "train_steps_per_second": 0.015
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0100648106681958e+20,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}