mtzig commited on Nov 25, 2024

Commit

aafb541

verified ·

1 Parent(s): f6de213

Training in progress, step 200, checkpoint

Browse files

Files changed (17) hide show

.gitattributes +8 -0
last-checkpoint/optimizer_0/.metadata +0 -0
last-checkpoint/optimizer_0/__0_0.distcp +3 -0
last-checkpoint/optimizer_0/__1_0.distcp +3 -0
last-checkpoint/optimizer_0/__2_0.distcp +3 -0
last-checkpoint/optimizer_0/__3_0.distcp +3 -0
last-checkpoint/pytorch_model_fsdp_0/.metadata +0 -0
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp +3 -0
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp +3 -0
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp +3 -0
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp +3 -0
last-checkpoint/rng_state_0.pth +3 -0
last-checkpoint/rng_state_1.pth +3 -0
last-checkpoint/rng_state_2.pth +3 -0
last-checkpoint/rng_state_3.pth +3 -0
last-checkpoint/scheduler.pt +3 -0
last-checkpoint/trainer_state.json +1565 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+last-checkpoint/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+last-checkpoint/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+last-checkpoint/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+last-checkpoint/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text

last-checkpoint/optimizer_0/.metadata ADDED Viewed

Binary file (369 kB). View file

last-checkpoint/optimizer_0/__0_0.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4eee0bc20bfe612a2406db1927bad535b871029a1459cdfff99c1d8c6c7f3b63
+size 13934748

last-checkpoint/optimizer_0/__1_0.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8480d8b71bc4ba12fadce2b7092485478b8c309ecce318c15ffc6f83a418ea33
+size 13999412

last-checkpoint/optimizer_0/__2_0.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:865ffb2bdf7738b5a7a48e25068e631a1f4cfd3495ea1df1c76166542115412a
+size 13990904

last-checkpoint/optimizer_0/__3_0.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9bc6404ab67370a58b70ca5d2e8919c5e01e34f1cb289a4a6bd798d70aee2dbd
+size 13990904

last-checkpoint/pytorch_model_fsdp_0/.metadata ADDED Viewed

Binary file (135 kB). View file

last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3498e0b6a4e7ed2241f24f000b2120ffa644d285a44cfde97745c9efb6ed358b
+size 6966784

last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d06a365662a6d32a03d081ca66ae94093585c255a49fe32e4fc6101155e341c
+size 6966784

last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44818d96fc5cb3fb73cb12c5017e94708a24961757ad115fff879a4c54351a1b
+size 6966784

last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7503aeea618e7970daff2e762d6b9cc3c0b593f25c7e566d92c8b37634b729e0
+size 6966784

last-checkpoint/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7be0f10bff4b59eb4d3472c8dc5f6f8b12c709dd561a83d4586f3461ec1745a5
+size 14960

last-checkpoint/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e61888020fafc126b7e547b5961b63a5561eea0a9665cf9acb78e192fc0856bc
+size 14960

last-checkpoint/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:065119fcdbace59dd30c03371fc097ed8d58b83537d1b5e3a1f5c321afd26dfd
+size 14960

last-checkpoint/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:061f461111f5cd0052d853db52e46aef61f148d9da594c2cc07a97c23921266c
+size 14960

last-checkpoint/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2f6d7e0e198940381bc01669f2b59ed3c54273b38889812ff9b29559c995120
+size 1064

last-checkpoint/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1565 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.24752475247524752,
+  "eval_steps": 20,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0,
+      "eval_accuracy": 0.7339246119733924,
+      "eval_f1": 0.24528301886792453,
+      "eval_loss": 0.6025775074958801,
+      "eval_precision": 0.6,
+      "eval_recall": 0.1541501976284585,
+      "eval_runtime": 47.5679,
+      "eval_samples_per_second": 5.802,
+      "eval_steps_per_second": 0.189,
+      "step": 0
+    },
+    {
+      "epoch": 0.0012376237623762376,
+      "grad_norm": 2.056412935256958,
+      "learning_rate": 2.469135802469136e-07,
+      "loss": 0.6505,
+      "step": 1
+    },
+    {
+      "epoch": 0.0024752475247524753,
+      "grad_norm": 2.1361210346221924,
+      "learning_rate": 4.938271604938272e-07,
+      "loss": 0.7395,
+      "step": 2
+    },
+    {
+      "epoch": 0.0037128712871287127,
+      "grad_norm": 2.2638471126556396,
+      "learning_rate": 7.407407407407407e-07,
+      "loss": 0.6948,
+      "step": 3
+    },
+    {
+      "epoch": 0.0049504950495049506,
+      "grad_norm": 1.881201148033142,
+      "learning_rate": 9.876543209876544e-07,
+      "loss": 0.6427,
+      "step": 4
+    },
+    {
+      "epoch": 0.006188118811881188,
+      "grad_norm": 2.1328437328338623,
+      "learning_rate": 1.234567901234568e-06,
+      "loss": 0.6554,
+      "step": 5
+    },
+    {
+      "epoch": 0.007425742574257425,
+      "grad_norm": 2.2691922187805176,
+      "learning_rate": 1.4814814814814815e-06,
+      "loss": 0.7034,
+      "step": 6
+    },
+    {
+      "epoch": 0.008663366336633664,
+      "grad_norm": 2.424414873123169,
+      "learning_rate": 1.7283950617283952e-06,
+      "loss": 0.6598,
+      "step": 7
+    },
+    {
+      "epoch": 0.009900990099009901,
+      "grad_norm": 2.1118245124816895,
+      "learning_rate": 1.9753086419753087e-06,
+      "loss": 0.668,
+      "step": 8
+    },
+    {
+      "epoch": 0.011138613861386138,
+      "grad_norm": 1.8890514373779297,
+      "learning_rate": 2.222222222222222e-06,
+      "loss": 0.6658,
+      "step": 9
+    },
+    {
+      "epoch": 0.012376237623762377,
+      "grad_norm": 2.2101762294769287,
+      "learning_rate": 2.469135802469136e-06,
+      "loss": 0.6984,
+      "step": 10
+    },
+    {
+      "epoch": 0.013613861386138614,
+      "grad_norm": 2.1789631843566895,
+      "learning_rate": 2.7160493827160496e-06,
+      "loss": 0.6483,
+      "step": 11
+    },
+    {
+      "epoch": 0.01485148514851485,
+      "grad_norm": 2.1754183769226074,
+      "learning_rate": 2.962962962962963e-06,
+      "loss": 0.6328,
+      "step": 12
+    },
+    {
+      "epoch": 0.01608910891089109,
+      "grad_norm": 1.9709060192108154,
+      "learning_rate": 3.2098765432098767e-06,
+      "loss": 0.6425,
+      "step": 13
+    },
+    {
+      "epoch": 0.017326732673267328,
+      "grad_norm": 2.338000535964966,
+      "learning_rate": 3.4567901234567904e-06,
+      "loss": 0.7665,
+      "step": 14
+    },
+    {
+      "epoch": 0.018564356435643563,
+      "grad_norm": 1.9738425016403198,
+      "learning_rate": 3.7037037037037037e-06,
+      "loss": 0.6994,
+      "step": 15
+    },
+    {
+      "epoch": 0.019801980198019802,
+      "grad_norm": 1.9872663021087646,
+      "learning_rate": 3.9506172839506175e-06,
+      "loss": 0.6101,
+      "step": 16
+    },
+    {
+      "epoch": 0.02103960396039604,
+      "grad_norm": 1.9945553541183472,
+      "learning_rate": 4.197530864197531e-06,
+      "loss": 0.641,
+      "step": 17
+    },
+    {
+      "epoch": 0.022277227722772276,
+      "grad_norm": 2.1487791538238525,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 0.6871,
+      "step": 18
+    },
+    {
+      "epoch": 0.023514851485148515,
+      "grad_norm": 2.6171352863311768,
+      "learning_rate": 4.691358024691358e-06,
+      "loss": 0.6863,
+      "step": 19
+    },
+    {
+      "epoch": 0.024752475247524754,
+      "grad_norm": 1.7834933996200562,
+      "learning_rate": 4.938271604938272e-06,
+      "loss": 0.6391,
+      "step": 20
+    },
+    {
+      "epoch": 0.024752475247524754,
+      "eval_accuracy": 0.7361419068736141,
+      "eval_f1": 0.25625,
+      "eval_loss": 0.5953530669212341,
+      "eval_precision": 0.6119402985074627,
+      "eval_recall": 0.16205533596837945,
+      "eval_runtime": 50.5471,
+      "eval_samples_per_second": 5.46,
+      "eval_steps_per_second": 0.178,
+      "step": 20
+    },
+    {
+      "epoch": 0.02599009900990099,
+      "grad_norm": 2.140673875808716,
+      "learning_rate": 5.185185185185185e-06,
+      "loss": 0.6099,
+      "step": 21
+    },
+    {
+      "epoch": 0.027227722772277228,
+      "grad_norm": 1.9627602100372314,
+      "learning_rate": 5.432098765432099e-06,
+      "loss": 0.6677,
+      "step": 22
+    },
+    {
+      "epoch": 0.028465346534653466,
+      "grad_norm": 1.9993869066238403,
+      "learning_rate": 5.6790123456790125e-06,
+      "loss": 0.6015,
+      "step": 23
+    },
+    {
+      "epoch": 0.0297029702970297,
+      "grad_norm": 1.7692540884017944,
+      "learning_rate": 5.925925925925926e-06,
+      "loss": 0.5969,
+      "step": 24
+    },
+    {
+      "epoch": 0.03094059405940594,
+      "grad_norm": 2.137422561645508,
+      "learning_rate": 6.17283950617284e-06,
+      "loss": 0.6501,
+      "step": 25
+    },
+    {
+      "epoch": 0.03217821782178218,
+      "grad_norm": 1.9657728672027588,
+      "learning_rate": 6.419753086419753e-06,
+      "loss": 0.6085,
+      "step": 26
+    },
+    {
+      "epoch": 0.03341584158415842,
+      "grad_norm": 1.7881442308425903,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 0.635,
+      "step": 27
+    },
+    {
+      "epoch": 0.034653465346534656,
+      "grad_norm": 2.832048177719116,
+      "learning_rate": 6.913580246913581e-06,
+      "loss": 0.7251,
+      "step": 28
+    },
+    {
+      "epoch": 0.03589108910891089,
+      "grad_norm": 1.9947174787521362,
+      "learning_rate": 7.160493827160494e-06,
+      "loss": 0.6394,
+      "step": 29
+    },
+    {
+      "epoch": 0.03712871287128713,
+      "grad_norm": 2.0211126804351807,
+      "learning_rate": 7.4074074074074075e-06,
+      "loss": 0.6082,
+      "step": 30
+    },
+    {
+      "epoch": 0.038366336633663366,
+      "grad_norm": 1.9397317171096802,
+      "learning_rate": 7.654320987654322e-06,
+      "loss": 0.6465,
+      "step": 31
+    },
+    {
+      "epoch": 0.039603960396039604,
+      "grad_norm": 2.2408998012542725,
+      "learning_rate": 7.901234567901235e-06,
+      "loss": 0.643,
+      "step": 32
+    },
+    {
+      "epoch": 0.04084158415841584,
+      "grad_norm": 1.9772993326187134,
+      "learning_rate": 8.148148148148148e-06,
+      "loss": 0.618,
+      "step": 33
+    },
+    {
+      "epoch": 0.04207920792079208,
+      "grad_norm": 1.6278493404388428,
+      "learning_rate": 8.395061728395062e-06,
+      "loss": 0.6425,
+      "step": 34
+    },
+    {
+      "epoch": 0.043316831683168314,
+      "grad_norm": 1.9789159297943115,
+      "learning_rate": 8.641975308641975e-06,
+      "loss": 0.6046,
+      "step": 35
+    },
+    {
+      "epoch": 0.04455445544554455,
+      "grad_norm": 1.801087498664856,
+      "learning_rate": 8.888888888888888e-06,
+      "loss": 0.6561,
+      "step": 36
+    },
+    {
+      "epoch": 0.04579207920792079,
+      "grad_norm": 1.5089136362075806,
+      "learning_rate": 9.135802469135803e-06,
+      "loss": 0.5883,
+      "step": 37
+    },
+    {
+      "epoch": 0.04702970297029703,
+      "grad_norm": 1.676107406616211,
+      "learning_rate": 9.382716049382717e-06,
+      "loss": 0.5684,
+      "step": 38
+    },
+    {
+      "epoch": 0.04826732673267327,
+      "grad_norm": 1.8138374090194702,
+      "learning_rate": 9.62962962962963e-06,
+      "loss": 0.6034,
+      "step": 39
+    },
+    {
+      "epoch": 0.04950495049504951,
+      "grad_norm": 1.7539325952529907,
+      "learning_rate": 9.876543209876543e-06,
+      "loss": 0.5891,
+      "step": 40
+    },
+    {
+      "epoch": 0.04950495049504951,
+      "eval_accuracy": 0.7549889135254989,
+      "eval_f1": 0.4318766066838046,
+      "eval_loss": 0.556958794593811,
+      "eval_precision": 0.6176470588235294,
+      "eval_recall": 0.33201581027667987,
+      "eval_runtime": 48.6708,
+      "eval_samples_per_second": 5.671,
+      "eval_steps_per_second": 0.185,
+      "step": 40
+    },
+    {
+      "epoch": 0.050742574257425746,
+      "grad_norm": 1.4187287092208862,
+      "learning_rate": 1.0123456790123458e-05,
+      "loss": 0.5636,
+      "step": 41
+    },
+    {
+      "epoch": 0.05198019801980198,
+      "grad_norm": 1.9447287321090698,
+      "learning_rate": 1.037037037037037e-05,
+      "loss": 0.5496,
+      "step": 42
+    },
+    {
+      "epoch": 0.053217821782178217,
+      "grad_norm": 1.6454174518585205,
+      "learning_rate": 1.0617283950617285e-05,
+      "loss": 0.5807,
+      "step": 43
+    },
+    {
+      "epoch": 0.054455445544554455,
+      "grad_norm": 1.7853933572769165,
+      "learning_rate": 1.0864197530864198e-05,
+      "loss": 0.6028,
+      "step": 44
+    },
+    {
+      "epoch": 0.055693069306930694,
+      "grad_norm": 1.6090970039367676,
+      "learning_rate": 1.1111111111111113e-05,
+      "loss": 0.5838,
+      "step": 45
+    },
+    {
+      "epoch": 0.05693069306930693,
+      "grad_norm": 2.3328471183776855,
+      "learning_rate": 1.1358024691358025e-05,
+      "loss": 0.5993,
+      "step": 46
+    },
+    {
+      "epoch": 0.05816831683168317,
+      "grad_norm": 2.4744842052459717,
+      "learning_rate": 1.160493827160494e-05,
+      "loss": 0.6092,
+      "step": 47
+    },
+    {
+      "epoch": 0.0594059405940594,
+      "grad_norm": 1.7244300842285156,
+      "learning_rate": 1.1851851851851852e-05,
+      "loss": 0.5969,
+      "step": 48
+    },
+    {
+      "epoch": 0.06064356435643564,
+      "grad_norm": 1.6698678731918335,
+      "learning_rate": 1.2098765432098767e-05,
+      "loss": 0.5254,
+      "step": 49
+    },
+    {
+      "epoch": 0.06188118811881188,
+      "grad_norm": 1.591994285583496,
+      "learning_rate": 1.234567901234568e-05,
+      "loss": 0.5509,
+      "step": 50
+    },
+    {
+      "epoch": 0.06311881188118812,
+      "grad_norm": 1.9688084125518799,
+      "learning_rate": 1.2592592592592593e-05,
+      "loss": 0.5232,
+      "step": 51
+    },
+    {
+      "epoch": 0.06435643564356436,
+      "grad_norm": 2.0831687450408936,
+      "learning_rate": 1.2839506172839507e-05,
+      "loss": 0.5141,
+      "step": 52
+    },
+    {
+      "epoch": 0.0655940594059406,
+      "grad_norm": 2.0480973720550537,
+      "learning_rate": 1.3086419753086422e-05,
+      "loss": 0.5669,
+      "step": 53
+    },
+    {
+      "epoch": 0.06683168316831684,
+      "grad_norm": 1.5781453847885132,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 0.5065,
+      "step": 54
+    },
+    {
+      "epoch": 0.06806930693069307,
+      "grad_norm": 2.123061180114746,
+      "learning_rate": 1.3580246913580248e-05,
+      "loss": 0.4856,
+      "step": 55
+    },
+    {
+      "epoch": 0.06930693069306931,
+      "grad_norm": 2.2889890670776367,
+      "learning_rate": 1.3827160493827162e-05,
+      "loss": 0.4936,
+      "step": 56
+    },
+    {
+      "epoch": 0.07054455445544554,
+      "grad_norm": 2.201887607574463,
+      "learning_rate": 1.4074074074074075e-05,
+      "loss": 0.538,
+      "step": 57
+    },
+    {
+      "epoch": 0.07178217821782178,
+      "grad_norm": 1.8556184768676758,
+      "learning_rate": 1.4320987654320988e-05,
+      "loss": 0.5091,
+      "step": 58
+    },
+    {
+      "epoch": 0.07301980198019802,
+      "grad_norm": 1.5986840724945068,
+      "learning_rate": 1.4567901234567903e-05,
+      "loss": 0.4939,
+      "step": 59
+    },
+    {
+      "epoch": 0.07425742574257425,
+      "grad_norm": 2.35420560836792,
+      "learning_rate": 1.4814814814814815e-05,
+      "loss": 0.4606,
+      "step": 60
+    },
+    {
+      "epoch": 0.07425742574257425,
+      "eval_accuracy": 0.779379157427938,
+      "eval_f1": 0.5204819277108433,
+      "eval_loss": 0.4962254464626312,
+      "eval_precision": 0.6666666666666666,
+      "eval_recall": 0.4268774703557312,
+      "eval_runtime": 47.7725,
+      "eval_samples_per_second": 5.777,
+      "eval_steps_per_second": 0.188,
+      "step": 60
+    },
+    {
+      "epoch": 0.07549504950495049,
+      "grad_norm": 2.571995496749878,
+      "learning_rate": 1.506172839506173e-05,
+      "loss": 0.538,
+      "step": 61
+    },
+    {
+      "epoch": 0.07673267326732673,
+      "grad_norm": 2.467172622680664,
+      "learning_rate": 1.5308641975308643e-05,
+      "loss": 0.5176,
+      "step": 62
+    },
+    {
+      "epoch": 0.07797029702970297,
+      "grad_norm": 1.9836307764053345,
+      "learning_rate": 1.555555555555556e-05,
+      "loss": 0.544,
+      "step": 63
+    },
+    {
+      "epoch": 0.07920792079207921,
+      "grad_norm": 1.576439380645752,
+      "learning_rate": 1.580246913580247e-05,
+      "loss": 0.4453,
+      "step": 64
+    },
+    {
+      "epoch": 0.08044554455445545,
+      "grad_norm": 1.6136027574539185,
+      "learning_rate": 1.6049382716049385e-05,
+      "loss": 0.46,
+      "step": 65
+    },
+    {
+      "epoch": 0.08168316831683169,
+      "grad_norm": 2.130403518676758,
+      "learning_rate": 1.6296296296296297e-05,
+      "loss": 0.4797,
+      "step": 66
+    },
+    {
+      "epoch": 0.08292079207920793,
+      "grad_norm": 2.6445112228393555,
+      "learning_rate": 1.654320987654321e-05,
+      "loss": 0.5095,
+      "step": 67
+    },
+    {
+      "epoch": 0.08415841584158416,
+      "grad_norm": 2.384965658187866,
+      "learning_rate": 1.6790123456790123e-05,
+      "loss": 0.478,
+      "step": 68
+    },
+    {
+      "epoch": 0.0853960396039604,
+      "grad_norm": 1.9021402597427368,
+      "learning_rate": 1.7037037037037038e-05,
+      "loss": 0.4508,
+      "step": 69
+    },
+    {
+      "epoch": 0.08663366336633663,
+      "grad_norm": 2.2608911991119385,
+      "learning_rate": 1.728395061728395e-05,
+      "loss": 0.4828,
+      "step": 70
+    },
+    {
+      "epoch": 0.08787128712871287,
+      "grad_norm": 2.5560309886932373,
+      "learning_rate": 1.7530864197530865e-05,
+      "loss": 0.4429,
+      "step": 71
+    },
+    {
+      "epoch": 0.0891089108910891,
+      "grad_norm": 3.586392879486084,
+      "learning_rate": 1.7777777777777777e-05,
+      "loss": 0.393,
+      "step": 72
+    },
+    {
+      "epoch": 0.09034653465346534,
+      "grad_norm": 2.5128958225250244,
+      "learning_rate": 1.802469135802469e-05,
+      "loss": 0.4795,
+      "step": 73
+    },
+    {
+      "epoch": 0.09158415841584158,
+      "grad_norm": 2.255323886871338,
+      "learning_rate": 1.8271604938271607e-05,
+      "loss": 0.3733,
+      "step": 74
+    },
+    {
+      "epoch": 0.09282178217821782,
+      "grad_norm": 1.9865373373031616,
+      "learning_rate": 1.851851851851852e-05,
+      "loss": 0.3899,
+      "step": 75
+    },
+    {
+      "epoch": 0.09405940594059406,
+      "grad_norm": 2.985546588897705,
+      "learning_rate": 1.8765432098765433e-05,
+      "loss": 0.3784,
+      "step": 76
+    },
+    {
+      "epoch": 0.0952970297029703,
+      "grad_norm": 3.0742247104644775,
+      "learning_rate": 1.901234567901235e-05,
+      "loss": 0.4457,
+      "step": 77
+    },
+    {
+      "epoch": 0.09653465346534654,
+      "grad_norm": 2.365544319152832,
+      "learning_rate": 1.925925925925926e-05,
+      "loss": 0.3507,
+      "step": 78
+    },
+    {
+      "epoch": 0.09777227722772278,
+      "grad_norm": 3.4621968269348145,
+      "learning_rate": 1.9506172839506175e-05,
+      "loss": 0.405,
+      "step": 79
+    },
+    {
+      "epoch": 0.09900990099009901,
+      "grad_norm": 3.251645088195801,
+      "learning_rate": 1.9753086419753087e-05,
+      "loss": 0.4229,
+      "step": 80
+    },
+    {
+      "epoch": 0.09900990099009901,
+      "eval_accuracy": 0.7904656319290465,
+      "eval_f1": 0.5771812080536913,
+      "eval_loss": 0.4432809352874756,
+      "eval_precision": 0.6649484536082474,
+      "eval_recall": 0.5098814229249012,
+      "eval_runtime": 48.2096,
+      "eval_samples_per_second": 5.725,
+      "eval_steps_per_second": 0.187,
+      "step": 80
+    },
+    {
+      "epoch": 0.10024752475247525,
+      "grad_norm": 3.5432498455047607,
+      "learning_rate": 2e-05,
+      "loss": 0.3498,
+      "step": 81
+    },
+    {
+      "epoch": 0.10148514851485149,
+      "grad_norm": 4.109142303466797,
+      "learning_rate": 1.9999906631527858e-05,
+      "loss": 0.3289,
+      "step": 82
+    },
+    {
+      "epoch": 0.10272277227722772,
+      "grad_norm": 3.4147417545318604,
+      "learning_rate": 1.9999626527854966e-05,
+      "loss": 0.2813,
+      "step": 83
+    },
+    {
+      "epoch": 0.10396039603960396,
+      "grad_norm": 5.5374436378479,
+      "learning_rate": 1.9999159694211894e-05,
+      "loss": 0.3393,
+      "step": 84
+    },
+    {
+      "epoch": 0.1051980198019802,
+      "grad_norm": 4.537343502044678,
+      "learning_rate": 1.999850613931615e-05,
+      "loss": 0.4392,
+      "step": 85
+    },
+    {
+      "epoch": 0.10643564356435643,
+      "grad_norm": 3.075702428817749,
+      "learning_rate": 1.999766587537202e-05,
+      "loss": 0.3329,
+      "step": 86
+    },
+    {
+      "epoch": 0.10767326732673267,
+      "grad_norm": 6.164308071136475,
+      "learning_rate": 1.9996638918070336e-05,
+      "loss": 0.3292,
+      "step": 87
+    },
+    {
+      "epoch": 0.10891089108910891,
+      "grad_norm": 3.1993377208709717,
+      "learning_rate": 1.9995425286588187e-05,
+      "loss": 0.318,
+      "step": 88
+    },
+    {
+      "epoch": 0.11014851485148515,
+      "grad_norm": 3.789552927017212,
+      "learning_rate": 1.9994025003588547e-05,
+      "loss": 0.3504,
+      "step": 89
+    },
+    {
+      "epoch": 0.11138613861386139,
+      "grad_norm": 4.15277624130249,
+      "learning_rate": 1.9992438095219886e-05,
+      "loss": 0.2838,
+      "step": 90
+    },
+    {
+      "epoch": 0.11262376237623763,
+      "grad_norm": 3.4878060817718506,
+      "learning_rate": 1.9990664591115637e-05,
+      "loss": 0.3165,
+      "step": 91
+    },
+    {
+      "epoch": 0.11386138613861387,
+      "grad_norm": 5.2607035636901855,
+      "learning_rate": 1.9988704524393678e-05,
+      "loss": 0.3229,
+      "step": 92
+    },
+    {
+      "epoch": 0.1150990099009901,
+      "grad_norm": 6.290886878967285,
+      "learning_rate": 1.9986557931655688e-05,
+      "loss": 0.3629,
+      "step": 93
+    },
+    {
+      "epoch": 0.11633663366336634,
+      "grad_norm": 7.600953102111816,
+      "learning_rate": 1.9984224852986494e-05,
+      "loss": 0.3405,
+      "step": 94
+    },
+    {
+      "epoch": 0.11757425742574257,
+      "grad_norm": 4.730844974517822,
+      "learning_rate": 1.9981705331953295e-05,
+      "loss": 0.3718,
+      "step": 95
+    },
+    {
+      "epoch": 0.1188118811881188,
+      "grad_norm": 5.086641788482666,
+      "learning_rate": 1.9978999415604847e-05,
+      "loss": 0.2757,
+      "step": 96
+    },
+    {
+      "epoch": 0.12004950495049505,
+      "grad_norm": 6.739199161529541,
+      "learning_rate": 1.9976107154470613e-05,
+      "loss": 0.2859,
+      "step": 97
+    },
+    {
+      "epoch": 0.12128712871287128,
+      "grad_norm": 4.352366924285889,
+      "learning_rate": 1.9973028602559787e-05,
+      "loss": 0.3398,
+      "step": 98
+    },
+    {
+      "epoch": 0.12252475247524752,
+      "grad_norm": 7.858609199523926,
+      "learning_rate": 1.9969763817360314e-05,
+      "loss": 0.471,
+      "step": 99
+    },
+    {
+      "epoch": 0.12376237623762376,
+      "grad_norm": 5.571165561676025,
+      "learning_rate": 1.996631285983779e-05,
+      "loss": 0.3836,
+      "step": 100
+    },
+    {
+      "epoch": 0.12376237623762376,
+      "eval_accuracy": 0.8159645232815964,
+      "eval_f1": 0.6047619047619047,
+      "eval_loss": 0.42972368001937866,
+      "eval_precision": 0.7604790419161677,
+      "eval_recall": 0.5019762845849802,
+      "eval_runtime": 48.4236,
+      "eval_samples_per_second": 5.7,
+      "eval_steps_per_second": 0.186,
+      "step": 100
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 4.134688854217529,
+      "learning_rate": 1.9962675794434342e-05,
+      "loss": 0.2516,
+      "step": 101
+    },
+    {
+      "epoch": 0.12623762376237624,
+      "grad_norm": 3.988821506500244,
+      "learning_rate": 1.9958852689067423e-05,
+      "loss": 0.2509,
+      "step": 102
+    },
+    {
+      "epoch": 0.12747524752475248,
+      "grad_norm": 5.836869716644287,
+      "learning_rate": 1.9954843615128528e-05,
+      "loss": 0.3183,
+      "step": 103
+    },
+    {
+      "epoch": 0.12871287128712872,
+      "grad_norm": 9.7975492477417,
+      "learning_rate": 1.995064864748188e-05,
+      "loss": 0.3471,
+      "step": 104
+    },
+    {
+      "epoch": 0.12995049504950495,
+      "grad_norm": 5.1211066246032715,
+      "learning_rate": 1.9946267864463027e-05,
+      "loss": 0.3466,
+      "step": 105
+    },
+    {
+      "epoch": 0.1311881188118812,
+      "grad_norm": 5.172476291656494,
+      "learning_rate": 1.994170134787737e-05,
+      "loss": 0.3442,
+      "step": 106
+    },
+    {
+      "epoch": 0.13242574257425743,
+      "grad_norm": 4.703874111175537,
+      "learning_rate": 1.993694918299864e-05,
+      "loss": 0.3027,
+      "step": 107
+    },
+    {
+      "epoch": 0.13366336633663367,
+      "grad_norm": 3.981438398361206,
+      "learning_rate": 1.9932011458567315e-05,
+      "loss": 0.2803,
+      "step": 108
+    },
+    {
+      "epoch": 0.1349009900990099,
+      "grad_norm": 3.627497911453247,
+      "learning_rate": 1.9926888266788955e-05,
+      "loss": 0.3011,
+      "step": 109
+    },
+    {
+      "epoch": 0.13613861386138615,
+      "grad_norm": 5.726022720336914,
+      "learning_rate": 1.9921579703332475e-05,
+      "loss": 0.3463,
+      "step": 110
+    },
+    {
+      "epoch": 0.1373762376237624,
+      "grad_norm": 3.9661319255828857,
+      "learning_rate": 1.991608586732837e-05,
+      "loss": 0.3455,
+      "step": 111
+    },
+    {
+      "epoch": 0.13861386138613863,
+      "grad_norm": 4.330716133117676,
+      "learning_rate": 1.991040686136685e-05,
+      "loss": 0.2888,
+      "step": 112
+    },
+    {
+      "epoch": 0.13985148514851486,
+      "grad_norm": 2.6466479301452637,
+      "learning_rate": 1.9904542791495938e-05,
+      "loss": 0.2423,
+      "step": 113
+    },
+    {
+      "epoch": 0.14108910891089108,
+      "grad_norm": 3.5607573986053467,
+      "learning_rate": 1.9898493767219486e-05,
+      "loss": 0.2481,
+      "step": 114
+    },
+    {
+      "epoch": 0.14232673267326731,
+      "grad_norm": 3.259629011154175,
+      "learning_rate": 1.989225990149512e-05,
+      "loss": 0.2707,
+      "step": 115
+    },
+    {
+      "epoch": 0.14356435643564355,
+      "grad_norm": 3.952185869216919,
+      "learning_rate": 1.988584131073215e-05,
+      "loss": 0.2607,
+      "step": 116
+    },
+    {
+      "epoch": 0.1448019801980198,
+      "grad_norm": 2.9898970127105713,
+      "learning_rate": 1.9879238114789375e-05,
+      "loss": 0.2234,
+      "step": 117
+    },
+    {
+      "epoch": 0.14603960396039603,
+      "grad_norm": 3.857395648956299,
+      "learning_rate": 1.9872450436972856e-05,
+      "loss": 0.2691,
+      "step": 118
+    },
+    {
+      "epoch": 0.14727722772277227,
+      "grad_norm": 4.034820079803467,
+      "learning_rate": 1.986547840403362e-05,
+      "loss": 0.3632,
+      "step": 119
+    },
+    {
+      "epoch": 0.1485148514851485,
+      "grad_norm": 3.5433619022369385,
+      "learning_rate": 1.9858322146165272e-05,
+      "loss": 0.3363,
+      "step": 120
+    },
+    {
+      "epoch": 0.1485148514851485,
+      "eval_accuracy": 0.8381374722838137,
+      "eval_f1": 0.6666666666666666,
+      "eval_loss": 0.36761781573295593,
+      "eval_precision": 0.7891891891891892,
+      "eval_recall": 0.5770750988142292,
+      "eval_runtime": 48.4565,
+      "eval_samples_per_second": 5.696,
+      "eval_steps_per_second": 0.186,
+      "step": 120
+    },
+    {
+      "epoch": 0.14975247524752475,
+      "grad_norm": 4.58292818069458,
+      "learning_rate": 1.9850981797001593e-05,
+      "loss": 0.2657,
+      "step": 121
+    },
+    {
+      "epoch": 0.15099009900990099,
+      "grad_norm": 4.649030685424805,
+      "learning_rate": 1.9843457493614016e-05,
+      "loss": 0.2851,
+      "step": 122
+    },
+    {
+      "epoch": 0.15222772277227722,
+      "grad_norm": 4.370965957641602,
+      "learning_rate": 1.9835749376509084e-05,
+      "loss": 0.2917,
+      "step": 123
+    },
+    {
+      "epoch": 0.15346534653465346,
+      "grad_norm": 5.558561325073242,
+      "learning_rate": 1.9827857589625817e-05,
+      "loss": 0.2922,
+      "step": 124
+    },
+    {
+      "epoch": 0.1547029702970297,
+      "grad_norm": 3.4896552562713623,
+      "learning_rate": 1.981978228033304e-05,
+      "loss": 0.2478,
+      "step": 125
+    },
+    {
+      "epoch": 0.15594059405940594,
+      "grad_norm": 5.457974910736084,
+      "learning_rate": 1.9811523599426604e-05,
+      "loss": 0.3341,
+      "step": 126
+    },
+    {
+      "epoch": 0.15717821782178218,
+      "grad_norm": 3.6488845348358154,
+      "learning_rate": 1.980308170112659e-05,
+      "loss": 0.2577,
+      "step": 127
+    },
+    {
+      "epoch": 0.15841584158415842,
+      "grad_norm": 3.6894092559814453,
+      "learning_rate": 1.979445674307444e-05,
+      "loss": 0.2544,
+      "step": 128
+    },
+    {
+      "epoch": 0.15965346534653466,
+      "grad_norm": 5.288538455963135,
+      "learning_rate": 1.9785648886329974e-05,
+      "loss": 0.2452,
+      "step": 129
+    },
+    {
+      "epoch": 0.1608910891089109,
+      "grad_norm": 6.3318305015563965,
+      "learning_rate": 1.977665829536842e-05,
+      "loss": 0.2628,
+      "step": 130
+    },
+    {
+      "epoch": 0.16212871287128713,
+      "grad_norm": 5.06384801864624,
+      "learning_rate": 1.9767485138077327e-05,
+      "loss": 0.337,
+      "step": 131
+    },
+    {
+      "epoch": 0.16336633663366337,
+      "grad_norm": 3.954658269882202,
+      "learning_rate": 1.9758129585753433e-05,
+      "loss": 0.2729,
+      "step": 132
+    },
+    {
+      "epoch": 0.1646039603960396,
+      "grad_norm": 3.3781790733337402,
+      "learning_rate": 1.9748591813099457e-05,
+      "loss": 0.2204,
+      "step": 133
+    },
+    {
+      "epoch": 0.16584158415841585,
+      "grad_norm": 5.148495674133301,
+      "learning_rate": 1.9738871998220857e-05,
+      "loss": 0.2585,
+      "step": 134
+    },
+    {
+      "epoch": 0.1670792079207921,
+      "grad_norm": 4.203769207000732,
+      "learning_rate": 1.9728970322622485e-05,
+      "loss": 0.3102,
+      "step": 135
+    },
+    {
+      "epoch": 0.16831683168316833,
+      "grad_norm": 3.7691049575805664,
+      "learning_rate": 1.9718886971205206e-05,
+      "loss": 0.2592,
+      "step": 136
+    },
+    {
+      "epoch": 0.16955445544554457,
+      "grad_norm": 5.7634711265563965,
+      "learning_rate": 1.970862213226244e-05,
+      "loss": 0.2607,
+      "step": 137
+    },
+    {
+      "epoch": 0.1707920792079208,
+      "grad_norm": 4.632352828979492,
+      "learning_rate": 1.9698175997476657e-05,
+      "loss": 0.2914,
+      "step": 138
+    },
+    {
+      "epoch": 0.17202970297029702,
+      "grad_norm": 5.2901434898376465,
+      "learning_rate": 1.968754876191578e-05,
+      "loss": 0.2874,
+      "step": 139
+    },
+    {
+      "epoch": 0.17326732673267325,
+      "grad_norm": 3.2094457149505615,
+      "learning_rate": 1.9676740624029566e-05,
+      "loss": 0.2483,
+      "step": 140
+    },
+    {
+      "epoch": 0.17326732673267325,
+      "eval_accuracy": 0.8403547671840355,
+      "eval_f1": 0.6587677725118484,
+      "eval_loss": 0.35367104411125183,
+      "eval_precision": 0.8224852071005917,
+      "eval_recall": 0.549407114624506,
+      "eval_runtime": 49.1165,
+      "eval_samples_per_second": 5.619,
+      "eval_steps_per_second": 0.183,
+      "step": 140
+    },
+    {
+      "epoch": 0.1745049504950495,
+      "grad_norm": 3.4511711597442627,
+      "learning_rate": 1.9665751785645874e-05,
+      "loss": 0.2277,
+      "step": 141
+    },
+    {
+      "epoch": 0.17574257425742573,
+      "grad_norm": 3.3621718883514404,
+      "learning_rate": 1.9654582451966915e-05,
+      "loss": 0.2893,
+      "step": 142
+    },
+    {
+      "epoch": 0.17698019801980197,
+      "grad_norm": 4.829539775848389,
+      "learning_rate": 1.9643232831565417e-05,
+      "loss": 0.2127,
+      "step": 143
+    },
+    {
+      "epoch": 0.1782178217821782,
+      "grad_norm": 4.233989715576172,
+      "learning_rate": 1.9631703136380716e-05,
+      "loss": 0.2133,
+      "step": 144
+    },
+    {
+      "epoch": 0.17945544554455445,
+      "grad_norm": 9.943169593811035,
+      "learning_rate": 1.961999358171482e-05,
+      "loss": 0.442,
+      "step": 145
+    },
+    {
+      "epoch": 0.1806930693069307,
+      "grad_norm": 4.362405300140381,
+      "learning_rate": 1.960810438622838e-05,
+      "loss": 0.2677,
+      "step": 146
+    },
+    {
+      "epoch": 0.18193069306930693,
+      "grad_norm": 4.714008808135986,
+      "learning_rate": 1.959603577193659e-05,
+      "loss": 0.3213,
+      "step": 147
+    },
+    {
+      "epoch": 0.18316831683168316,
+      "grad_norm": 3.655679702758789,
+      "learning_rate": 1.9583787964205073e-05,
+      "loss": 0.199,
+      "step": 148
+    },
+    {
+      "epoch": 0.1844059405940594,
+      "grad_norm": 4.397619247436523,
+      "learning_rate": 1.9571361191745647e-05,
+      "loss": 0.2728,
+      "step": 149
+    },
+    {
+      "epoch": 0.18564356435643564,
+      "grad_norm": 4.055555820465088,
+      "learning_rate": 1.955875568661206e-05,
+      "loss": 0.2461,
+      "step": 150
+    },
+    {
+      "epoch": 0.18688118811881188,
+      "grad_norm": 4.366605281829834,
+      "learning_rate": 1.9545971684195664e-05,
+      "loss": 0.2026,
+      "step": 151
+    },
+    {
+      "epoch": 0.18811881188118812,
+      "grad_norm": 3.7074687480926514,
+      "learning_rate": 1.9533009423221014e-05,
+      "loss": 0.2817,
+      "step": 152
+    },
+    {
+      "epoch": 0.18935643564356436,
+      "grad_norm": 4.276401996612549,
+      "learning_rate": 1.951986914574141e-05,
+      "loss": 0.2661,
+      "step": 153
+    },
+    {
+      "epoch": 0.1905940594059406,
+      "grad_norm": 3.917130708694458,
+      "learning_rate": 1.9506551097134384e-05,
+      "loss": 0.3005,
+      "step": 154
+    },
+    {
+      "epoch": 0.19183168316831684,
+      "grad_norm": 6.731651306152344,
+      "learning_rate": 1.94930555260971e-05,
+      "loss": 0.2892,
+      "step": 155
+    },
+    {
+      "epoch": 0.19306930693069307,
+      "grad_norm": 4.87600564956665,
+      "learning_rate": 1.947938268464173e-05,
+      "loss": 0.1983,
+      "step": 156
+    },
+    {
+      "epoch": 0.1943069306930693,
+      "grad_norm": 4.437981605529785,
+      "learning_rate": 1.9465532828090735e-05,
+      "loss": 0.2479,
+      "step": 157
+    },
+    {
+      "epoch": 0.19554455445544555,
+      "grad_norm": 3.6721622943878174,
+      "learning_rate": 1.9451506215072106e-05,
+      "loss": 0.243,
+      "step": 158
+    },
+    {
+      "epoch": 0.1967821782178218,
+      "grad_norm": 3.8687756061553955,
+      "learning_rate": 1.943730310751453e-05,
+      "loss": 0.2619,
+      "step": 159
+    },
+    {
+      "epoch": 0.19801980198019803,
+      "grad_norm": 4.864063739776611,
+      "learning_rate": 1.9422923770642494e-05,
+      "loss": 0.2803,
+      "step": 160
+    },
+    {
+      "epoch": 0.19801980198019803,
+      "eval_accuracy": 0.8414634146341463,
+      "eval_f1": 0.6520681265206812,
+      "eval_loss": 0.34682103991508484,
+      "eval_precision": 0.8481012658227848,
+      "eval_recall": 0.5296442687747036,
+      "eval_runtime": 49.8936,
+      "eval_samples_per_second": 5.532,
+      "eval_steps_per_second": 0.18,
+      "step": 160
+    },
+    {
+      "epoch": 0.19925742574257427,
+      "grad_norm": 3.036126136779785,
+      "learning_rate": 1.9408368472971344e-05,
+      "loss": 0.2777,
+      "step": 161
+    },
+    {
+      "epoch": 0.2004950495049505,
+      "grad_norm": 3.19771409034729,
+      "learning_rate": 1.9393637486302257e-05,
+      "loss": 0.2741,
+      "step": 162
+    },
+    {
+      "epoch": 0.20173267326732675,
+      "grad_norm": 4.557991027832031,
+      "learning_rate": 1.937873108571718e-05,
+      "loss": 0.2677,
+      "step": 163
+    },
+    {
+      "epoch": 0.20297029702970298,
+      "grad_norm": 4.806491374969482,
+      "learning_rate": 1.936364954957368e-05,
+      "loss": 0.2728,
+      "step": 164
+    },
+    {
+      "epoch": 0.2042079207920792,
+      "grad_norm": 5.901110649108887,
+      "learning_rate": 1.934839315949976e-05,
+      "loss": 0.2406,
+      "step": 165
+    },
+    {
+      "epoch": 0.20544554455445543,
+      "grad_norm": 3.7812883853912354,
+      "learning_rate": 1.933296220038858e-05,
+      "loss": 0.2857,
+      "step": 166
+    },
+    {
+      "epoch": 0.20668316831683167,
+      "grad_norm": 4.161533832550049,
+      "learning_rate": 1.9317356960393158e-05,
+      "loss": 0.2132,
+      "step": 167
+    },
+    {
+      "epoch": 0.2079207920792079,
+      "grad_norm": 3.8676390647888184,
+      "learning_rate": 1.9301577730920975e-05,
+      "loss": 0.2486,
+      "step": 168
+    },
+    {
+      "epoch": 0.20915841584158415,
+      "grad_norm": 4.488946437835693,
+      "learning_rate": 1.9285624806628543e-05,
+      "loss": 0.2859,
+      "step": 169
+    },
+    {
+      "epoch": 0.2103960396039604,
+      "grad_norm": 3.541072130203247,
+      "learning_rate": 1.9269498485415897e-05,
+      "loss": 0.2522,
+      "step": 170
+    },
+    {
+      "epoch": 0.21163366336633663,
+      "grad_norm": 3.683732509613037,
+      "learning_rate": 1.925319906842103e-05,
+      "loss": 0.223,
+      "step": 171
+    },
+    {
+      "epoch": 0.21287128712871287,
+      "grad_norm": 3.875123977661133,
+      "learning_rate": 1.923672686001427e-05,
+      "loss": 0.2906,
+      "step": 172
+    },
+    {
+      "epoch": 0.2141089108910891,
+      "grad_norm": 4.992143630981445,
+      "learning_rate": 1.922008216779261e-05,
+      "loss": 0.2183,
+      "step": 173
+    },
+    {
+      "epoch": 0.21534653465346534,
+      "grad_norm": 5.165887355804443,
+      "learning_rate": 1.920326530257394e-05,
+      "loss": 0.2291,
+      "step": 174
+    },
+    {
+      "epoch": 0.21658415841584158,
+      "grad_norm": 3.6516168117523193,
+      "learning_rate": 1.9186276578391268e-05,
+      "loss": 0.2092,
+      "step": 175
+    },
+    {
+      "epoch": 0.21782178217821782,
+      "grad_norm": 3.7098777294158936,
+      "learning_rate": 1.9169116312486835e-05,
+      "loss": 0.2635,
+      "step": 176
+    },
+    {
+      "epoch": 0.21905940594059406,
+      "grad_norm": 6.8240180015563965,
+      "learning_rate": 1.9151784825306205e-05,
+      "loss": 0.2545,
+      "step": 177
+    },
+    {
+      "epoch": 0.2202970297029703,
+      "grad_norm": 4.409351348876953,
+      "learning_rate": 1.9134282440492272e-05,
+      "loss": 0.2505,
+      "step": 178
+    },
+    {
+      "epoch": 0.22153465346534654,
+      "grad_norm": 3.2560315132141113,
+      "learning_rate": 1.911660948487922e-05,
+      "loss": 0.2857,
+      "step": 179
+    },
+    {
+      "epoch": 0.22277227722772278,
+      "grad_norm": 5.461050987243652,
+      "learning_rate": 1.9098766288486426e-05,
+      "loss": 0.2782,
+      "step": 180
+    },
+    {
+      "epoch": 0.22277227722772278,
+      "eval_accuracy": 0.8237250554323725,
+      "eval_f1": 0.5974683544303797,
+      "eval_loss": 0.34932276606559753,
+      "eval_precision": 0.8309859154929577,
+      "eval_recall": 0.466403162055336,
+      "eval_runtime": 49.2509,
+      "eval_samples_per_second": 5.604,
+      "eval_steps_per_second": 0.183,
+      "step": 180
+    },
+    {
+      "epoch": 0.22400990099009901,
+      "grad_norm": 3.929197072982788,
+      "learning_rate": 1.9080753184512284e-05,
+      "loss": 0.2682,
+      "step": 181
+    },
+    {
+      "epoch": 0.22524752475247525,
+      "grad_norm": 4.4159393310546875,
+      "learning_rate": 1.9062570509327993e-05,
+      "loss": 0.2503,
+      "step": 182
+    },
+    {
+      "epoch": 0.2264851485148515,
+      "grad_norm": 5.622183799743652,
+      "learning_rate": 1.9044218602471275e-05,
+      "loss": 0.3253,
+      "step": 183
+    },
+    {
+      "epoch": 0.22772277227722773,
+      "grad_norm": 3.281792402267456,
+      "learning_rate": 1.9025697806640035e-05,
+      "loss": 0.2018,
+      "step": 184
+    },
+    {
+      "epoch": 0.22896039603960397,
+      "grad_norm": 3.431208372116089,
+      "learning_rate": 1.9007008467685947e-05,
+      "loss": 0.2012,
+      "step": 185
+    },
+    {
+      "epoch": 0.2301980198019802,
+      "grad_norm": 5.277952671051025,
+      "learning_rate": 1.8988150934608014e-05,
+      "loss": 0.2031,
+      "step": 186
+    },
+    {
+      "epoch": 0.23143564356435645,
+      "grad_norm": 4.322801113128662,
+      "learning_rate": 1.8969125559546054e-05,
+      "loss": 0.2626,
+      "step": 187
+    },
+    {
+      "epoch": 0.23267326732673269,
+      "grad_norm": 4.021146297454834,
+      "learning_rate": 1.894993269777411e-05,
+      "loss": 0.2343,
+      "step": 188
+    },
+    {
+      "epoch": 0.23391089108910892,
+      "grad_norm": 3.045038938522339,
+      "learning_rate": 1.893057270769381e-05,
+      "loss": 0.1718,
+      "step": 189
+    },
+    {
+      "epoch": 0.23514851485148514,
+      "grad_norm": 4.587369441986084,
+      "learning_rate": 1.8911045950827693e-05,
+      "loss": 0.2377,
+      "step": 190
+    },
+    {
+      "epoch": 0.23638613861386137,
+      "grad_norm": 5.442078590393066,
+      "learning_rate": 1.8891352791812452e-05,
+      "loss": 0.2796,
+      "step": 191
+    },
+    {
+      "epoch": 0.2376237623762376,
+      "grad_norm": 6.258726596832275,
+      "learning_rate": 1.8871493598392122e-05,
+      "loss": 0.2856,
+      "step": 192
+    },
+    {
+      "epoch": 0.23886138613861385,
+      "grad_norm": 6.618675231933594,
+      "learning_rate": 1.885146874141121e-05,
+      "loss": 0.256,
+      "step": 193
+    },
+    {
+      "epoch": 0.2400990099009901,
+      "grad_norm": 4.947834491729736,
+      "learning_rate": 1.8831278594807783e-05,
+      "loss": 0.2452,
+      "step": 194
+    },
+    {
+      "epoch": 0.24133663366336633,
+      "grad_norm": 3.6348724365234375,
+      "learning_rate": 1.881092353560646e-05,
+      "loss": 0.2141,
+      "step": 195
+    },
+    {
+      "epoch": 0.24257425742574257,
+      "grad_norm": 7.256039619445801,
+      "learning_rate": 1.8790403943911403e-05,
+      "loss": 0.2617,
+      "step": 196
+    },
+    {
+      "epoch": 0.2438118811881188,
+      "grad_norm": 4.058467864990234,
+      "learning_rate": 1.8769720202899196e-05,
+      "loss": 0.2119,
+      "step": 197
+    },
+    {
+      "epoch": 0.24504950495049505,
+      "grad_norm": 8.09382438659668,
+      "learning_rate": 1.8748872698811695e-05,
+      "loss": 0.2156,
+      "step": 198
+    },
+    {
+      "epoch": 0.24628712871287128,
+      "grad_norm": 5.703820705413818,
+      "learning_rate": 1.872786182094882e-05,
+      "loss": 0.1883,
+      "step": 199
+    },
+    {
+      "epoch": 0.24752475247524752,
+      "grad_norm": 6.104684352874756,
+      "learning_rate": 1.870668796166129e-05,
+      "loss": 0.2174,
+      "step": 200
+    },
+    {
+      "epoch": 0.24752475247524752,
+      "eval_accuracy": 0.8492239467849224,
+      "eval_f1": 0.6866359447004609,
+      "eval_loss": 0.33290114998817444,
+      "eval_precision": 0.8232044198895028,
+      "eval_recall": 0.5889328063241107,
+      "eval_runtime": 48.1855,
+      "eval_samples_per_second": 5.728,
+      "eval_steps_per_second": 0.187,
+      "step": 200
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 808,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.099415773216768e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}