Model save

Browse files

Files changed (7) hide show

README.md +7 -9
adapter_config.json +4 -4
adapter_model.safetensors +2 -2
all_results.json +7 -7
runs/Jun07_23-14-40_ip-172-31-69-60.ec2.internal/events.out.tfevents.1717802114.ip-172-31-69-60.ec2.internal.32152.0 +2 -2
train_results.json +7 -7
trainer_state.json +447 -216

README.md CHANGED Viewed

@@ -2,13 +2,11 @@
 license: apache-2.0
 library_name: peft
 tags:
-- alignment-handbook
 - trl
 - sft
 - generated_from_trainer
 base_model: mistralai/Mistral-7B-v0.1
-datasets:
-- HuggingFaceH4/ultrachat_200k
 model-index:
 - name: mistral5p
   results: []
@@ -19,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
 # mistral5p
-This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the HuggingFaceH4/ultrachat_200k dataset.
 It achieves the following results on the evaluation set:
 - Loss: nan
@@ -46,8 +44,8 @@ The following hyperparameters were used during training:
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 4
-- gradient_accumulation_steps: 4
-- total_train_batch_size: 128
 - total_eval_batch_size: 4
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
@@ -56,9 +54,9 @@ The following hyperparameters were used during training:
 ### Training results
-| Training Loss | Epoch  | Step | Validation Loss |
-|:-------------:|:------:|:----:|:---------------:|
-| 0.7092        | 0.9969 | 243  | nan             |
 ### Framework versions

 license: apache-2.0
 library_name: peft
 tags:
 - trl
 - sft
+- alignment-handbook
 - generated_from_trainer
 base_model: mistralai/Mistral-7B-v0.1
 model-index:
 - name: mistral5p
   results: []
 # mistral5p
+This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the None dataset.
 It achieves the following results on the evaluation set:
 - Loss: nan
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 4
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 256
 - total_eval_batch_size: 4
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 ### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 0.6894        | 1.0   | 406  | nan             |
 ### Framework versions

adapter_config.json CHANGED Viewed

@@ -22,11 +22,11 @@
   "target_modules": [
     "k_proj",
     "gate_proj",
-    "o_proj",
-    "q_proj",
     "down_proj",
-    "up_proj",
-    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "target_modules": [
     "k_proj",
     "gate_proj",
     "down_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "up_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3fe90f2f98208e74edd203b571c5549da46239bd0283d4b96ce66fefe4c7662e
-size 31516744

 version https://git-lfs.github.com/spec/v1
+oid sha256:9915b7e1547d3d9bed1440525979f707b4f386020b4013df18918578132e931f
+size 62973728

all_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 0.9969230769230769,
-    "total_flos": 5623477606285312.0,
-    "train_loss": 0.7391852758548878,
-    "train_runtime": 18302.1626,
-    "train_samples": 31180,
-    "train_samples_per_second": 1.704,
-    "train_steps_per_second": 0.013
 }

 {
+    "epoch": 1.0,
+    "total_flos": 2.1219407349982167e+18,
+    "train_loss": 0.3640357888684484,
+    "train_runtime": 42241.1922,
+    "train_samples": 103932,
+    "train_samples_per_second": 2.46,
+    "train_steps_per_second": 0.01
 }

runs/Jun07_23-14-40_ip-172-31-69-60.ec2.internal/events.out.tfevents.1717802114.ip-172-31-69-60.ec2.internal.32152.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2b85799135ab367db15a1795367cf3873efe2ee1e377c5075a8ce83e03c97e46
-size 13528

 version https://git-lfs.github.com/spec/v1
+oid sha256:b3aa58153670c2e6ce55b2a7cf6665a2aa2ed380a166f90c5e8e077be316a481
+size 14364

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 0.9969230769230769,
-    "total_flos": 5623477606285312.0,
-    "train_loss": 0.7391852758548878,
-    "train_runtime": 18302.1626,
-    "train_samples": 31180,
-    "train_samples_per_second": 1.704,
-    "train_steps_per_second": 0.013
 }

 {
+    "epoch": 1.0,
+    "total_flos": 2.1219407349982167e+18,
+    "train_loss": 0.3640357888684484,
+    "train_runtime": 42241.1922,
+    "train_samples": 103932,
+    "train_samples_per_second": 2.46,
+    "train_steps_per_second": 0.01
 }

trainer_state.json CHANGED Viewed

@@ -1,392 +1,623 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.9969230769230769,
   "eval_steps": 500,
-  "global_step": 243,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0041025641025641026,
-      "grad_norm": 0.26542961092188894,
-      "learning_rate": 8.000000000000001e-06,
-      "loss": 0.8534,
       "step": 1
     },
     {
-      "epoch": 0.020512820512820513,
-      "grad_norm": 0.2656899378598073,
-      "learning_rate": 4e-05,
-      "loss": 0.8667,
       "step": 5
     },
     {
-      "epoch": 0.041025641025641026,
-      "grad_norm": 0.2572181527992512,
-      "learning_rate": 8e-05,
-      "loss": 0.8031,
       "step": 10
     },
     {
-      "epoch": 0.06153846153846154,
-      "grad_norm": 0.18710816410634198,
-      "learning_rate": 0.00012,
-      "loss": 0.7552,
       "step": 15
     },
     {
-      "epoch": 0.08205128205128205,
-      "grad_norm": 0.17366628242950644,
-      "learning_rate": 0.00016,
-      "loss": 0.7434,
       "step": 20
     },
     {
-      "epoch": 0.10256410256410256,
-      "grad_norm": 0.18209587440813843,
-      "learning_rate": 0.0002,
-      "loss": 0.7613,
       "step": 25
     },
     {
-      "epoch": 0.12307692307692308,
-      "grad_norm": 0.17584855523843826,
-      "learning_rate": 0.00019974051702905277,
-      "loss": 0.7686,
       "step": 30
     },
     {
-      "epoch": 0.14358974358974358,
-      "grad_norm": 0.1876526868957483,
-      "learning_rate": 0.00019896341474445525,
-      "loss": 0.7408,
       "step": 35
     },
     {
-      "epoch": 0.1641025641025641,
-      "grad_norm": 0.14962305137903897,
-      "learning_rate": 0.00019767272604239824,
-      "loss": 0.7422,
       "step": 40
     },
     {
-      "epoch": 0.18461538461538463,
-      "grad_norm": 0.16336671883236106,
-      "learning_rate": 0.00019587514915766124,
-      "loss": 0.7565,
       "step": 45
     },
     {
-      "epoch": 0.20512820512820512,
-      "grad_norm": 0.15152018240587575,
-      "learning_rate": 0.00019358001290205543,
-      "loss": 0.7493,
       "step": 50
     },
     {
-      "epoch": 0.22564102564102564,
-      "grad_norm": 0.14810534949892065,
-      "learning_rate": 0.0001907992282510675,
-      "loss": 0.7539,
       "step": 55
     },
     {
-      "epoch": 0.24615384615384617,
-      "grad_norm": 0.16847734140964582,
-      "learning_rate": 0.00018754722652995347,
-      "loss": 0.7395,
       "step": 60
     },
     {
-      "epoch": 0.26666666666666666,
-      "grad_norm": 0.15436628692835286,
-      "learning_rate": 0.00018384088452007578,
-      "loss": 0.747,
       "step": 65
     },
     {
-      "epoch": 0.28717948717948716,
-      "grad_norm": 0.15652291403007046,
-      "learning_rate": 0.00017969943687415576,
-      "loss": 0.7506,
       "step": 70
     },
     {
-      "epoch": 0.3076923076923077,
-      "grad_norm": 0.16413347022113,
-      "learning_rate": 0.0001751443762949772,
-      "loss": 0.7611,
       "step": 75
     },
     {
-      "epoch": 0.3282051282051282,
-      "grad_norm": 0.16182771329972745,
-      "learning_rate": 0.00017019934199557867,
-      "loss": 0.7576,
       "step": 80
     },
     {
-      "epoch": 0.3487179487179487,
-      "grad_norm": 0.137569364566755,
-      "learning_rate": 0.00016488999701978903,
-      "loss": 0.7451,
       "step": 85
     },
     {
-      "epoch": 0.36923076923076925,
-      "grad_norm": 0.14900540538806303,
-      "learning_rate": 0.00015924389505977038,
-      "loss": 0.7197,
       "step": 90
     },
     {
-      "epoch": 0.38974358974358975,
-      "grad_norm": 0.14655990407738764,
-      "learning_rate": 0.00015329033746173975,
-      "loss": 0.7149,
       "step": 95
     },
     {
-      "epoch": 0.41025641025641024,
-      "grad_norm": 0.1402768270199422,
-      "learning_rate": 0.00014706022116196208,
-      "loss": 0.7018,
       "step": 100
     },
     {
-      "epoch": 0.4307692307692308,
-      "grad_norm": 0.14134928021057855,
-      "learning_rate": 0.00014058587834217355,
-      "loss": 0.7324,
       "step": 105
     },
     {
-      "epoch": 0.4512820512820513,
-      "grad_norm": 0.15600188868890955,
-      "learning_rate": 0.00013390090863657047,
-      "loss": 0.748,
       "step": 110
     },
     {
-      "epoch": 0.4717948717948718,
-      "grad_norm": 0.13996117144988574,
-      "learning_rate": 0.0001270400047611508,
-      "loss": 0.7703,
       "step": 115
     },
     {
-      "epoch": 0.49230769230769234,
-      "grad_norm": 0.1510982087519504,
-      "learning_rate": 0.00012003877247033411,
-      "loss": 0.7515,
       "step": 120
     },
     {
-      "epoch": 0.5128205128205128,
-      "grad_norm": 0.14250885612041378,
-      "learning_rate": 0.00011293354577522263,
-      "loss": 0.7196,
       "step": 125
     },
     {
-      "epoch": 0.5333333333333333,
-      "grad_norm": 0.14795873562666287,
-      "learning_rate": 0.00010576119838245844,
-      "loss": 0.731,
       "step": 130
     },
     {
-      "epoch": 0.5538461538461539,
-      "grad_norm": 0.13063022182064904,
-      "learning_rate": 9.85589523322443e-05,
-      "loss": 0.7301,
       "step": 135
     },
     {
-      "epoch": 0.5743589743589743,
-      "grad_norm": 0.13557487318693218,
-      "learning_rate": 9.136418482863229e-05,
-      "loss": 0.718,
       "step": 140
     },
     {
-      "epoch": 0.5948717948717949,
-      "grad_norm": 0.15507641481392034,
-      "learning_rate": 8.42142342645646e-05,
-      "loss": 0.7193,
       "step": 145
     },
     {
-      "epoch": 0.6153846153846154,
-      "grad_norm": 0.15494002592225475,
-      "learning_rate": 7.714620644833111e-05,
-      "loss": 0.731,
       "step": 150
     },
     {
-      "epoch": 0.6358974358974359,
-      "grad_norm": 0.1601203804358568,
-      "learning_rate": 7.019678203706163e-05,
-      "loss": 0.75,
       "step": 155
     },
     {
-      "epoch": 0.6564102564102564,
-      "grad_norm": 0.15685275896075176,
-      "learning_rate": 6.340202617660842e-05,
-      "loss": 0.7505,
       "step": 160
     },
     {
-      "epoch": 0.676923076923077,
-      "grad_norm": 0.1416704035001306,
-      "learning_rate": 5.679720133572206e-05,
-      "loss": 0.7311,
       "step": 165
     },
     {
-      "epoch": 0.6974358974358974,
-      "grad_norm": 0.14126335283382643,
-      "learning_rate": 5.0416584305848524e-05,
-      "loss": 0.755,
       "step": 170
     },
     {
-      "epoch": 0.717948717948718,
-      "grad_norm": 0.13481365291511518,
-      "learning_rate": 4.4293288316255653e-05,
-      "loss": 0.695,
       "step": 175
     },
     {
-      "epoch": 0.7384615384615385,
-      "grad_norm": 0.14785228000969083,
-      "learning_rate": 3.845909118765073e-05,
-      "loss": 0.7209,
       "step": 180
     },
     {
-      "epoch": 0.7589743589743589,
-      "grad_norm": 0.1503268472656009,
-      "learning_rate": 3.294427041611425e-05,
-      "loss": 0.7307,
       "step": 185
     },
     {
-      "epoch": 0.7794871794871795,
-      "grad_norm": 0.15299701883959732,
-      "learning_rate": 2.7777446043207058e-05,
-      "loss": 0.7351,
       "step": 190
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 0.1373494159366909,
-      "learning_rate": 2.2985432127701946e-05,
-      "loss": 0.7304,
       "step": 195
     },
     {
-      "epoch": 0.8205128205128205,
-      "grad_norm": 0.16245499908137687,
-      "learning_rate": 1.859309758975132e-05,
-      "loss": 0.7385,
       "step": 200
     },
     {
-      "epoch": 0.841025641025641,
-      "grad_norm": 0.13807085431435603,
-      "learning_rate": 1.462323714966114e-05,
-      "loss": 0.714,
       "step": 205
     },
     {
-      "epoch": 0.8615384615384616,
-      "grad_norm": 0.12926916684005163,
-      "learning_rate": 1.1096453031056264e-05,
-      "loss": 0.7078,
       "step": 210
     },
     {
-      "epoch": 0.882051282051282,
-      "grad_norm": 0.12918592796308734,
-      "learning_rate": 8.031048042356392e-06,
-      "loss": 0.7208,
       "step": 215
     },
     {
-      "epoch": 0.9025641025641026,
-      "grad_norm": 0.14837431396022016,
-      "learning_rate": 5.442930591433992e-06,
-      "loss": 0.7435,
       "step": 220
     },
     {
-      "epoch": 0.9230769230769231,
-      "grad_norm": 0.14838485907528307,
-      "learning_rate": 3.3455321263955786e-06,
-      "loss": 0.7255,
       "step": 225
     },
     {
-      "epoch": 0.9435897435897436,
-      "grad_norm": 0.1480931290003826,
-      "learning_rate": 1.7497374309405346e-06,
-      "loss": 0.695,
       "step": 230
     },
     {
-      "epoch": 0.9641025641025641,
-      "grad_norm": 0.14733343614150893,
-      "learning_rate": 6.638281360408339e-07,
-      "loss": 0.7012,
       "step": 235
     },
     {
-      "epoch": 0.9846153846153847,
-      "grad_norm": 0.1368626907349573,
-      "learning_rate": 9.343974109685682e-08,
-      "loss": 0.7092,
       "step": 240
     },
     {
-      "epoch": 0.9969230769230769,
       "eval_loss": NaN,
-      "eval_runtime": 748.4122,
-      "eval_samples_per_second": 1.545,
-      "eval_steps_per_second": 0.386,
-      "step": 243
-    },
-    {
-      "epoch": 0.9969230769230769,
-      "step": 243,
-      "total_flos": 5623477606285312.0,
-      "train_loss": 0.7391852758548878,
-      "train_runtime": 18302.1626,
-      "train_samples_per_second": 1.704,
-      "train_steps_per_second": 0.013
     }
   ],
   "logging_steps": 5,
-  "max_steps": 243,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
-  "save_steps": 1000,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
         "should_epoch_stop": false,
         "should_evaluate": false,
         "should_log": false,
-        "should_save": false,
         "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 5623477606285312.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 406,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.0024630541871921183,
+      "grad_norm": 0.13191281259059906,
+      "learning_rate": 4.8780487804878055e-06,
+      "loss": 0.7592,
       "step": 1
     },
     {
+      "epoch": 0.012315270935960592,
+      "grad_norm": 0.1377675086259842,
+      "learning_rate": 2.4390243902439026e-05,
+      "loss": 0.7841,
       "step": 5
     },
     {
+      "epoch": 0.024630541871921183,
+      "grad_norm": 0.11912436038255692,
+      "learning_rate": 4.878048780487805e-05,
+      "loss": 0.7896,
       "step": 10
     },
     {
+      "epoch": 0.03694581280788178,
+      "grad_norm": 0.12670652568340302,
+      "learning_rate": 7.317073170731707e-05,
+      "loss": 0.7835,
       "step": 15
     },
     {
+      "epoch": 0.04926108374384237,
+      "grad_norm": 0.10752439498901367,
+      "learning_rate": 9.75609756097561e-05,
+      "loss": 0.7806,
       "step": 20
     },
     {
+      "epoch": 0.06157635467980296,
+      "grad_norm": 0.10097604990005493,
+      "learning_rate": 0.00012195121951219512,
+      "loss": 0.7567,
       "step": 25
     },
     {
+      "epoch": 0.07389162561576355,
+      "grad_norm": 0.10570556670427322,
+      "learning_rate": 0.00014634146341463414,
+      "loss": 0.7527,
       "step": 30
     },
     {
+      "epoch": 0.08620689655172414,
+      "grad_norm": 0.11552459746599197,
+      "learning_rate": 0.0001707317073170732,
+      "loss": 0.7514,
       "step": 35
     },
     {
+      "epoch": 0.09852216748768473,
+      "grad_norm": 0.11246493458747864,
+      "learning_rate": 0.0001951219512195122,
+      "loss": 0.75,
       "step": 40
     },
     {
+      "epoch": 0.11083743842364532,
+      "grad_norm": 0.10585317760705948,
+      "learning_rate": 0.0001999407400739705,
+      "loss": 0.738,
       "step": 45
     },
     {
+      "epoch": 0.12315270935960591,
+      "grad_norm": 0.10629229247570038,
+      "learning_rate": 0.00019970011699250152,
+      "loss": 0.7748,
       "step": 50
     },
     {
+      "epoch": 0.1354679802955665,
+      "grad_norm": 0.09721837192773819,
+      "learning_rate": 0.00019927487224577402,
+      "loss": 0.7469,
       "step": 55
     },
     {
+      "epoch": 0.1477832512315271,
+      "grad_norm": 0.10196585208177567,
+      "learning_rate": 0.0001986657932891657,
+      "loss": 0.7435,
       "step": 60
     },
     {
+      "epoch": 0.16009852216748768,
+      "grad_norm": 0.09946288913488388,
+      "learning_rate": 0.00019787400799669154,
+      "loss": 0.7482,
       "step": 65
     },
     {
+      "epoch": 0.1724137931034483,
+      "grad_norm": 0.10071098804473877,
+      "learning_rate": 0.00019690098257244064,
+      "loss": 0.7551,
       "step": 70
     },
     {
+      "epoch": 0.18472906403940886,
+      "grad_norm": 0.10068488121032715,
+      "learning_rate": 0.00019574851883550395,
+      "loss": 0.7502,
       "step": 75
     },
     {
+      "epoch": 0.19704433497536947,
+      "grad_norm": 0.10105381160974503,
+      "learning_rate": 0.00019441875088341997,
+      "loss": 0.7438,
       "step": 80
     },
     {
+      "epoch": 0.20935960591133004,
+      "grad_norm": 0.09201391041278839,
+      "learning_rate": 0.00019291414114031743,
+      "loss": 0.7374,
       "step": 85
     },
     {
+      "epoch": 0.22167487684729065,
+      "grad_norm": 0.09724140912294388,
+      "learning_rate": 0.00019123747579707275,
+      "loss": 0.7588,
       "step": 90
     },
     {
+      "epoch": 0.23399014778325122,
+      "grad_norm": 0.09425447881221771,
+      "learning_rate": 0.0001893918596519257,
+      "loss": 0.7631,
       "step": 95
     },
     {
+      "epoch": 0.24630541871921183,
+      "grad_norm": 0.10638237744569778,
+      "learning_rate": 0.00018738071036110808,
+      "loss": 0.7507,
       "step": 100
     },
     {
+      "epoch": 0.25862068965517243,
+      "grad_norm": 0.10152100026607513,
+      "learning_rate": 0.00018520775211013093,
+      "loss": 0.7503,
       "step": 105
     },
     {
+      "epoch": 0.270935960591133,
+      "grad_norm": 0.09959056228399277,
+      "learning_rate": 0.00018287700871745036,
+      "loss": 0.7644,
       "step": 110
     },
     {
+      "epoch": 0.2832512315270936,
+      "grad_norm": 0.10374708473682404,
+      "learning_rate": 0.00018039279618328212,
+      "loss": 0.7601,
       "step": 115
     },
     {
+      "epoch": 0.2955665024630542,
+      "grad_norm": 0.10066307336091995,
+      "learning_rate": 0.0001777597146973627,
+      "loss": 0.7392,
       "step": 120
     },
     {
+      "epoch": 0.3078817733990148,
+      "grad_norm": 0.1052442416548729,
+      "learning_rate": 0.00017498264012045687,
+      "loss": 0.7443,
       "step": 125
     },
     {
+      "epoch": 0.32019704433497537,
+      "grad_norm": 0.09332836419343948,
+      "learning_rate": 0.00017206671495538612,
+      "loss": 0.7407,
       "step": 130
     },
     {
+      "epoch": 0.33251231527093594,
+      "grad_norm": 0.10167726129293442,
+      "learning_rate": 0.0001690173388242972,
+      "loss": 0.737,
       "step": 135
     },
     {
+      "epoch": 0.3448275862068966,
+      "grad_norm": 0.09576103836297989,
+      "learning_rate": 0.0001658401584698049,
+      "loss": 0.7467,
       "step": 140
     },
     {
+      "epoch": 0.35714285714285715,
+      "grad_norm": 0.09853356331586838,
+      "learning_rate": 0.00016254105729852464,
+      "loss": 0.7413,
       "step": 145
     },
     {
+      "epoch": 0.3694581280788177,
+      "grad_norm": 0.0903814435005188,
+      "learning_rate": 0.00015912614448635782,
+      "loss": 0.7529,
       "step": 150
     },
     {
+      "epoch": 0.3817733990147783,
+      "grad_norm": 0.10001372545957565,
+      "learning_rate": 0.00015560174366570446,
+      "loss": 0.7396,
       "step": 155
     },
     {
+      "epoch": 0.39408866995073893,
+      "grad_norm": 0.10121896117925644,
+      "learning_rate": 0.0001519743812155516,
+      "loss": 0.7536,
       "step": 160
     },
     {
+      "epoch": 0.4064039408866995,
+      "grad_norm": 0.09317266196012497,
+      "learning_rate": 0.00014825077417612186,
+      "loss": 0.754,
       "step": 165
     },
     {
+      "epoch": 0.4187192118226601,
+      "grad_norm": 0.09699834138154984,
+      "learning_rate": 0.00014443781781046136,
+      "loss": 0.7585,
       "step": 170
     },
     {
+      "epoch": 0.43103448275862066,
+      "grad_norm": 0.09634856134653091,
+      "learning_rate": 0.00014054257283599973,
+      "loss": 0.7193,
       "step": 175
     },
     {
+      "epoch": 0.4433497536945813,
+      "grad_norm": 0.09830117970705032,
+      "learning_rate": 0.00013657225234972695,
+      "loss": 0.7346,
       "step": 180
     },
     {
+      "epoch": 0.45566502463054187,
+      "grad_norm": 0.09632200747728348,
+      "learning_rate": 0.00013253420847119803,
+      "loss": 0.7356,
       "step": 185
     },
     {
+      "epoch": 0.46798029556650245,
+      "grad_norm": 0.09794441610574722,
+      "learning_rate": 0.0001284359187281004,
+      "loss": 0.7404,
       "step": 190
     },
     {
+      "epoch": 0.4802955665024631,
+      "grad_norm": 0.09384100884199142,
+      "learning_rate": 0.0001242849722095936,
+      "loss": 0.7282,
       "step": 195
     },
     {
+      "epoch": 0.49261083743842365,
+      "grad_norm": 0.09335623681545258,
+      "learning_rate": 0.00012008905551306356,
+      "loss": 0.7511,
       "step": 200
     },
     {
+      "epoch": 0.5049261083743842,
+      "grad_norm": 0.11448359489440918,
+      "learning_rate": 0.00011585593851031347,
+      "loss": 0.7173,
       "step": 205
     },
     {
+      "epoch": 0.5172413793103449,
+      "grad_norm": 0.11399874091148376,
+      "learning_rate": 0.00011159345995955006,
+      "loss": 0.7261,
       "step": 210
     },
     {
+      "epoch": 0.5295566502463054,
+      "grad_norm": 0.10711462795734406,
+      "learning_rate": 0.00010730951298980776,
+      "loss": 0.7293,
       "step": 215
     },
     {
+      "epoch": 0.541871921182266,
+      "grad_norm": 0.11034953594207764,
+      "learning_rate": 0.00010301203048469083,
+      "loss": 0.7284,
       "step": 220
     },
     {
+      "epoch": 0.5541871921182266,
+      "grad_norm": 0.11478842049837112,
+      "learning_rate": 9.870897039249911e-05,
+      "loss": 0.7146,
       "step": 225
     },
     {
+      "epoch": 0.5665024630541872,
+      "grad_norm": 0.1125415787100792,
+      "learning_rate": 9.440830098993969e-05,
+      "loss": 0.7218,
       "step": 230
     },
     {
+      "epoch": 0.5788177339901478,
+      "grad_norm": 0.11621884256601334,
+      "learning_rate": 9.011798612671286e-05,
+      "loss": 0.7242,
       "step": 235
     },
     {
+      "epoch": 0.5911330049261084,
+      "grad_norm": 0.11078532040119171,
+      "learning_rate": 8.58459704782957e-05,
+      "loss": 0.7268,
       "step": 240
     },
     {
+      "epoch": 0.603448275862069,
+      "grad_norm": 0.11298695206642151,
+      "learning_rate": 8.160016483423199e-05,
+      "loss": 0.7317,
+      "step": 245
+    },
+    {
+      "epoch": 0.6157635467980296,
+      "grad_norm": 0.1098036915063858,
+      "learning_rate": 7.738843144917119e-05,
+      "loss": 0.7174,
+      "step": 250
+    },
+    {
+      "epoch": 0.6280788177339901,
+      "grad_norm": 0.11521229147911072,
+      "learning_rate": 7.321856948378259e-05,
+      "loss": 0.7123,
+      "step": 255
+    },
+    {
+      "epoch": 0.6403940886699507,
+      "grad_norm": 0.11004281044006348,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.7247,
+      "step": 260
+    },
+    {
+      "epoch": 0.6527093596059114,
+      "grad_norm": 0.11443013697862625,
+      "learning_rate": 6.503525447487715e-05,
+      "loss": 0.7016,
+      "step": 265
+    },
+    {
+      "epoch": 0.6650246305418719,
+      "grad_norm": 0.12297528237104416,
+      "learning_rate": 6.103695504692122e-05,
+      "loss": 0.7014,
+      "step": 270
+    },
+    {
+      "epoch": 0.6773399014778325,
+      "grad_norm": 0.11536238342523575,
+      "learning_rate": 5.7110806208751655e-05,
+      "loss": 0.7248,
+      "step": 275
+    },
+    {
+      "epoch": 0.6896551724137931,
+      "grad_norm": 0.1148216500878334,
+      "learning_rate": 5.326407828419979e-05,
+      "loss": 0.7145,
+      "step": 280
+    },
+    {
+      "epoch": 0.7019704433497537,
+      "grad_norm": 0.11440951377153397,
+      "learning_rate": 4.9503894527847964e-05,
+      "loss": 0.7127,
+      "step": 285
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.11594696342945099,
+      "learning_rate": 4.583721793440188e-05,
+      "loss": 0.7491,
+      "step": 290
+    },
+    {
+      "epoch": 0.7266009852216748,
+      "grad_norm": 0.1071944460272789,
+      "learning_rate": 4.227083834482728e-05,
+      "loss": 0.7173,
+      "step": 295
+    },
+    {
+      "epoch": 0.7389162561576355,
+      "grad_norm": 0.11470213532447815,
+      "learning_rate": 3.881135987312757e-05,
+      "loss": 0.72,
+      "step": 300
+    },
+    {
+      "epoch": 0.7512315270935961,
+      "grad_norm": 0.11714080721139908,
+      "learning_rate": 3.546518867704499e-05,
+      "loss": 0.7289,
+      "step": 305
+    },
+    {
+      "epoch": 0.7635467980295566,
+      "grad_norm": 0.11828279495239258,
+      "learning_rate": 3.223852109533112e-05,
+      "loss": 0.7209,
+      "step": 310
+    },
+    {
+      "epoch": 0.7758620689655172,
+      "grad_norm": 0.1184413954615593,
+      "learning_rate": 2.9137332173554043e-05,
+      "loss": 0.7071,
+      "step": 315
+    },
+    {
+      "epoch": 0.7881773399014779,
+      "grad_norm": 0.12409494072198868,
+      "learning_rate": 2.616736459968936e-05,
+      "loss": 0.7104,
+      "step": 320
+    },
+    {
+      "epoch": 0.8004926108374384,
+      "grad_norm": 0.1128469705581665,
+      "learning_rate": 2.33341180699841e-05,
+      "loss": 0.7128,
+      "step": 325
+    },
+    {
+      "epoch": 0.812807881773399,
+      "grad_norm": 0.11089266836643219,
+      "learning_rate": 2.0642839104785272e-05,
+      "loss": 0.7166,
+      "step": 330
+    },
+    {
+      "epoch": 0.8251231527093597,
+      "grad_norm": 0.11719653755426407,
+      "learning_rate": 1.8098511333192024e-05,
+      "loss": 0.7333,
+      "step": 335
+    },
+    {
+      "epoch": 0.8374384236453202,
+      "grad_norm": 0.11491881310939789,
+      "learning_rate": 1.570584626452173e-05,
+      "loss": 0.7208,
+      "step": 340
+    },
+    {
+      "epoch": 0.8497536945812808,
+      "grad_norm": 0.10957372933626175,
+      "learning_rate": 1.3469274563679402e-05,
+      "loss": 0.7046,
+      "step": 345
+    },
+    {
+      "epoch": 0.8620689655172413,
+      "grad_norm": 0.1188158467411995,
+      "learning_rate": 1.1392937846586215e-05,
+      "loss": 0.7178,
+      "step": 350
+    },
+    {
+      "epoch": 0.874384236453202,
+      "grad_norm": 0.11511045694351196,
+      "learning_rate": 9.48068101086026e-06,
+      "loss": 0.7161,
+      "step": 355
+    },
+    {
+      "epoch": 0.8866995073891626,
+      "grad_norm": 0.11417835205793381,
+      "learning_rate": 7.736045115951251e-06,
+      "loss": 0.7274,
+      "step": 360
+    },
+    {
+      "epoch": 0.8990147783251231,
+      "grad_norm": 0.11594397574663162,
+      "learning_rate": 6.16226082591359e-06,
+      "loss": 0.7081,
+      "step": 365
+    },
+    {
+      "epoch": 0.9113300492610837,
+      "grad_norm": 0.11524856835603714,
+      "learning_rate": 4.762242426960262e-06,
+      "loss": 0.7209,
+      "step": 370
+    },
+    {
+      "epoch": 0.9236453201970444,
+      "grad_norm": 0.12005619704723358,
+      "learning_rate": 3.5385824308756587e-06,
+      "loss": 0.6989,
+      "step": 375
+    },
+    {
+      "epoch": 0.9359605911330049,
+      "grad_norm": 0.12374743819236755,
+      "learning_rate": 2.493546774280531e-06,
+      "loss": 0.7276,
+      "step": 380
+    },
+    {
+      "epoch": 0.9482758620689655,
+      "grad_norm": 0.11894236505031586,
+      "learning_rate": 1.6290706226390285e-06,
+      "loss": 0.7144,
+      "step": 385
+    },
+    {
+      "epoch": 0.9605911330049262,
+      "grad_norm": 0.11776220053434372,
+      "learning_rate": 9.46754786777726e-07,
+      "loss": 0.7015,
+      "step": 390
+    },
+    {
+      "epoch": 0.9729064039408867,
+      "grad_norm": 0.109645314514637,
+      "learning_rate": 4.4786275855247527e-07,
+      "loss": 0.7109,
+      "step": 395
+    },
+    {
+      "epoch": 0.9852216748768473,
+      "grad_norm": 0.11453017592430115,
+      "learning_rate": 1.333183711524133e-07,
+      "loss": 0.7094,
+      "step": 400
+    },
+    {
+      "epoch": 0.9975369458128078,
+      "grad_norm": 0.1103915199637413,
+      "learning_rate": 3.7040883734462683e-09,
+      "loss": 0.6894,
+      "step": 405
+    },
+    {
+      "epoch": 1.0,
       "eval_loss": NaN,
+      "eval_runtime": 944.7015,
+      "eval_samples_per_second": 1.223,
+      "eval_steps_per_second": 0.306,
+      "step": 406
+    },
+    {
+      "epoch": 1.0,
+      "step": 406,
+      "total_flos": 2.1219407349982167e+18,
+      "train_loss": 0.3640357888684484,
+      "train_runtime": 42241.1922,
+      "train_samples_per_second": 2.46,
+      "train_steps_per_second": 0.01
     }
   ],
   "logging_steps": 5,
+  "max_steps": 406,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
+  "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
         "should_epoch_stop": false,
         "should_evaluate": false,
         "should_log": false,
+        "should_save": true,
         "should_training_stop": false
       },
       "attributes": {}
     }
   },
+  "total_flos": 2.1219407349982167e+18,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null