Model save

Browse files

Files changed (6) hide show

README.md +2 -5
adapter_model.safetensors +1 -1
all_results.json +5 -5
runs/May03_06-09-37_ip-172-31-69-60.ec2.internal/events.out.tfevents.1714716718.ip-172-31-69-60.ec2.internal.2066.0 +2 -2
train_results.json +5 -5
trainer_state.json +1060 -556

README.md CHANGED Viewed

@@ -2,13 +2,10 @@
 license: llama2
 library_name: peft
 tags:
-- alignment-handbook
 - trl
 - sft
 - generated_from_trainer
 base_model: meta-llama/Llama-2-7b-hf
-datasets:
-- HuggingFaceH4/ultrachat_200k
 model-index:
 - name: llama2-20p-POE
   results: []
@@ -19,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # llama2-20p-POE
-This model is a fine-tuned version of [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) on the HuggingFaceH4/ultrachat_200k dataset.
 It achieves the following results on the evaluation set:
 - Loss: nan
@@ -58,7 +55,7 @@ The following hyperparameters were used during training:
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 0.7591        | 1.0   | 675  | nan             |
 ### Framework versions

 license: llama2
 library_name: peft
 tags:
 - trl
 - sft
 - generated_from_trainer
 base_model: meta-llama/Llama-2-7b-hf
 model-index:
 - name: llama2-20p-POE
   results: []
 # llama2-20p-POE
+This model is a fine-tuned version of [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) on the None dataset.
 It achieves the following results on the evaluation set:
 - Loss: nan
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
+| 0.7327        | 1.0   | 1039 | nan             |
 ### Framework versions

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d370cf8d66f04acedfbae1d4d8d05426bcf20615ba409e1ded0bd718bca76c0f
 size 60089544

 version https://git-lfs.github.com/spec/v1
+oid sha256:af8b59a0f339223195ffda22e2cc190e5b99e802340991131a98cce499515eaf
 size 60089544

all_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 1.0,
-    "train_loss": 0.7647893634548893,
-    "train_runtime": 21987.6737,
-    "train_samples": 21594,
-    "train_samples_per_second": 0.982,
-    "train_steps_per_second": 0.031
 }

 {
     "epoch": 1.0,
+    "train_loss": 0.7603742487735766,
+    "train_runtime": 32307.7024,
+    "train_samples": 33257,
+    "train_samples_per_second": 1.029,
+    "train_steps_per_second": 0.032
 }

runs/May03_06-09-37_ip-172-31-69-60.ec2.internal/events.out.tfevents.1714716718.ip-172-31-69-60.ec2.internal.2066.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:962e7c820913d857a21050000793e4752d88da1e40357a60fd8544edbf849d8a
-size 47019

 version https://git-lfs.github.com/spec/v1
+oid sha256:38dcc78b365fe2e878562c93604823a359f8f1c3842b73d7ec15f47d7c201acf
+size 49121

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 1.0,
-    "train_loss": 0.7647893634548893,
-    "train_runtime": 21987.6737,
-    "train_samples": 21594,
-    "train_samples_per_second": 0.982,
-    "train_steps_per_second": 0.031
 }

 {
     "epoch": 1.0,
+    "train_loss": 0.7603742487735766,
+    "train_runtime": 32307.7024,
+    "train_samples": 33257,
+    "train_samples_per_second": 1.029,
+    "train_steps_per_second": 0.032
 }

trainer_state.json CHANGED Viewed

@@ -1,989 +1,1493 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 675,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.0,
-      "grad_norm": 0.028420851102407323,
-      "learning_rate": 2.9411764705882355e-06,
-      "loss": 0.8769,
       "step": 1
     },
     {
-      "epoch": 0.01,
-      "grad_norm": 0.02533437087421799,
-      "learning_rate": 1.4705882352941177e-05,
-      "loss": 0.863,
       "step": 5
     },
     {
       "epoch": 0.01,
-      "grad_norm": 0.029598127358419903,
-      "learning_rate": 2.9411764705882354e-05,
-      "loss": 0.8899,
       "step": 10
     },
     {
-      "epoch": 0.02,
-      "grad_norm": 0.04418645965251518,
-      "learning_rate": 4.411764705882353e-05,
-      "loss": 0.8643,
       "step": 15
     },
     {
-      "epoch": 0.03,
-      "grad_norm": 0.08391069341850195,
-      "learning_rate": 5.882352941176471e-05,
-      "loss": 0.8164,
       "step": 20
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 0.0848430702291638,
-      "learning_rate": 7.352941176470589e-05,
-      "loss": 0.838,
       "step": 25
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 0.07340453634110397,
-      "learning_rate": 8.823529411764706e-05,
-      "loss": 0.8371,
       "step": 30
     },
     {
-      "epoch": 0.05,
-      "grad_norm": 0.060721401358425686,
-      "learning_rate": 0.00010294117647058823,
-      "loss": 0.7967,
       "step": 35
     },
     {
-      "epoch": 0.06,
-      "grad_norm": 0.06406699741542068,
-      "learning_rate": 0.00011764705882352942,
-      "loss": 0.7952,
       "step": 40
     },
     {
-      "epoch": 0.07,
-      "grad_norm": 0.0702367088144725,
-      "learning_rate": 0.0001323529411764706,
-      "loss": 0.7785,
       "step": 45
     },
     {
-      "epoch": 0.07,
-      "grad_norm": 0.05482720735795007,
-      "learning_rate": 0.00014705882352941178,
-      "loss": 0.7855,
       "step": 50
     },
     {
-      "epoch": 0.08,
-      "grad_norm": 0.061311996254463264,
-      "learning_rate": 0.00016176470588235295,
-      "loss": 0.7836,
       "step": 55
     },
     {
-      "epoch": 0.09,
-      "grad_norm": 0.057997545698835314,
-      "learning_rate": 0.00017647058823529413,
-      "loss": 0.7932,
       "step": 60
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 0.04913048738654215,
-      "learning_rate": 0.0001911764705882353,
-      "loss": 0.7746,
       "step": 65
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 0.051276736158220364,
-      "learning_rate": 0.00019999464266898484,
-      "loss": 0.7464,
       "step": 70
     },
     {
-      "epoch": 0.11,
-      "grad_norm": 0.04995736942327917,
-      "learning_rate": 0.00019993437928712978,
-      "loss": 0.7248,
       "step": 75
     },
     {
-      "epoch": 0.12,
-      "grad_norm": 0.045680340036933116,
-      "learning_rate": 0.0001998071963486563,
-      "loss": 0.7855,
       "step": 80
     },
     {
-      "epoch": 0.13,
-      "grad_norm": 0.053344671279534676,
-      "learning_rate": 0.00019961317901970953,
-      "loss": 0.7396,
       "step": 85
     },
     {
-      "epoch": 0.13,
-      "grad_norm": 0.05347208673468481,
-      "learning_rate": 0.0001993524572210807,
-      "loss": 0.7623,
       "step": 90
     },
     {
-      "epoch": 0.14,
-      "grad_norm": 0.044577349788325,
-      "learning_rate": 0.00019902520554120772,
-      "loss": 0.7595,
       "step": 95
     },
     {
-      "epoch": 0.15,
-      "grad_norm": 0.04633173399608109,
-      "learning_rate": 0.00019863164311926433,
-      "loss": 0.7759,
       "step": 100
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 0.044125485710741395,
-      "learning_rate": 0.00019817203349841738,
-      "loss": 0.7858,
       "step": 105
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 0.041739410620092,
-      "learning_rate": 0.00019764668444934854,
-      "loss": 0.7682,
       "step": 110
     },
     {
-      "epoch": 0.17,
-      "grad_norm": 0.046548199607491646,
-      "learning_rate": 0.0001970559477641606,
-      "loss": 0.7442,
       "step": 115
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 0.04287304549804181,
-      "learning_rate": 0.0001964002190208052,
-      "loss": 0.7966,
       "step": 120
     },
     {
-      "epoch": 0.19,
-      "grad_norm": 0.04461470993270133,
-      "learning_rate": 0.00019567993731818984,
-      "loss": 0.7678,
       "step": 125
     },
     {
-      "epoch": 0.19,
-      "grad_norm": 0.039705545161659035,
-      "learning_rate": 0.00019489558498214196,
-      "loss": 0.7345,
       "step": 130
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 0.03330898777260923,
-      "learning_rate": 0.00019404768724242666,
-      "loss": 0.7714,
       "step": 135
     },
     {
-      "epoch": 0.21,
-      "grad_norm": 0.046594949440474036,
-      "learning_rate": 0.00019313681188103457,
-      "loss": 0.7757,
       "step": 140
     },
     {
-      "epoch": 0.21,
-      "grad_norm": 0.04928885679685318,
-      "learning_rate": 0.000192163568851975,
-      "loss": 0.8217,
       "step": 145
     },
     {
-      "epoch": 0.22,
-      "grad_norm": 0.04466472170425771,
-      "learning_rate": 0.00019112860987282958,
-      "loss": 0.7356,
       "step": 150
     },
     {
-      "epoch": 0.23,
-      "grad_norm": 0.05078422739553919,
-      "learning_rate": 0.0001900326279883392,
-      "loss": 0.7262,
       "step": 155
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 0.042393502583193486,
-      "learning_rate": 0.00018887635710631716,
-      "loss": 0.791,
       "step": 160
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 0.04096339138017866,
-      "learning_rate": 0.00018766057150619865,
-      "loss": 0.7621,
       "step": 165
     },
     {
-      "epoch": 0.25,
-      "grad_norm": 0.04894911531750606,
-      "learning_rate": 0.00018638608532055634,
-      "loss": 0.714,
       "step": 170
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 0.04424627496496155,
-      "learning_rate": 0.00018505375198992857,
-      "loss": 0.7445,
       "step": 175
     },
     {
-      "epoch": 0.27,
-      "grad_norm": 0.05064439962306937,
-      "learning_rate": 0.00018366446369132578,
-      "loss": 0.7502,
       "step": 180
     },
     {
-      "epoch": 0.27,
-      "grad_norm": 0.05185726609274544,
-      "learning_rate": 0.00018221915074079762,
-      "loss": 0.7423,
       "step": 185
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 0.049634018260632524,
-      "learning_rate": 0.00018071878097046065,
-      "loss": 0.7853,
       "step": 190
     },
     {
-      "epoch": 0.29,
-      "grad_norm": 0.03718521878894617,
-      "learning_rate": 0.00017916435908040413,
-      "loss": 0.7575,
       "step": 195
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 0.054866103106943676,
-      "learning_rate": 0.00017755692596590778,
-      "loss": 0.7604,
       "step": 200
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 0.040017034968621745,
-      "learning_rate": 0.00017589755802042186,
-      "loss": 0.7818,
       "step": 205
     },
     {
-      "epoch": 0.31,
-      "grad_norm": 0.03964997679073274,
-      "learning_rate": 0.00017418736641477636,
-      "loss": 0.7464,
       "step": 210
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 0.051157610923925706,
-      "learning_rate": 0.0001724274963531022,
-      "loss": 0.7534,
       "step": 215
     },
     {
-      "epoch": 0.33,
-      "grad_norm": 0.04692776206415383,
-      "learning_rate": 0.00017061912630596252,
-      "loss": 0.7862,
       "step": 220
     },
     {
-      "epoch": 0.33,
-      "grad_norm": 0.04009778793971981,
-      "learning_rate": 0.00016876346722120747,
-      "loss": 0.7619,
       "step": 225
     },
     {
-      "epoch": 0.34,
-      "grad_norm": 0.037858593687305236,
-      "learning_rate": 0.00016686176171308126,
-      "loss": 0.7822,
       "step": 230
     },
     {
-      "epoch": 0.35,
-      "grad_norm": 0.03514517489146636,
-      "learning_rate": 0.0001649152832301241,
-      "loss": 0.7475,
       "step": 235
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 0.043964485334365984,
-      "learning_rate": 0.00016292533520242662,
-      "loss": 0.775,
       "step": 240
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 0.06121852032774167,
-      "learning_rate": 0.00016089325016880736,
-      "loss": 0.7501,
       "step": 245
     },
     {
-      "epoch": 0.37,
-      "grad_norm": 0.050365919886299945,
-      "learning_rate": 0.0001588203888844982,
-      "loss": 0.7498,
       "step": 250
     },
     {
-      "epoch": 0.38,
-      "grad_norm": 0.04601818654614394,
-      "learning_rate": 0.00015670813940993502,
-      "loss": 0.7942,
       "step": 255
     },
     {
-      "epoch": 0.39,
-      "grad_norm": 0.049451503331748733,
-      "learning_rate": 0.00015455791618126404,
-      "loss": 0.7232,
       "step": 260
     },
     {
-      "epoch": 0.39,
-      "grad_norm": 0.049704756401709786,
-      "learning_rate": 0.00015237115906318563,
-      "loss": 0.7474,
       "step": 265
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 0.043536870823896914,
-      "learning_rate": 0.0001501493323847707,
-      "loss": 0.7074,
       "step": 270
     },
     {
-      "epoch": 0.41,
-      "grad_norm": 0.052592436248192806,
-      "learning_rate": 0.00014789392395889468,
-      "loss": 0.7675,
       "step": 275
     },
     {
-      "epoch": 0.41,
-      "grad_norm": 0.04641887287230184,
-      "learning_rate": 0.00014560644408594602,
-      "loss": 0.7884,
       "step": 280
     },
     {
-      "epoch": 0.42,
-      "grad_norm": 0.03778851947891411,
-      "learning_rate": 0.0001432884245424761,
-      "loss": 0.7364,
       "step": 285
     },
     {
-      "epoch": 0.43,
-      "grad_norm": 0.04383653972641628,
-      "learning_rate": 0.00014094141755546815,
-      "loss": 0.7633,
       "step": 290
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 0.04958511831355967,
-      "learning_rate": 0.00013856699476291176,
-      "loss": 0.7254,
       "step": 295
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 0.047545677145208354,
-      "learning_rate": 0.000136166746161379,
-      "loss": 0.7389,
       "step": 300
     },
     {
-      "epoch": 0.45,
-      "grad_norm": 0.049601892362158714,
-      "learning_rate": 0.00013374227904130724,
-      "loss": 0.7298,
       "step": 305
     },
     {
-      "epoch": 0.46,
-      "grad_norm": 0.039019104385466755,
-      "learning_rate": 0.00013129521691070107,
-      "loss": 0.7372,
       "step": 310
     },
     {
-      "epoch": 0.47,
-      "grad_norm": 0.04324309132836927,
-      "learning_rate": 0.00012882719840797473,
-      "loss": 0.7586,
       "step": 315
     },
     {
-      "epoch": 0.47,
-      "grad_norm": 0.04511381039375704,
-      "learning_rate": 0.0001263398762046623,
-      "loss": 0.782,
       "step": 320
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 0.037065274891104005,
-      "learning_rate": 0.00012383491589873123,
-      "loss": 0.73,
       "step": 325
     },
     {
-      "epoch": 0.49,
-      "grad_norm": 0.04379387246767109,
-      "learning_rate": 0.0001213139948992394,
-      "loss": 0.7602,
       "step": 330
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 0.04868933738312991,
-      "learning_rate": 0.0001187788013030837,
-      "loss": 0.7467,
       "step": 335
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 0.04841745199938846,
-      "learning_rate": 0.00011623103276459086,
-      "loss": 0.7862,
       "step": 340
     },
     {
-      "epoch": 0.51,
-      "grad_norm": 0.046825753709491394,
-      "learning_rate": 0.00011367239535870913,
-      "loss": 0.7523,
       "step": 345
     },
     {
-      "epoch": 0.52,
-      "grad_norm": 0.05204047537850423,
-      "learning_rate": 0.00011110460243856052,
-      "loss": 0.721,
       "step": 350
     },
     {
-      "epoch": 0.53,
-      "grad_norm": 0.04835371095843328,
-      "learning_rate": 0.0001085293734881197,
-      "loss": 0.8165,
       "step": 355
     },
     {
-      "epoch": 0.53,
-      "grad_norm": 0.046503091391954944,
-      "learning_rate": 0.00010594843297078737,
-      "loss": 0.7151,
       "step": 360
     },
     {
-      "epoch": 0.54,
-      "grad_norm": 0.05401067419624875,
-      "learning_rate": 0.00010336350917462925,
-      "loss": 0.7623,
       "step": 365
     },
     {
-      "epoch": 0.55,
-      "grad_norm": 0.046238914587313926,
-      "learning_rate": 0.00010077633305505403,
-      "loss": 0.7952,
       "step": 370
     },
     {
-      "epoch": 0.56,
-      "grad_norm": 0.04067724976292184,
-      "learning_rate": 9.818863707570475e-05,
-      "loss": 0.7509,
       "step": 375
     },
     {
-      "epoch": 0.56,
-      "grad_norm": 0.041537242387637924,
-      "learning_rate": 9.560215404834095e-05,
-      "loss": 0.7121,
       "step": 380
     },
     {
-      "epoch": 0.57,
-      "grad_norm": 0.042556692843415594,
-      "learning_rate": 9.30186159724869e-05,
-      "loss": 0.7708,
       "step": 385
     },
     {
-      "epoch": 0.58,
-      "grad_norm": 0.04846970178587132,
-      "learning_rate": 9.043975287562441e-05,
-      "loss": 0.7258,
       "step": 390
     },
     {
-      "epoch": 0.59,
-      "grad_norm": 0.062274515472467,
-      "learning_rate": 8.786729165470584e-05,
-      "loss": 0.7698,
       "step": 395
     },
     {
-      "epoch": 0.59,
-      "grad_norm": 0.05082087012341951,
-      "learning_rate": 8.530295491976337e-05,
-      "loss": 0.7717,
       "step": 400
     },
     {
-      "epoch": 0.6,
-      "grad_norm": 0.04831882202604005,
-      "learning_rate": 8.274845984038916e-05,
-      "loss": 0.7679,
       "step": 405
     },
     {
-      "epoch": 0.61,
-      "grad_norm": 0.051680028901517745,
-      "learning_rate": 8.020551699585842e-05,
-      "loss": 0.7265,
       "step": 410
     },
     {
-      "epoch": 0.61,
-      "grad_norm": 0.051175873784970766,
-      "learning_rate": 7.76758292296659e-05,
-      "loss": 0.7696,
       "step": 415
     },
     {
-      "epoch": 0.62,
-      "grad_norm": 0.06738625933727418,
-      "learning_rate": 7.516109050924201e-05,
-      "loss": 0.7781,
       "step": 420
     },
     {
-      "epoch": 0.63,
-      "grad_norm": 0.05117489109997125,
-      "learning_rate": 7.266298479161318e-05,
-      "loss": 0.771,
       "step": 425
     },
     {
-      "epoch": 0.64,
-      "grad_norm": 0.04211785915218291,
-      "learning_rate": 7.01831848957653e-05,
-      "loss": 0.7368,
       "step": 430
     },
     {
-      "epoch": 0.64,
-      "grad_norm": 0.049459639807936356,
-      "learning_rate": 6.772335138246548e-05,
-      "loss": 0.7234,
       "step": 435
     },
     {
-      "epoch": 0.65,
-      "grad_norm": 0.05981206570373218,
-      "learning_rate": 6.528513144229255e-05,
-      "loss": 0.7063,
       "step": 440
     },
     {
-      "epoch": 0.66,
-      "grad_norm": 0.042858944283255956,
-      "learning_rate": 6.287015779262064e-05,
-      "loss": 0.7178,
       "step": 445
     },
     {
-      "epoch": 0.67,
-      "grad_norm": 0.051605890455677136,
-      "learning_rate": 6.048004758429451e-05,
-      "loss": 0.8009,
       "step": 450
     },
     {
-      "epoch": 0.67,
-      "grad_norm": 0.04839278716779415,
-      "learning_rate": 5.8116401318728667e-05,
-      "loss": 0.7969,
       "step": 455
     },
     {
-      "epoch": 0.68,
-      "grad_norm": 0.053679686497026036,
-      "learning_rate": 5.578080177615575e-05,
-      "loss": 0.7744,
       "step": 460
     },
     {
-      "epoch": 0.69,
-      "grad_norm": 0.04886589382975143,
-      "learning_rate": 5.3474812955741404e-05,
-      "loss": 0.782,
       "step": 465
     },
     {
-      "epoch": 0.7,
-      "grad_norm": 0.05872104081042249,
-      "learning_rate": 5.119997902827584e-05,
-      "loss": 0.7684,
       "step": 470
     },
     {
-      "epoch": 0.7,
-      "grad_norm": 0.04971188432483476,
-      "learning_rate": 4.895782330214291e-05,
-      "loss": 0.8219,
       "step": 475
     },
     {
-      "epoch": 0.71,
-      "grad_norm": 0.05137448628344103,
-      "learning_rate": 4.674984720325961e-05,
-      "loss": 0.7654,
       "step": 480
     },
     {
-      "epoch": 0.72,
-      "grad_norm": 0.04686654449943715,
-      "learning_rate": 4.4577529269668874e-05,
-      "loss": 0.7774,
       "step": 485
     },
     {
-      "epoch": 0.73,
-      "grad_norm": 0.055295260668893974,
-      "learning_rate": 4.244232416145839e-05,
-      "loss": 0.7245,
       "step": 490
     },
     {
-      "epoch": 0.73,
-      "grad_norm": 0.05389337215866635,
-      "learning_rate": 4.0345661686669745e-05,
-      "loss": 0.8061,
       "step": 495
     },
     {
-      "epoch": 0.74,
-      "grad_norm": 0.05870235717745641,
-      "learning_rate": 3.828894584384867e-05,
-      "loss": 0.8031,
       "step": 500
     },
     {
-      "epoch": 0.75,
-      "grad_norm": 0.05571889022670846,
-      "learning_rate": 3.62735538818787e-05,
-      "loss": 0.7614,
       "step": 505
     },
     {
-      "epoch": 0.76,
-      "grad_norm": 0.04492834518434723,
-      "learning_rate": 3.43008353777269e-05,
-      "loss": 0.7331,
       "step": 510
     },
     {
-      "epoch": 0.76,
-      "grad_norm": 0.05343462218042988,
-      "learning_rate": 3.237211133272004e-05,
-      "loss": 0.7355,
       "step": 515
     },
     {
-      "epoch": 0.77,
-      "grad_norm": 0.047988801277496954,
-      "learning_rate": 3.0488673287955882e-05,
-      "loss": 0.7237,
       "step": 520
     },
     {
-      "epoch": 0.78,
-      "grad_norm": 0.055886245680751935,
-      "learning_rate": 2.8651782459442176e-05,
-      "loss": 0.7426,
       "step": 525
     },
     {
-      "epoch": 0.79,
-      "grad_norm": 0.04696112028105608,
-      "learning_rate": 2.686266889354211e-05,
-      "loss": 0.7487,
       "step": 530
     },
     {
-      "epoch": 0.79,
-      "grad_norm": 0.04555764819319834,
-      "learning_rate": 2.5122530643292275e-05,
-      "loss": 0.7344,
       "step": 535
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 0.05199303289710418,
-      "learning_rate": 2.3432532966144527e-05,
-      "loss": 0.7604,
       "step": 540
     },
     {
-      "epoch": 0.81,
-      "grad_norm": 0.05146492861699787,
-      "learning_rate": 2.1793807543668853e-05,
-      "loss": 0.7383,
       "step": 545
     },
     {
-      "epoch": 0.81,
-      "grad_norm": 0.05548374949115557,
-      "learning_rate": 2.0207451723739633e-05,
-      "loss": 0.7565,
       "step": 550
     },
     {
-      "epoch": 0.82,
-      "grad_norm": 0.04718878121287069,
-      "learning_rate": 1.8674527785713247e-05,
-      "loss": 0.7889,
       "step": 555
     },
     {
-      "epoch": 0.83,
-      "grad_norm": 0.07223231382050395,
-      "learning_rate": 1.7196062229088604e-05,
-      "loss": 0.7734,
       "step": 560
     },
     {
-      "epoch": 0.84,
-      "grad_norm": 0.051918878796507154,
-      "learning_rate": 1.577304508612717e-05,
-      "loss": 0.7697,
       "step": 565
     },
     {
-      "epoch": 0.84,
-      "grad_norm": 0.05143371773283759,
-      "learning_rate": 1.4406429258892762e-05,
-      "loss": 0.7591,
       "step": 570
     },
     {
-      "epoch": 0.85,
-      "grad_norm": 0.06242991163796485,
-      "learning_rate": 1.3097129881154934e-05,
-      "loss": 0.7888,
       "step": 575
     },
     {
-      "epoch": 0.86,
-      "grad_norm": 0.05577486269105434,
-      "learning_rate": 1.1846023705583442e-05,
-      "loss": 0.7503,
       "step": 580
     },
     {
-      "epoch": 0.87,
-      "grad_norm": 0.05386359623343792,
-      "learning_rate": 1.065394851664394e-05,
-      "loss": 0.7306,
       "step": 585
     },
     {
-      "epoch": 0.87,
-      "grad_norm": 0.06740139242925512,
-      "learning_rate": 9.521702569588198e-06,
-      "loss": 0.7748,
       "step": 590
     },
     {
-      "epoch": 0.88,
-      "grad_norm": 0.05952304608258396,
-      "learning_rate": 8.450044055914497e-06,
-      "loss": 0.6941,
       "step": 595
     },
     {
-      "epoch": 0.89,
-      "grad_norm": 0.06052761239891292,
-      "learning_rate": 7.439690595656013e-06,
-      "loss": 0.7775,
       "step": 600
     },
     {
-      "epoch": 0.9,
-      "grad_norm": 0.05646485799009386,
-      "learning_rate": 6.4913187568374164e-06,
-      "loss": 0.7941,
       "step": 605
     },
     {
-      "epoch": 0.9,
-      "grad_norm": 0.051627777900137006,
-      "learning_rate": 5.605563602421149e-06,
-      "loss": 0.7621,
       "step": 610
     },
     {
-      "epoch": 0.91,
-      "grad_norm": 0.05672370227304409,
-      "learning_rate": 4.783018265047179e-06,
-      "loss": 0.7598,
       "step": 615
     },
     {
-      "epoch": 0.92,
-      "grad_norm": 0.04806888785096822,
-      "learning_rate": 4.024233549850509e-06,
-      "loss": 0.7585,
       "step": 620
     },
     {
-      "epoch": 0.93,
-      "grad_norm": 0.054256686487143276,
-      "learning_rate": 3.329717565622825e-06,
-      "loss": 0.7766,
       "step": 625
     },
     {
-      "epoch": 0.93,
-      "grad_norm": 0.04081148671596208,
-      "learning_rate": 2.699935384565111e-06,
-      "loss": 0.7324,
       "step": 630
     },
     {
-      "epoch": 0.94,
-      "grad_norm": 0.052421185625722275,
-      "learning_rate": 2.1353087308590314e-06,
-      "loss": 0.7933,
       "step": 635
     },
     {
-      "epoch": 0.95,
-      "grad_norm": 0.05392813215656955,
-      "learning_rate": 1.6362156982656084e-06,
-      "loss": 0.7896,
       "step": 640
     },
     {
-      "epoch": 0.96,
-      "grad_norm": 0.052109378844003566,
-      "learning_rate": 1.2029904969404482e-06,
-      "loss": 0.7633,
       "step": 645
     },
     {
-      "epoch": 0.96,
-      "grad_norm": 0.05562712893622207,
-      "learning_rate": 8.359232296349162e-07,
-      "loss": 0.7664,
       "step": 650
     },
     {
-      "epoch": 0.97,
-      "grad_norm": 0.05328976304172556,
-      "learning_rate": 5.352596974332436e-07,
-      "loss": 0.7658,
       "step": 655
     },
     {
-      "epoch": 0.98,
-      "grad_norm": 0.05351883682294467,
-      "learning_rate": 3.0120123515540164e-07,
-      "loss": 0.7871,
       "step": 660
     },
     {
-      "epoch": 0.99,
-      "grad_norm": 0.05900811011725127,
-      "learning_rate": 1.3390457653639222e-07,
-      "loss": 0.7749,
       "step": 665
     },
     {
-      "epoch": 0.99,
-      "grad_norm": 0.06175235611664889,
-      "learning_rate": 3.3481749271768726e-08,
-      "loss": 0.7353,
       "step": 670
     },
     {
-      "epoch": 1.0,
-      "grad_norm": 0.05208882374001457,
-      "learning_rate": 0.0,
-      "loss": 0.7591,
       "step": 675
     },
     {
       "epoch": 1.0,
       "eval_loss": NaN,
-      "eval_runtime": 2998.9455,
-      "eval_samples_per_second": 0.77,
-      "eval_steps_per_second": 0.193,
-      "step": 675
     },
     {
       "epoch": 1.0,
-      "step": 675,
-      "total_flos": 2.235287773328179e+16,
-      "train_loss": 0.7647893634548893,
-      "train_runtime": 21987.6737,
-      "train_samples_per_second": 0.982,
-      "train_steps_per_second": 0.031
     }
   ],
   "logging_steps": 5,
-  "max_steps": 675,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
-  "total_flos": 2.235287773328179e+16,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.9995189995189995,
   "eval_steps": 500,
+  "global_step": 1039,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.0,
+      "grad_norm": 0.026559076829896673,
+      "learning_rate": 1.9230769230769234e-06,
+      "loss": 0.8553,
       "step": 1
     },
     {
+      "epoch": 0.0,
+      "grad_norm": 0.024109469955629563,
+      "learning_rate": 9.615384615384616e-06,
+      "loss": 0.8949,
       "step": 5
     },
     {
       "epoch": 0.01,
+      "grad_norm": 0.035843143099576466,
+      "learning_rate": 1.923076923076923e-05,
+      "loss": 0.8487,
       "step": 10
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 0.02927511973728809,
+      "learning_rate": 2.8846153846153845e-05,
+      "loss": 0.8298,
       "step": 15
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 0.0482470347406101,
+      "learning_rate": 3.846153846153846e-05,
+      "loss": 0.8369,
       "step": 20
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 0.04787111054382548,
+      "learning_rate": 4.8076923076923084e-05,
+      "loss": 0.8625,
       "step": 25
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 0.09068040679248557,
+      "learning_rate": 5.769230769230769e-05,
+      "loss": 0.8133,
       "step": 30
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 0.07412802798136442,
+      "learning_rate": 6.730769230769232e-05,
+      "loss": 0.8543,
       "step": 35
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 0.0699997088327299,
+      "learning_rate": 7.692307692307693e-05,
+      "loss": 0.763,
       "step": 40
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 0.05170267487139005,
+      "learning_rate": 8.653846153846155e-05,
+      "loss": 0.7827,
       "step": 45
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 0.07357073672675946,
+      "learning_rate": 9.615384615384617e-05,
+      "loss": 0.8022,
       "step": 50
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 0.05641137867224088,
+      "learning_rate": 0.00010576923076923077,
+      "loss": 0.7397,
       "step": 55
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 0.057002882272420445,
+      "learning_rate": 0.00011538461538461538,
+      "loss": 0.777,
       "step": 60
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 0.050844397685080686,
+      "learning_rate": 0.000125,
+      "loss": 0.7664,
       "step": 65
     },
     {
+      "epoch": 0.07,
+      "grad_norm": 0.05336016241159068,
+      "learning_rate": 0.00013461538461538464,
+      "loss": 0.7956,
       "step": 70
     },
     {
+      "epoch": 0.07,
+      "grad_norm": 0.05359842093910465,
+      "learning_rate": 0.00014423076923076924,
+      "loss": 0.751,
       "step": 75
     },
     {
+      "epoch": 0.08,
+      "grad_norm": 0.043971570409194735,
+      "learning_rate": 0.00015384615384615385,
+      "loss": 0.7823,
       "step": 80
     },
     {
+      "epoch": 0.08,
+      "grad_norm": 0.044563802592992065,
+      "learning_rate": 0.00016346153846153846,
+      "loss": 0.7908,
       "step": 85
     },
     {
+      "epoch": 0.09,
+      "grad_norm": 0.04801183812641932,
+      "learning_rate": 0.0001730769230769231,
+      "loss": 0.827,
       "step": 90
     },
     {
+      "epoch": 0.09,
+      "grad_norm": 0.0522380719803802,
+      "learning_rate": 0.0001826923076923077,
+      "loss": 0.8225,
       "step": 95
     },
     {
+      "epoch": 0.1,
+      "grad_norm": 0.048043288977650755,
+      "learning_rate": 0.00019230769230769233,
+      "loss": 0.7715,
       "step": 100
     },
     {
+      "epoch": 0.1,
+      "grad_norm": 0.04594631744893944,
+      "learning_rate": 0.00019999943552317104,
+      "loss": 0.7789,
       "step": 105
     },
     {
+      "epoch": 0.11,
+      "grad_norm": 0.050805778293244806,
+      "learning_rate": 0.00019997967950328128,
+      "loss": 0.8401,
       "step": 110
     },
     {
+      "epoch": 0.11,
+      "grad_norm": 0.04435482092479941,
+      "learning_rate": 0.0001999317060143023,
+      "loss": 0.7773,
       "step": 115
     },
     {
+      "epoch": 0.12,
+      "grad_norm": 0.05022795262704976,
+      "learning_rate": 0.0001998555285958899,
+      "loss": 0.7271,
       "step": 120
     },
     {
+      "epoch": 0.12,
+      "grad_norm": 0.05316130307831719,
+      "learning_rate": 0.00019975116874775242,
+      "loss": 0.8088,
       "step": 125
     },
     {
+      "epoch": 0.13,
+      "grad_norm": 0.04630310630357938,
+      "learning_rate": 0.00019961865592358288,
+      "loss": 0.7752,
       "step": 130
     },
     {
+      "epoch": 0.13,
+      "grad_norm": 0.05955477676937173,
+      "learning_rate": 0.0001994580275227462,
+      "loss": 0.7639,
       "step": 135
     },
     {
+      "epoch": 0.13,
+      "grad_norm": 0.04911523657827075,
+      "learning_rate": 0.00019926932887972393,
+      "loss": 0.7476,
       "step": 140
     },
     {
+      "epoch": 0.14,
+      "grad_norm": 0.04365854699386244,
+      "learning_rate": 0.0001990526132513194,
+      "loss": 0.7671,
       "step": 145
     },
     {
+      "epoch": 0.14,
+      "grad_norm": 0.0479590041051757,
+      "learning_rate": 0.00019880794180162693,
+      "loss": 0.8015,
       "step": 150
     },
     {
+      "epoch": 0.15,
+      "grad_norm": 0.053032592897760425,
+      "learning_rate": 0.00019853538358476932,
+      "loss": 0.8114,
       "step": 155
     },
     {
+      "epoch": 0.15,
+      "grad_norm": 0.04823282437342225,
+      "learning_rate": 0.00019823501552540865,
+      "loss": 0.7843,
       "step": 160
     },
     {
+      "epoch": 0.16,
+      "grad_norm": 0.13942036529350957,
+      "learning_rate": 0.00019790692239703557,
+      "loss": 0.7066,
       "step": 165
     },
     {
+      "epoch": 0.16,
+      "grad_norm": 0.05813669118246924,
+      "learning_rate": 0.00019755119679804367,
+      "loss": 0.7782,
       "step": 170
     },
     {
+      "epoch": 0.17,
+      "grad_norm": 0.045133845682998344,
+      "learning_rate": 0.00019716793912559507,
+      "loss": 0.8228,
       "step": 175
     },
     {
+      "epoch": 0.17,
+      "grad_norm": 0.04767176858103638,
+      "learning_rate": 0.00019675725754728527,
+      "loss": 0.8016,
       "step": 180
     },
     {
+      "epoch": 0.18,
+      "grad_norm": 0.05800072730359953,
+      "learning_rate": 0.00019631926797061456,
+      "loss": 0.7576,
       "step": 185
     },
     {
+      "epoch": 0.18,
+      "grad_norm": 0.044752361745923355,
+      "learning_rate": 0.00019585409401027556,
+      "loss": 0.7311,
       "step": 190
     },
     {
+      "epoch": 0.19,
+      "grad_norm": 0.0436163176892526,
+      "learning_rate": 0.00019536186695326486,
+      "loss": 0.7584,
       "step": 195
     },
     {
+      "epoch": 0.19,
+      "grad_norm": 0.07116796220713574,
+      "learning_rate": 0.00019484272572182986,
+      "loss": 0.7525,
       "step": 200
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 0.051057018027582515,
+      "learning_rate": 0.00019429681683426022,
+      "loss": 0.798,
       "step": 205
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 0.06621856781536847,
+      "learning_rate": 0.00019372429436353606,
+      "loss": 0.7242,
       "step": 210
     },
     {
+      "epoch": 0.21,
+      "grad_norm": 0.05280223873501623,
+      "learning_rate": 0.0001931253198938432,
+      "loss": 0.8013,
       "step": 215
     },
     {
+      "epoch": 0.21,
+      "grad_norm": 0.049913511617762696,
+      "learning_rate": 0.00019250006247496928,
+      "loss": 0.7282,
       "step": 220
     },
     {
+      "epoch": 0.22,
+      "grad_norm": 0.057224419119031235,
+      "learning_rate": 0.00019184869857459232,
+      "loss": 0.7986,
       "step": 225
     },
     {
+      "epoch": 0.22,
+      "grad_norm": 0.062107090513119266,
+      "learning_rate": 0.00019117141202847586,
+      "loss": 0.7305,
       "step": 230
     },
     {
+      "epoch": 0.23,
+      "grad_norm": 0.06236876490128247,
+      "learning_rate": 0.00019046839398858474,
+      "loss": 0.7961,
       "step": 235
     },
     {
+      "epoch": 0.23,
+      "grad_norm": 0.04936400763486868,
+      "learning_rate": 0.00018973984286913584,
+      "loss": 0.735,
       "step": 240
     },
     {
+      "epoch": 0.24,
+      "grad_norm": 0.053408247386716914,
+      "learning_rate": 0.0001889859642905992,
+      "loss": 0.7857,
       "step": 245
     },
     {
+      "epoch": 0.24,
+      "grad_norm": 0.05781811933339179,
+      "learning_rate": 0.00018820697102166526,
+      "loss": 0.7627,
       "step": 250
     },
     {
+      "epoch": 0.25,
+      "grad_norm": 0.05674627874967702,
+      "learning_rate": 0.00018740308291919497,
+      "loss": 0.7492,
       "step": 255
     },
     {
+      "epoch": 0.25,
+      "grad_norm": 0.05113375946042358,
+      "learning_rate": 0.0001865745268661689,
+      "loss": 0.8117,
       "step": 260
     },
     {
+      "epoch": 0.25,
+      "grad_norm": 0.06186000436202041,
+      "learning_rate": 0.00018572153670765365,
+      "loss": 0.801,
       "step": 265
     },
     {
+      "epoch": 0.26,
+      "grad_norm": 0.07562426320620609,
+      "learning_rate": 0.00018484435318480332,
+      "loss": 0.8071,
       "step": 270
     },
     {
+      "epoch": 0.26,
+      "grad_norm": 0.06311677584919179,
+      "learning_rate": 0.0001839432238669147,
+      "loss": 0.7843,
       "step": 275
     },
     {
+      "epoch": 0.27,
+      "grad_norm": 0.06630091865014885,
+      "learning_rate": 0.00018301840308155507,
+      "loss": 0.7493,
       "step": 280
     },
     {
+      "epoch": 0.27,
+      "grad_norm": 0.06544583135680962,
+      "learning_rate": 0.00018207015184278305,
+      "loss": 0.782,
       "step": 285
     },
     {
+      "epoch": 0.28,
+      "grad_norm": 0.06541978841910906,
+      "learning_rate": 0.000181098737777482,
+      "loss": 0.766,
       "step": 290
     },
     {
+      "epoch": 0.28,
+      "grad_norm": 0.05791188755333043,
+      "learning_rate": 0.00018010443504982694,
+      "loss": 0.7499,
       "step": 295
     },
     {
+      "epoch": 0.29,
+      "grad_norm": 0.06550833011506903,
+      "learning_rate": 0.000179087524283907,
+      "loss": 0.8137,
       "step": 300
     },
     {
+      "epoch": 0.29,
+      "grad_norm": 0.058622811505104906,
+      "learning_rate": 0.00017804829248452395,
+      "loss": 0.7512,
       "step": 305
     },
     {
+      "epoch": 0.3,
+      "grad_norm": 0.0646444641030475,
+      "learning_rate": 0.00017698703295619052,
+      "loss": 0.7908,
       "step": 310
     },
     {
+      "epoch": 0.3,
+      "grad_norm": 0.06559061358782307,
+      "learning_rate": 0.00017590404522035028,
+      "loss": 0.7308,
       "step": 315
     },
     {
+      "epoch": 0.31,
+      "grad_norm": 0.06308845309249086,
+      "learning_rate": 0.00017479963493084329,
+      "loss": 0.7643,
       "step": 320
     },
     {
+      "epoch": 0.31,
+      "grad_norm": 0.06406066039467145,
+      "learning_rate": 0.0001736741137876405,
+      "loss": 0.7775,
       "step": 325
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 0.06444860497739553,
+      "learning_rate": 0.00017252779944887235,
+      "loss": 0.7774,
       "step": 330
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 0.06604642339408012,
+      "learning_rate": 0.00017136101544117525,
+      "loss": 0.7362,
       "step": 335
     },
     {
+      "epoch": 0.33,
+      "grad_norm": 0.06850287335544007,
+      "learning_rate": 0.00017017409106838207,
+      "loss": 0.7501,
       "step": 340
     },
     {
+      "epoch": 0.33,
+      "grad_norm": 0.06459535279086648,
+      "learning_rate": 0.00016896736131858208,
+      "loss": 0.7606,
       "step": 345
     },
     {
+      "epoch": 0.34,
+      "grad_norm": 0.06245922169687922,
+      "learning_rate": 0.0001677411667695765,
+      "loss": 0.7459,
       "step": 350
     },
     {
+      "epoch": 0.34,
+      "grad_norm": 0.0618217626323294,
+      "learning_rate": 0.00016649585349275662,
+      "loss": 0.7608,
       "step": 355
     },
     {
+      "epoch": 0.35,
+      "grad_norm": 0.06529458738837608,
+      "learning_rate": 0.0001652317729554313,
+      "loss": 0.7793,
       "step": 360
     },
     {
+      "epoch": 0.35,
+      "grad_norm": 0.07281345190056655,
+      "learning_rate": 0.0001639492819216316,
+      "loss": 0.7769,
       "step": 365
     },
     {
+      "epoch": 0.36,
+      "grad_norm": 0.07253822403948054,
+      "learning_rate": 0.0001626487423514207,
+      "loss": 0.7699,
       "step": 370
     },
     {
+      "epoch": 0.36,
+      "grad_norm": 0.059915171172645505,
+      "learning_rate": 0.00016133052129873693,
+      "loss": 0.7426,
       "step": 375
     },
     {
+      "epoch": 0.37,
+      "grad_norm": 0.06063815927279327,
+      "learning_rate": 0.0001599949908077996,
+      "loss": 0.7859,
       "step": 380
     },
     {
+      "epoch": 0.37,
+      "grad_norm": 0.07982151033712452,
+      "learning_rate": 0.00015864252780810616,
+      "loss": 0.7484,
       "step": 385
     },
     {
+      "epoch": 0.38,
+      "grad_norm": 0.07807371830206032,
+      "learning_rate": 0.00015727351400805052,
+      "loss": 0.7318,
       "step": 390
     },
     {
+      "epoch": 0.38,
+      "grad_norm": 0.06359423217230728,
+      "learning_rate": 0.0001558883357871928,
+      "loss": 0.7707,
       "step": 395
     },
     {
+      "epoch": 0.38,
+      "grad_norm": 0.09384048396706658,
+      "learning_rate": 0.00015448738408721052,
+      "loss": 0.7869,
       "step": 400
     },
     {
+      "epoch": 0.39,
+      "grad_norm": 0.07059181179022768,
+      "learning_rate": 0.00015307105430156255,
+      "loss": 0.7139,
       "step": 405
     },
     {
+      "epoch": 0.39,
+      "grad_norm": 0.07407320984033573,
+      "learning_rate": 0.0001516397461638962,
+      "loss": 0.7476,
       "step": 410
     },
     {
+      "epoch": 0.4,
+      "grad_norm": 0.07240911601504253,
+      "learning_rate": 0.0001501938636352297,
+      "loss": 0.7655,
       "step": 415
     },
     {
+      "epoch": 0.4,
+      "grad_norm": 0.07068099666527985,
+      "learning_rate": 0.00014873381478994134,
+      "loss": 0.7893,
       "step": 420
     },
     {
+      "epoch": 0.41,
+      "grad_norm": 0.08305359605908401,
+      "learning_rate": 0.00014726001170059792,
+      "loss": 0.7111,
       "step": 425
     },
     {
+      "epoch": 0.41,
+      "grad_norm": 0.0670085750282625,
+      "learning_rate": 0.00014577287032165468,
+      "loss": 0.7527,
       "step": 430
     },
     {
+      "epoch": 0.42,
+      "grad_norm": 0.06789518226534799,
+      "learning_rate": 0.00014427281037205945,
+      "loss": 0.7751,
       "step": 435
     },
     {
+      "epoch": 0.42,
+      "grad_norm": 0.09136108168814464,
+      "learning_rate": 0.00014276025521679471,
+      "loss": 0.726,
       "step": 440
     },
     {
+      "epoch": 0.43,
+      "grad_norm": 0.06956344474331025,
+      "learning_rate": 0.00014123563174739037,
+      "loss": 0.8187,
       "step": 445
     },
     {
+      "epoch": 0.43,
+      "grad_norm": 0.07271098739476833,
+      "learning_rate": 0.00013969937026144118,
+      "loss": 0.7787,
       "step": 450
     },
     {
+      "epoch": 0.44,
+      "grad_norm": 0.06984149301611005,
+      "learning_rate": 0.00013815190434116317,
+      "loss": 0.7873,
       "step": 455
     },
     {
+      "epoch": 0.44,
+      "grad_norm": 0.07335035453389814,
+      "learning_rate": 0.00013659367073102268,
+      "loss": 0.7609,
       "step": 460
     },
     {
+      "epoch": 0.45,
+      "grad_norm": 0.0744105188121296,
+      "learning_rate": 0.00013502510921447323,
+      "loss": 0.7169,
       "step": 465
     },
     {
+      "epoch": 0.45,
+      "grad_norm": 0.06947572989302908,
+      "learning_rate": 0.00013344666248983432,
+      "loss": 0.7837,
       "step": 470
     },
     {
+      "epoch": 0.46,
+      "grad_norm": 0.07999096873197906,
+      "learning_rate": 0.000131858776045348,
+      "loss": 0.7727,
       "step": 475
     },
     {
+      "epoch": 0.46,
+      "grad_norm": 0.07774452098436961,
+      "learning_rate": 0.00013026189803344774,
+      "loss": 0.8242,
       "step": 480
     },
     {
+      "epoch": 0.47,
+      "grad_norm": 0.07801636818427328,
+      "learning_rate": 0.00012865647914427544,
+      "loss": 0.7269,
       "step": 485
     },
     {
+      "epoch": 0.47,
+      "grad_norm": 0.11086472292549841,
+      "learning_rate": 0.00012704297247848216,
+      "loss": 0.7503,
       "step": 490
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 0.0724703886664607,
+      "learning_rate": 0.00012542183341934872,
+      "loss": 0.81,
       "step": 495
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 0.08424198871373768,
+      "learning_rate": 0.00012379351950426187,
+      "loss": 0.7102,
       "step": 500
     },
     {
+      "epoch": 0.49,
+      "grad_norm": 0.07399506224237913,
+      "learning_rate": 0.0001221584902955827,
+      "loss": 0.811,
       "step": 505
     },
     {
+      "epoch": 0.49,
+      "grad_norm": 0.07513530266538687,
+      "learning_rate": 0.00012051720725094324,
+      "loss": 0.7328,
       "step": 510
     },
     {
+      "epoch": 0.5,
+      "grad_norm": 0.08300665590930759,
+      "learning_rate": 0.00011887013359300837,
+      "loss": 0.7728,
       "step": 515
     },
     {
+      "epoch": 0.5,
+      "grad_norm": 0.08603160091015216,
+      "learning_rate": 0.00011721773417873965,
+      "loss": 0.8092,
       "step": 520
     },
     {
+      "epoch": 0.51,
+      "grad_norm": 0.06828684892985817,
+      "learning_rate": 0.00011556047536819777,
+      "loss": 0.7905,
       "step": 525
     },
     {
+      "epoch": 0.51,
+      "grad_norm": 0.0888375409363043,
+      "learning_rate": 0.00011389882489292061,
+      "loss": 0.7616,
       "step": 530
     },
     {
+      "epoch": 0.51,
+      "grad_norm": 0.07153992771335722,
+      "learning_rate": 0.0001122332517239147,
+      "loss": 0.7231,
       "step": 535
     },
     {
+      "epoch": 0.52,
+      "grad_norm": 0.07078385208785225,
+      "learning_rate": 0.00011056422593929635,
+      "loss": 0.7744,
       "step": 540
     },
     {
+      "epoch": 0.52,
+      "grad_norm": 0.06724179763601197,
+      "learning_rate": 0.00010889221859162062,
+      "loss": 0.7385,
       "step": 545
     },
     {
+      "epoch": 0.53,
+      "grad_norm": 0.08127731002438206,
+      "learning_rate": 0.00010721770157493527,
+      "loss": 0.737,
       "step": 550
     },
     {
+      "epoch": 0.53,
+      "grad_norm": 0.07971841929977522,
+      "learning_rate": 0.000105541147491597,
+      "loss": 0.7129,
       "step": 555
     },
     {
+      "epoch": 0.54,
+      "grad_norm": 0.09905140750790473,
+      "learning_rate": 0.00010386302951888804,
+      "loss": 0.7682,
       "step": 560
     },
     {
+      "epoch": 0.54,
+      "grad_norm": 0.07787324370915785,
+      "learning_rate": 0.00010218382127547022,
+      "loss": 0.7988,
       "step": 565
     },
     {
+      "epoch": 0.55,
+      "grad_norm": 0.06979898484314451,
+      "learning_rate": 0.00010050399668771479,
+      "loss": 0.7505,
       "step": 570
     },
     {
+      "epoch": 0.55,
+      "grad_norm": 0.08294483662862129,
+      "learning_rate": 9.882402985594515e-05,
+      "loss": 0.7254,
       "step": 575
     },
     {
+      "epoch": 0.56,
+      "grad_norm": 0.09404505448514887,
+      "learning_rate": 9.71443949206304e-05,
+      "loss": 0.7744,
       "step": 580
     },
     {
+      "epoch": 0.56,
+      "grad_norm": 0.08296480696219607,
+      "learning_rate": 9.546556592856789e-05,
+      "loss": 0.7255,
       "step": 585
     },
     {
+      "epoch": 0.57,
+      "grad_norm": 0.08380203772972816,
+      "learning_rate": 9.378801669909197e-05,
+      "loss": 0.6704,
       "step": 590
     },
     {
+      "epoch": 0.57,
+      "grad_norm": 0.09814150545542286,
+      "learning_rate": 9.211222069034695e-05,
+      "loss": 0.7107,
       "step": 595
     },
     {
+      "epoch": 0.58,
+      "grad_norm": 0.0951778654867295,
+      "learning_rate": 9.043865086566214e-05,
+      "loss": 0.7158,
       "step": 600
     },
     {
+      "epoch": 0.58,
+      "grad_norm": 0.08511949253631121,
+      "learning_rate": 8.87677795600663e-05,
+      "loss": 0.7572,
       "step": 605
     },
     {
+      "epoch": 0.59,
+      "grad_norm": 0.1143213911299264,
+      "learning_rate": 8.710007834697969e-05,
+      "loss": 0.7785,
       "step": 610
     },
     {
+      "epoch": 0.59,
+      "grad_norm": 0.08008142575287244,
+      "learning_rate": 8.543601790512083e-05,
+      "loss": 0.7327,
       "step": 615
     },
     {
+      "epoch": 0.6,
+      "grad_norm": 0.09526058947958355,
+      "learning_rate": 8.377606788566597e-05,
+      "loss": 0.703,
       "step": 620
     },
     {
+      "epoch": 0.6,
+      "grad_norm": 0.08273005979279956,
+      "learning_rate": 8.212069677969851e-05,
+      "loss": 0.7497,
       "step": 625
     },
     {
+      "epoch": 0.61,
+      "grad_norm": 0.08864939979402765,
+      "learning_rate": 8.047037178598567e-05,
+      "loss": 0.7573,
       "step": 630
     },
     {
+      "epoch": 0.61,
+      "grad_norm": 0.08394557070047488,
+      "learning_rate": 7.882555867912017e-05,
+      "loss": 0.7827,
       "step": 635
     },
     {
+      "epoch": 0.62,
+      "grad_norm": 0.09978852942456092,
+      "learning_rate": 7.718672167806354e-05,
+      "loss": 0.7201,
       "step": 640
     },
     {
+      "epoch": 0.62,
+      "grad_norm": 0.07987549874350373,
+      "learning_rate": 7.55543233151289e-05,
+      "loss": 0.7129,
       "step": 645
     },
     {
+      "epoch": 0.63,
+      "grad_norm": 0.09662339979538469,
+      "learning_rate": 7.392882430543928e-05,
+      "loss": 0.7593,
       "step": 650
     },
     {
+      "epoch": 0.63,
+      "grad_norm": 0.0966150953702372,
+      "learning_rate": 7.231068341689923e-05,
+      "loss": 0.6704,
       "step": 655
     },
     {
+      "epoch": 0.63,
+      "grad_norm": 0.08961534426031717,
+      "learning_rate": 7.070035734071574e-05,
+      "loss": 0.781,
       "step": 660
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 0.09684345543910731,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.7787,
       "step": 665
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 0.10518057650982461,
+      "learning_rate": 6.750496523402352e-05,
+      "loss": 0.7658,
       "step": 670
     },
     {
+      "epoch": 0.65,
+      "grad_norm": 0.08726496496105966,
+      "learning_rate": 6.592080104555357e-05,
+      "loss": 0.7515,
       "step": 675
     },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.0933748204446253,
+      "learning_rate": 6.434625509898897e-05,
+      "loss": 0.7474,
+      "step": 680
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.09987049741300834,
+      "learning_rate": 6.278177178164721e-05,
+      "loss": 0.7458,
+      "step": 685
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.10265280285088377,
+      "learning_rate": 6.122779264084932e-05,
+      "loss": 0.7194,
+      "step": 690
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.09519605624873385,
+      "learning_rate": 5.968475625930124e-05,
+      "loss": 0.7788,
+      "step": 695
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.10432725726790697,
+      "learning_rate": 5.815309813131153e-05,
+      "loss": 0.6987,
+      "step": 700
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.0895838664526722,
+      "learning_rate": 5.663325053988112e-05,
+      "loss": 0.7438,
+      "step": 705
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.1020683075125265,
+      "learning_rate": 5.5125642434699044e-05,
+      "loss": 0.7329,
+      "step": 710
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.1254242358800245,
+      "learning_rate": 5.363069931107902e-05,
+      "loss": 0.7701,
+      "step": 715
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.09171973242978361,
+      "learning_rate": 5.214884308987136e-05,
+      "loss": 0.7614,
+      "step": 720
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.0973938129022939,
+      "learning_rate": 5.068049199838307e-05,
+      "loss": 0.7654,
+      "step": 725
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.10651861956923717,
+      "learning_rate": 4.9226060452340825e-05,
+      "loss": 0.7459,
+      "step": 730
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.10089952208386434,
+      "learning_rate": 4.7785958938929644e-05,
+      "loss": 0.7259,
+      "step": 735
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.1210514333134373,
+      "learning_rate": 4.6360593900940074e-05,
+      "loss": 0.7434,
+      "step": 740
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.09318485524858554,
+      "learning_rate": 4.4950367622057173e-05,
+      "loss": 0.7452,
+      "step": 745
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.0982653281602369,
+      "learning_rate": 4.355567811332311e-05,
+      "loss": 0.7647,
+      "step": 750
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.10433832624807006,
+      "learning_rate": 4.21769190008056e-05,
+      "loss": 0.7786,
+      "step": 755
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.10945535727159489,
+      "learning_rate": 4.081447941450428e-05,
+      "loss": 0.7534,
+      "step": 760
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.08979908942148239,
+      "learning_rate": 3.946874387852545e-05,
+      "loss": 0.7684,
+      "step": 765
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.10406027811162083,
+      "learning_rate": 3.8140092202557185e-05,
+      "loss": 0.722,
+      "step": 770
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.1035268479061034,
+      "learning_rate": 3.682889937467493e-05,
+      "loss": 0.7553,
+      "step": 775
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.10887919806436043,
+      "learning_rate": 3.553553545550768e-05,
+      "loss": 0.7246,
+      "step": 780
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.09604076773803726,
+      "learning_rate": 3.426036547379528e-05,
+      "loss": 0.7608,
+      "step": 785
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.10956385216460547,
+      "learning_rate": 3.300374932336533e-05,
+      "loss": 0.7338,
+      "step": 790
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.10172528555623694,
+      "learning_rate": 3.176604166155976e-05,
+      "loss": 0.7495,
+      "step": 795
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.10744826094269415,
+      "learning_rate": 3.054759180913921e-05,
+      "loss": 0.8015,
+      "step": 800
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.09531428953628962,
+      "learning_rate": 2.9348743651693357e-05,
+      "loss": 0.7432,
+      "step": 805
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.09578814140538712,
+      "learning_rate": 2.8169835542585587e-05,
+      "loss": 0.6876,
+      "step": 810
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.10906430952489729,
+      "learning_rate": 2.7011200207458677e-05,
+      "loss": 0.7461,
+      "step": 815
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.09924057812689555,
+      "learning_rate": 2.5873164650328996e-05,
+      "loss": 0.7403,
+      "step": 820
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.102534066373728,
+      "learning_rate": 2.4756050061295534e-05,
+      "loss": 0.7771,
+      "step": 825
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.11379854278214475,
+      "learning_rate": 2.36601717258897e-05,
+      "loss": 0.7494,
+      "step": 830
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.09555943569834097,
+      "learning_rate": 2.2585838936091754e-05,
+      "loss": 0.7062,
+      "step": 835
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.08701341120074446,
+      "learning_rate": 2.153335490303856e-05,
+      "loss": 0.7029,
+      "step": 840
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.10955955931583401,
+      "learning_rate": 2.0503016671447785e-05,
+      "loss": 0.7119,
+      "step": 845
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.09961678142736638,
+      "learning_rate": 1.9495115035782307e-05,
+      "loss": 0.7181,
+      "step": 850
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.10875433120436596,
+      "learning_rate": 1.8509934458178712e-05,
+      "loss": 0.7221,
+      "step": 855
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.10419310033343766,
+      "learning_rate": 1.754775298816307e-05,
+      "loss": 0.7627,
+      "step": 860
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.11125643674385945,
+      "learning_rate": 1.6608842184176243e-05,
+      "loss": 0.783,
+      "step": 865
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.09805407303806633,
+      "learning_rate": 1.5693467036931576e-05,
+      "loss": 0.7754,
+      "step": 870
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.09382093887295272,
+      "learning_rate": 1.48018858946259e-05,
+      "loss": 0.7306,
+      "step": 875
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.08828049016039903,
+      "learning_rate": 1.3934350390025463e-05,
+      "loss": 0.7277,
+      "step": 880
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.10111049793555948,
+      "learning_rate": 1.3091105369447165e-05,
+      "loss": 0.7433,
+      "step": 885
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.09959284476126003,
+      "learning_rate": 1.22723888236549e-05,
+      "loss": 0.7608,
+      "step": 890
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.12000298215227106,
+      "learning_rate": 1.1478431820691083e-05,
+      "loss": 0.7249,
+      "step": 895
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.09846825711770221,
+      "learning_rate": 1.0709458440661801e-05,
+      "loss": 0.7474,
+      "step": 900
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.08826572755281627,
+      "learning_rate": 9.965685712494199e-06,
+      "loss": 0.7125,
+      "step": 905
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.12402329261776916,
+      "learning_rate": 9.247323552684051e-06,
+      "loss": 0.7685,
+      "step": 910
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.10462400264867344,
+      "learning_rate": 8.554574706050488e-06,
+      "loss": 0.7884,
+      "step": 915
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.11275389812311376,
+      "learning_rate": 7.887634688515e-06,
+      "loss": 0.7487,
+      "step": 920
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.11871268386808256,
+      "learning_rate": 7.246691731920485e-06,
+      "loss": 0.7607,
+      "step": 925
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.12345475324952107,
+      "learning_rate": 6.631926730906324e-06,
+      "loss": 0.7716,
+      "step": 930
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.10653833929745236,
+      "learning_rate": 6.043513191853978e-06,
+      "loss": 0.7465,
+      "step": 935
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.09013425987778603,
+      "learning_rate": 5.481617183918053e-06,
+      "loss": 0.7543,
+      "step": 940
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.09465293194517142,
+      "learning_rate": 4.946397292156158e-06,
+      "loss": 0.736,
+      "step": 945
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.10787353868301028,
+      "learning_rate": 4.438004572771182e-06,
+      "loss": 0.7284,
+      "step": 950
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.1429868308987827,
+      "learning_rate": 3.9565825104783685e-06,
+      "loss": 0.7907,
+      "step": 955
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.10249198050493027,
+      "learning_rate": 3.5022669780093497e-06,
+      "loss": 0.7203,
+      "step": 960
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.1028361192764587,
+      "learning_rate": 3.0751861977645125e-06,
+      "loss": 0.7284,
+      "step": 965
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.1278500182671187,
+      "learning_rate": 2.6754607056244883e-06,
+      "loss": 0.7447,
+      "step": 970
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.09199143538365835,
+      "learning_rate": 2.303203316931102e-06,
+      "loss": 0.7173,
+      "step": 975
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.10797760289216082,
+      "learning_rate": 1.9585190946472488e-06,
+      "loss": 0.7163,
+      "step": 980
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.09281102451549036,
+      "learning_rate": 1.6415053197047725e-06,
+      "loss": 0.7284,
+      "step": 985
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.11431066643386725,
+      "learning_rate": 1.3522514635486816e-06,
+      "loss": 0.7723,
+      "step": 990
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.09277614207193054,
+      "learning_rate": 1.0908391628854041e-06,
+      "loss": 0.7623,
+      "step": 995
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.0970512860952287,
+      "learning_rate": 8.57342196642319e-07,
+      "loss": 0.6736,
+      "step": 1000
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.10404789525698893,
+      "learning_rate": 6.518264651449779e-07,
+      "loss": 0.7771,
+      "step": 1005
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.09905350029610553,
+      "learning_rate": 4.743499715179067e-07,
+      "loss": 0.7495,
+      "step": 1010
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.08940324837180028,
+      "learning_rate": 3.249628053142884e-07,
+      "loss": 0.7587,
+      "step": 1015
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.10668411274319571,
+      "learning_rate": 2.0370712837906037e-07,
+      "loss": 0.7657,
+      "step": 1020
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.0933938060668371,
+      "learning_rate": 1.1061716294951118e-07,
+      "loss": 0.7493,
+      "step": 1025
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.09579015452060988,
+      "learning_rate": 4.5719181996650705e-08,
+      "loss": 0.7677,
+      "step": 1030
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.1038186774195005,
+      "learning_rate": 9.031501810174981e-09,
+      "loss": 0.7327,
+      "step": 1035
+    },
     {
       "epoch": 1.0,
       "eval_loss": NaN,
+      "eval_runtime": 3002.6712,
+      "eval_samples_per_second": 0.769,
+      "eval_steps_per_second": 0.192,
+      "step": 1039
     },
     {
       "epoch": 1.0,
+      "step": 1039,
+      "total_flos": 3.479777588753203e+16,
+      "train_loss": 0.7603742487735766,
+      "train_runtime": 32307.7024,
+      "train_samples_per_second": 1.029,
+      "train_steps_per_second": 0.032
     }
   ],
   "logging_steps": 5,
+  "max_steps": 1039,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
+  "total_flos": 3.479777588753203e+16,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null