Model save

Browse files

Files changed (4) hide show

README.md +69 -0
all_results.json +9 -0
train_results.json +9 -0
trainer_state.json +1513 -0

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+---
+license: llama3
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: meta-llama/Meta-Llama-3-8B
+datasets:
+- generator
+model-index:
+- name: downstream-7b-p0.1_seed42_rare
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# downstream-7b-p0.1_seed42_rare
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.0771
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 4
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 64
+- total_eval_batch_size: 4
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.0444        | 0.9993 | 1044 | 1.0771          |
+### Framework versions
+- PEFT 0.11.1
+- Transformers 4.43.4
+- Pytorch 2.3.1+cu121
+- Datasets 2.19.1
+- Tokenizers 0.19.1

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9992821249102656,
+    "total_flos": 2155604625850368.0,
+    "train_loss": 1.091635976486279,
+    "train_runtime": 24351.9745,
+    "train_samples": 114325,
+    "train_samples_per_second": 2.746,
+    "train_steps_per_second": 0.043
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9992821249102656,
+    "total_flos": 2155604625850368.0,
+    "train_loss": 1.091635976486279,
+    "train_runtime": 24351.9745,
+    "train_samples": 114325,
+    "train_samples_per_second": 2.746,
+    "train_steps_per_second": 0.043
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1513 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9992821249102656,
+  "eval_steps": 500,
+  "global_step": 1044,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.000957166786312515,
+      "grad_norm": 3.9142152723532337,
+      "learning_rate": 1.9047619047619051e-06,
+      "loss": 1.3978,
+      "step": 1
+    },
+    {
+      "epoch": 0.004785833931562575,
+      "grad_norm": 1.3430217420733046,
+      "learning_rate": 9.523809523809523e-06,
+      "loss": 1.3489,
+      "step": 5
+    },
+    {
+      "epoch": 0.00957166786312515,
+      "grad_norm": 0.5850408636793494,
+      "learning_rate": 1.9047619047619046e-05,
+      "loss": 1.2871,
+      "step": 10
+    },
+    {
+      "epoch": 0.014357501794687724,
+      "grad_norm": 0.46666716038967326,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 1.2106,
+      "step": 15
+    },
+    {
+      "epoch": 0.0191433357262503,
+      "grad_norm": 0.35044064248530404,
+      "learning_rate": 3.809523809523809e-05,
+      "loss": 1.189,
+      "step": 20
+    },
+    {
+      "epoch": 0.023929169657812874,
+      "grad_norm": 0.27361957875198517,
+      "learning_rate": 4.761904761904762e-05,
+      "loss": 1.1469,
+      "step": 25
+    },
+    {
+      "epoch": 0.028715003589375447,
+      "grad_norm": 0.2368453005937937,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 1.158,
+      "step": 30
+    },
+    {
+      "epoch": 0.03350083752093802,
+      "grad_norm": 0.2277332385016794,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 1.1437,
+      "step": 35
+    },
+    {
+      "epoch": 0.0382866714525006,
+      "grad_norm": 0.2265653549311157,
+      "learning_rate": 7.619047619047618e-05,
+      "loss": 1.1302,
+      "step": 40
+    },
+    {
+      "epoch": 0.043072505384063174,
+      "grad_norm": 0.22079711284915807,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 1.13,
+      "step": 45
+    },
+    {
+      "epoch": 0.04785833931562575,
+      "grad_norm": 0.20813516832540208,
+      "learning_rate": 9.523809523809524e-05,
+      "loss": 1.106,
+      "step": 50
+    },
+    {
+      "epoch": 0.05264417324718832,
+      "grad_norm": 0.2044131638757028,
+      "learning_rate": 0.00010476190476190477,
+      "loss": 1.1348,
+      "step": 55
+    },
+    {
+      "epoch": 0.057430007178750894,
+      "grad_norm": 0.20101729146508107,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 1.1018,
+      "step": 60
+    },
+    {
+      "epoch": 0.062215841110313475,
+      "grad_norm": 0.21865369935553125,
+      "learning_rate": 0.0001238095238095238,
+      "loss": 1.1129,
+      "step": 65
+    },
+    {
+      "epoch": 0.06700167504187604,
+      "grad_norm": 0.18405578482864565,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 1.1018,
+      "step": 70
+    },
+    {
+      "epoch": 0.07178750897343862,
+      "grad_norm": 0.18488079729650672,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.1417,
+      "step": 75
+    },
+    {
+      "epoch": 0.0765733429050012,
+      "grad_norm": 0.18433481759594844,
+      "learning_rate": 0.00015238095238095237,
+      "loss": 1.111,
+      "step": 80
+    },
+    {
+      "epoch": 0.08135917683656377,
+      "grad_norm": 0.20377971597879482,
+      "learning_rate": 0.00016190476190476192,
+      "loss": 1.0709,
+      "step": 85
+    },
+    {
+      "epoch": 0.08614501076812635,
+      "grad_norm": 0.20225554382239913,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 1.1142,
+      "step": 90
+    },
+    {
+      "epoch": 0.09093084469968891,
+      "grad_norm": 0.18520967333311886,
+      "learning_rate": 0.00018095238095238095,
+      "loss": 1.1142,
+      "step": 95
+    },
+    {
+      "epoch": 0.0957166786312515,
+      "grad_norm": 0.19606367225373053,
+      "learning_rate": 0.00019047619047619048,
+      "loss": 1.1049,
+      "step": 100
+    },
+    {
+      "epoch": 0.10050251256281408,
+      "grad_norm": 0.1867473714189168,
+      "learning_rate": 0.0002,
+      "loss": 1.0927,
+      "step": 105
+    },
+    {
+      "epoch": 0.10528834649437664,
+      "grad_norm": 0.185817071062854,
+      "learning_rate": 0.00019998600836567816,
+      "loss": 1.1206,
+      "step": 110
+    },
+    {
+      "epoch": 0.11007418042593922,
+      "grad_norm": 0.1762396939142846,
+      "learning_rate": 0.00019994403737802927,
+      "loss": 1.1022,
+      "step": 115
+    },
+    {
+      "epoch": 0.11486001435750179,
+      "grad_norm": 0.16668242539032083,
+      "learning_rate": 0.00019987409878190752,
+      "loss": 1.1052,
+      "step": 120
+    },
+    {
+      "epoch": 0.11964584828906437,
+      "grad_norm": 0.1764182848939352,
+      "learning_rate": 0.00019977621214841822,
+      "loss": 1.1059,
+      "step": 125
+    },
+    {
+      "epoch": 0.12443168222062695,
+      "grad_norm": 0.1787380695504439,
+      "learning_rate": 0.0001996504048694409,
+      "loss": 1.1102,
+      "step": 130
+    },
+    {
+      "epoch": 0.12921751615218952,
+      "grad_norm": 0.18152638872711746,
+      "learning_rate": 0.00019949671214996445,
+      "loss": 1.0986,
+      "step": 135
+    },
+    {
+      "epoch": 0.13400335008375208,
+      "grad_norm": 0.1745817045352934,
+      "learning_rate": 0.00019931517699823547,
+      "loss": 1.085,
+      "step": 140
+    },
+    {
+      "epoch": 0.13878918401531468,
+      "grad_norm": 0.17554030590687575,
+      "learning_rate": 0.0001991058502137231,
+      "loss": 1.1363,
+      "step": 145
+    },
+    {
+      "epoch": 0.14357501794687724,
+      "grad_norm": 0.1800405053641118,
+      "learning_rate": 0.00019886879037290384,
+      "loss": 1.0924,
+      "step": 150
+    },
+    {
+      "epoch": 0.1483608518784398,
+      "grad_norm": 0.19148632700424054,
+      "learning_rate": 0.0001986040638128698,
+      "loss": 1.0824,
+      "step": 155
+    },
+    {
+      "epoch": 0.1531466858100024,
+      "grad_norm": 0.17313730878343153,
+      "learning_rate": 0.0001983117446127654,
+      "loss": 1.1071,
+      "step": 160
+    },
+    {
+      "epoch": 0.15793251974156497,
+      "grad_norm": 0.16961371492797625,
+      "learning_rate": 0.00019799191457305768,
+      "loss": 1.1311,
+      "step": 165
+    },
+    {
+      "epoch": 0.16271835367312754,
+      "grad_norm": 0.17327733992734365,
+      "learning_rate": 0.00019764466319264595,
+      "loss": 1.1133,
+      "step": 170
+    },
+    {
+      "epoch": 0.16750418760469013,
+      "grad_norm": 0.17176717180251114,
+      "learning_rate": 0.00019727008764381675,
+      "loss": 1.1153,
+      "step": 175
+    },
+    {
+      "epoch": 0.1722900215362527,
+      "grad_norm": 0.17138393560502058,
+      "learning_rate": 0.0001968682927450523,
+      "loss": 1.1006,
+      "step": 180
+    },
+    {
+      "epoch": 0.17707585546781526,
+      "grad_norm": 0.16465106542109514,
+      "learning_rate": 0.00019643939093169844,
+      "loss": 1.104,
+      "step": 185
+    },
+    {
+      "epoch": 0.18186168939937783,
+      "grad_norm": 0.1699157446206454,
+      "learning_rate": 0.00019598350222450178,
+      "loss": 1.1167,
+      "step": 190
+    },
+    {
+      "epoch": 0.18664752333094042,
+      "grad_norm": 0.17602023031266878,
+      "learning_rate": 0.00019550075419602408,
+      "loss": 1.1131,
+      "step": 195
+    },
+    {
+      "epoch": 0.191433357262503,
+      "grad_norm": 0.18282225583073394,
+      "learning_rate": 0.00019499128193494297,
+      "loss": 1.0889,
+      "step": 200
+    },
+    {
+      "epoch": 0.19621919119406556,
+      "grad_norm": 0.16528930064048408,
+      "learning_rate": 0.0001944552280082499,
+      "loss": 1.1013,
+      "step": 205
+    },
+    {
+      "epoch": 0.20100502512562815,
+      "grad_norm": 0.16504045631379008,
+      "learning_rate": 0.0001938927424213553,
+      "loss": 1.1003,
+      "step": 210
+    },
+    {
+      "epoch": 0.20579085905719072,
+      "grad_norm": 0.16559761424705557,
+      "learning_rate": 0.000193303982576112,
+      "loss": 1.0998,
+      "step": 215
+    },
+    {
+      "epoch": 0.21057669298875328,
+      "grad_norm": 0.16791680669282769,
+      "learning_rate": 0.0001926891132267692,
+      "loss": 1.0919,
+      "step": 220
+    },
+    {
+      "epoch": 0.21536252692031588,
+      "grad_norm": 0.16689008275055853,
+      "learning_rate": 0.00019204830643386868,
+      "loss": 1.1069,
+      "step": 225
+    },
+    {
+      "epoch": 0.22014836085187844,
+      "grad_norm": 0.17305744880787605,
+      "learning_rate": 0.00019138174151609683,
+      "loss": 1.1272,
+      "step": 230
+    },
+    {
+      "epoch": 0.224934194783441,
+      "grad_norm": 0.1618964004882894,
+      "learning_rate": 0.00019068960500010523,
+      "loss": 1.0827,
+      "step": 235
+    },
+    {
+      "epoch": 0.22972002871500358,
+      "grad_norm": 0.16411871929076272,
+      "learning_rate": 0.00018997209056831462,
+      "loss": 1.1164,
+      "step": 240
+    },
+    {
+      "epoch": 0.23450586264656617,
+      "grad_norm": 0.1691454723246668,
+      "learning_rate": 0.0001892293990047159,
+      "loss": 1.1079,
+      "step": 245
+    },
+    {
+      "epoch": 0.23929169657812874,
+      "grad_norm": 0.1649265891086064,
+      "learning_rate": 0.00018846173813868454,
+      "loss": 1.0825,
+      "step": 250
+    },
+    {
+      "epoch": 0.2440775305096913,
+      "grad_norm": 0.17018378559137046,
+      "learning_rate": 0.000187669322786823,
+      "loss": 1.1216,
+      "step": 255
+    },
+    {
+      "epoch": 0.2488633644412539,
+      "grad_norm": 0.1703316481606123,
+      "learning_rate": 0.0001868523746928479,
+      "loss": 1.0783,
+      "step": 260
+    },
+    {
+      "epoch": 0.25364919837281646,
+      "grad_norm": 0.16735434292797727,
+      "learning_rate": 0.0001860111224655391,
+      "loss": 1.1149,
+      "step": 265
+    },
+    {
+      "epoch": 0.25843503230437903,
+      "grad_norm": 0.1555718882454273,
+      "learning_rate": 0.0001851458015147673,
+      "loss": 1.1075,
+      "step": 270
+    },
+    {
+      "epoch": 0.2632208662359416,
+      "grad_norm": 0.16977397286003804,
+      "learning_rate": 0.00018425665398561883,
+      "loss": 1.0852,
+      "step": 275
+    },
+    {
+      "epoch": 0.26800670016750416,
+      "grad_norm": 0.1663432558952086,
+      "learning_rate": 0.00018334392869063536,
+      "loss": 1.0811,
+      "step": 280
+    },
+    {
+      "epoch": 0.2727925340990668,
+      "grad_norm": 0.1666368907912767,
+      "learning_rate": 0.00018240788104018822,
+      "loss": 1.1014,
+      "step": 285
+    },
+    {
+      "epoch": 0.27757836803062935,
+      "grad_norm": 0.15687924477714424,
+      "learning_rate": 0.00018144877297100606,
+      "loss": 1.0736,
+      "step": 290
+    },
+    {
+      "epoch": 0.2823642019621919,
+      "grad_norm": 0.16061101558707236,
+      "learning_rate": 0.0001804668728728764,
+      "loss": 1.0931,
+      "step": 295
+    },
+    {
+      "epoch": 0.2871500358937545,
+      "grad_norm": 0.160793101364396,
+      "learning_rate": 0.00017946245551354157,
+      "loss": 1.0999,
+      "step": 300
+    },
+    {
+      "epoch": 0.29193586982531705,
+      "grad_norm": 0.1633263725476254,
+      "learning_rate": 0.00017843580196180952,
+      "loss": 1.0948,
+      "step": 305
+    },
+    {
+      "epoch": 0.2967217037568796,
+      "grad_norm": 0.1631452387362867,
+      "learning_rate": 0.00017738719950890168,
+      "loss": 1.1013,
+      "step": 310
+    },
+    {
+      "epoch": 0.3015075376884422,
+      "grad_norm": 0.16210535171429089,
+      "learning_rate": 0.00017631694158805946,
+      "loss": 1.0798,
+      "step": 315
+    },
+    {
+      "epoch": 0.3062933716200048,
+      "grad_norm": 0.16229934636841442,
+      "learning_rate": 0.000175225327692432,
+      "loss": 1.0575,
+      "step": 320
+    },
+    {
+      "epoch": 0.3110792055515674,
+      "grad_norm": 0.16588195121854765,
+      "learning_rate": 0.00017411266329126824,
+      "loss": 1.096,
+      "step": 325
+    },
+    {
+      "epoch": 0.31586503948312994,
+      "grad_norm": 0.158210200244668,
+      "learning_rate": 0.00017297925974443673,
+      "loss": 1.1071,
+      "step": 330
+    },
+    {
+      "epoch": 0.3206508734146925,
+      "grad_norm": 0.1663784778299078,
+      "learning_rate": 0.00017182543421529676,
+      "loss": 1.0739,
+      "step": 335
+    },
+    {
+      "epoch": 0.32543670734625507,
+      "grad_norm": 0.15384028200238806,
+      "learning_rate": 0.00017065150958194586,
+      "loss": 1.0848,
+      "step": 340
+    },
+    {
+      "epoch": 0.33022254127781764,
+      "grad_norm": 0.16073214574887584,
+      "learning_rate": 0.00016945781434686783,
+      "loss": 1.1157,
+      "step": 345
+    },
+    {
+      "epoch": 0.33500837520938026,
+      "grad_norm": 0.1745939193140414,
+      "learning_rate": 0.00016824468254500704,
+      "loss": 1.0815,
+      "step": 350
+    },
+    {
+      "epoch": 0.3397942091409428,
+      "grad_norm": 0.15802897019970708,
+      "learning_rate": 0.0001670124536502947,
+      "loss": 1.0779,
+      "step": 355
+    },
+    {
+      "epoch": 0.3445800430725054,
+      "grad_norm": 0.1579431494377225,
+      "learning_rate": 0.00016576147248065267,
+      "loss": 1.1031,
+      "step": 360
+    },
+    {
+      "epoch": 0.34936587700406796,
+      "grad_norm": 0.16455288633589932,
+      "learning_rate": 0.00016449208910150232,
+      "loss": 1.1207,
+      "step": 365
+    },
+    {
+      "epoch": 0.3541517109356305,
+      "grad_norm": 0.15512720174775488,
+      "learning_rate": 0.00016320465872780477,
+      "loss": 1.0843,
+      "step": 370
+    },
+    {
+      "epoch": 0.3589375448671931,
+      "grad_norm": 0.15810739086397552,
+      "learning_rate": 0.00016189954162466012,
+      "loss": 1.0674,
+      "step": 375
+    },
+    {
+      "epoch": 0.36372337879875566,
+      "grad_norm": 0.15539897008538223,
+      "learning_rate": 0.0001605771030064934,
+      "loss": 1.1075,
+      "step": 380
+    },
+    {
+      "epoch": 0.3685092127303183,
+      "grad_norm": 0.16059302879871643,
+      "learning_rate": 0.00015923771293485585,
+      "loss": 1.1083,
+      "step": 385
+    },
+    {
+      "epoch": 0.37329504666188085,
+      "grad_norm": 0.1726863039386017,
+      "learning_rate": 0.00015788174621486934,
+      "loss": 1.0839,
+      "step": 390
+    },
+    {
+      "epoch": 0.3780808805934434,
+      "grad_norm": 0.160896911699282,
+      "learning_rate": 0.00015650958229034391,
+      "loss": 1.093,
+      "step": 395
+    },
+    {
+      "epoch": 0.382866714525006,
+      "grad_norm": 0.1539033105501165,
+      "learning_rate": 0.00015512160513759672,
+      "loss": 1.0824,
+      "step": 400
+    },
+    {
+      "epoch": 0.38765254845656855,
+      "grad_norm": 0.15253934847352404,
+      "learning_rate": 0.00015371820315800315,
+      "loss": 1.0611,
+      "step": 405
+    },
+    {
+      "epoch": 0.3924383823881311,
+      "grad_norm": 0.1549203336671571,
+      "learning_rate": 0.00015229976906930935,
+      "loss": 1.0926,
+      "step": 410
+    },
+    {
+      "epoch": 0.3972242163196937,
+      "grad_norm": 0.15736586699846142,
+      "learning_rate": 0.0001508666997957369,
+      "loss": 1.0838,
+      "step": 415
+    },
+    {
+      "epoch": 0.4020100502512563,
+      "grad_norm": 0.15414651629074486,
+      "learning_rate": 0.00014941939635691035,
+      "loss": 1.0962,
+      "step": 420
+    },
+    {
+      "epoch": 0.40679588418281887,
+      "grad_norm": 0.15216014768555902,
+      "learning_rate": 0.00014795826375563925,
+      "loss": 1.0837,
+      "step": 425
+    },
+    {
+      "epoch": 0.41158171811438143,
+      "grad_norm": 0.1551252486012846,
+      "learning_rate": 0.0001464837108645845,
+      "loss": 1.096,
+      "step": 430
+    },
+    {
+      "epoch": 0.416367552045944,
+      "grad_norm": 0.15880410911617168,
+      "learning_rate": 0.00014499615031184296,
+      "loss": 1.0947,
+      "step": 435
+    },
+    {
+      "epoch": 0.42115338597750657,
+      "grad_norm": 0.16084656769756484,
+      "learning_rate": 0.00014349599836548034,
+      "loss": 1.0955,
+      "step": 440
+    },
+    {
+      "epoch": 0.42593921990906913,
+      "grad_norm": 0.14942909791958908,
+      "learning_rate": 0.0001419836748170459,
+      "loss": 1.0911,
+      "step": 445
+    },
+    {
+      "epoch": 0.43072505384063176,
+      "grad_norm": 0.16134597273400678,
+      "learning_rate": 0.0001404596028641009,
+      "loss": 1.1136,
+      "step": 450
+    },
+    {
+      "epoch": 0.4355108877721943,
+      "grad_norm": 0.15552785776756606,
+      "learning_rate": 0.0001389242089917943,
+      "loss": 1.1005,
+      "step": 455
+    },
+    {
+      "epoch": 0.4402967217037569,
+      "grad_norm": 0.1544583591443468,
+      "learning_rate": 0.00013737792285351805,
+      "loss": 1.0896,
+      "step": 460
+    },
+    {
+      "epoch": 0.44508255563531945,
+      "grad_norm": 0.15743294110434283,
+      "learning_rate": 0.0001358211771506763,
+      "loss": 1.0687,
+      "step": 465
+    },
+    {
+      "epoch": 0.449868389566882,
+      "grad_norm": 0.15489693015617015,
+      "learning_rate": 0.00013425440751160112,
+      "loss": 1.0909,
+      "step": 470
+    },
+    {
+      "epoch": 0.4546542234984446,
+      "grad_norm": 0.1556280787651109,
+      "learning_rate": 0.00013267805236964967,
+      "loss": 1.1008,
+      "step": 475
+    },
+    {
+      "epoch": 0.45944005743000715,
+      "grad_norm": 0.16139496091159036,
+      "learning_rate": 0.00013109255284051615,
+      "loss": 1.1167,
+      "step": 480
+    },
+    {
+      "epoch": 0.4642258913615698,
+      "grad_norm": 0.15380326887200926,
+      "learning_rate": 0.00012949835259879304,
+      "loss": 1.1021,
+      "step": 485
+    },
+    {
+      "epoch": 0.46901172529313234,
+      "grad_norm": 0.1504710821626308,
+      "learning_rate": 0.00012789589775381676,
+      "loss": 1.0824,
+      "step": 490
+    },
+    {
+      "epoch": 0.4737975592246949,
+      "grad_norm": 0.16882632755621252,
+      "learning_rate": 0.00012628563672483146,
+      "loss": 1.091,
+      "step": 495
+    },
+    {
+      "epoch": 0.4785833931562575,
+      "grad_norm": 0.16236683430294702,
+      "learning_rate": 0.0001246680201155068,
+      "loss": 1.0609,
+      "step": 500
+    },
+    {
+      "epoch": 0.48336922708782004,
+      "grad_norm": 0.1534881294655078,
+      "learning_rate": 0.00012304350058784405,
+      "loss": 1.0611,
+      "step": 505
+    },
+    {
+      "epoch": 0.4881550610193826,
+      "grad_norm": 0.16620841316700394,
+      "learning_rate": 0.00012141253273550696,
+      "loss": 1.0932,
+      "step": 510
+    },
+    {
+      "epoch": 0.49294089495094523,
+      "grad_norm": 0.16942714030828704,
+      "learning_rate": 0.00011977557295661108,
+      "loss": 1.0856,
+      "step": 515
+    },
+    {
+      "epoch": 0.4977267288825078,
+      "grad_norm": 0.15500201031703087,
+      "learning_rate": 0.00011813307932600887,
+      "loss": 1.0852,
+      "step": 520
+    },
+    {
+      "epoch": 0.5025125628140703,
+      "grad_norm": 0.15248801968172002,
+      "learning_rate": 0.00011648551146710556,
+      "loss": 1.1069,
+      "step": 525
+    },
+    {
+      "epoch": 0.5072983967456329,
+      "grad_norm": 0.14978453385390675,
+      "learning_rate": 0.0001148333304232411,
+      "loss": 1.088,
+      "step": 530
+    },
+    {
+      "epoch": 0.5120842306771956,
+      "grad_norm": 0.14736066147246124,
+      "learning_rate": 0.00011317699852867548,
+      "loss": 1.0506,
+      "step": 535
+    },
+    {
+      "epoch": 0.5168700646087581,
+      "grad_norm": 0.15088998664120562,
+      "learning_rate": 0.0001115169792792124,
+      "loss": 1.0972,
+      "step": 540
+    },
+    {
+      "epoch": 0.5216558985403207,
+      "grad_norm": 0.14676026138747209,
+      "learning_rate": 0.00010985373720249801,
+      "loss": 1.0871,
+      "step": 545
+    },
+    {
+      "epoch": 0.5264417324718832,
+      "grad_norm": 0.17054822297185676,
+      "learning_rate": 0.00010818773772803082,
+      "loss": 1.0957,
+      "step": 550
+    },
+    {
+      "epoch": 0.5312275664034458,
+      "grad_norm": 0.15081743477470166,
+      "learning_rate": 0.0001065194470569193,
+      "loss": 1.1114,
+      "step": 555
+    },
+    {
+      "epoch": 0.5360134003350083,
+      "grad_norm": 0.1556600989117304,
+      "learning_rate": 0.0001048493320314238,
+      "loss": 1.0747,
+      "step": 560
+    },
+    {
+      "epoch": 0.540799234266571,
+      "grad_norm": 0.15346464585086714,
+      "learning_rate": 0.00010317786000431851,
+      "loss": 1.0761,
+      "step": 565
+    },
+    {
+      "epoch": 0.5455850681981336,
+      "grad_norm": 0.15178562379014646,
+      "learning_rate": 0.00010150549870811107,
+      "loss": 1.0839,
+      "step": 570
+    },
+    {
+      "epoch": 0.5503709021296961,
+      "grad_norm": 0.15263581024104103,
+      "learning_rate": 9.983271612415575e-05,
+      "loss": 1.0742,
+      "step": 575
+    },
+    {
+      "epoch": 0.5551567360612587,
+      "grad_norm": 0.15166582071053056,
+      "learning_rate": 9.81599803516968e-05,
+      "loss": 1.0725,
+      "step": 580
+    },
+    {
+      "epoch": 0.5599425699928212,
+      "grad_norm": 0.14735687803417952,
+      "learning_rate": 9.648775947687912e-05,
+      "loss": 1.0705,
+      "step": 585
+    },
+    {
+      "epoch": 0.5647284039243838,
+      "grad_norm": 0.14825818203221888,
+      "learning_rate": 9.48165214417624e-05,
+      "loss": 1.0871,
+      "step": 590
+    },
+    {
+      "epoch": 0.5695142378559463,
+      "grad_norm": 0.15700946642781993,
+      "learning_rate": 9.314673391337576e-05,
+      "loss": 1.0979,
+      "step": 595
+    },
+    {
+      "epoch": 0.574300071787509,
+      "grad_norm": 0.15580031067347558,
+      "learning_rate": 9.147886415284903e-05,
+      "loss": 1.0592,
+      "step": 600
+    },
+    {
+      "epoch": 0.5790859057190716,
+      "grad_norm": 0.14548002556094225,
+      "learning_rate": 8.981337888465788e-05,
+      "loss": 1.0787,
+      "step": 605
+    },
+    {
+      "epoch": 0.5838717396506341,
+      "grad_norm": 0.14237124600928142,
+      "learning_rate": 8.815074416601913e-05,
+      "loss": 1.0698,
+      "step": 610
+    },
+    {
+      "epoch": 0.5886575735821967,
+      "grad_norm": 0.15304745525626437,
+      "learning_rate": 8.649142525647272e-05,
+      "loss": 1.0848,
+      "step": 615
+    },
+    {
+      "epoch": 0.5934434075137592,
+      "grad_norm": 0.14513336716190856,
+      "learning_rate": 8.48358864876867e-05,
+      "loss": 1.0462,
+      "step": 620
+    },
+    {
+      "epoch": 0.5982292414453219,
+      "grad_norm": 0.1468415945819683,
+      "learning_rate": 8.318459113352221e-05,
+      "loss": 1.0906,
+      "step": 625
+    },
+    {
+      "epoch": 0.6030150753768844,
+      "grad_norm": 0.14408143553897426,
+      "learning_rate": 8.153800128039441e-05,
+      "loss": 1.085,
+      "step": 630
+    },
+    {
+      "epoch": 0.607800909308447,
+      "grad_norm": 0.15046217184291616,
+      "learning_rate": 7.989657769796533e-05,
+      "loss": 1.0882,
+      "step": 635
+    },
+    {
+      "epoch": 0.6125867432400096,
+      "grad_norm": 0.14348283659906289,
+      "learning_rate": 7.82607797102056e-05,
+      "loss": 1.0861,
+      "step": 640
+    },
+    {
+      "epoch": 0.6173725771715721,
+      "grad_norm": 0.14685503152106738,
+      "learning_rate": 7.663106506686057e-05,
+      "loss": 1.1003,
+      "step": 645
+    },
+    {
+      "epoch": 0.6221584111031347,
+      "grad_norm": 0.1480277391784376,
+      "learning_rate": 7.500788981535708e-05,
+      "loss": 1.0758,
+      "step": 650
+    },
+    {
+      "epoch": 0.6269442450346973,
+      "grad_norm": 0.1477910922274185,
+      "learning_rate": 7.339170817318625e-05,
+      "loss": 1.0695,
+      "step": 655
+    },
+    {
+      "epoch": 0.6317300789662599,
+      "grad_norm": 0.1551465349289344,
+      "learning_rate": 7.178297240079882e-05,
+      "loss": 1.0942,
+      "step": 660
+    },
+    {
+      "epoch": 0.6365159128978225,
+      "grad_norm": 0.148811465121087,
+      "learning_rate": 7.018213267504775e-05,
+      "loss": 1.0825,
+      "step": 665
+    },
+    {
+      "epoch": 0.641301746829385,
+      "grad_norm": 0.146937156337137,
+      "learning_rate": 6.858963696321403e-05,
+      "loss": 1.0985,
+      "step": 670
+    },
+    {
+      "epoch": 0.6460875807609476,
+      "grad_norm": 0.14703161191479286,
+      "learning_rate": 6.700593089765086e-05,
+      "loss": 1.06,
+      "step": 675
+    },
+    {
+      "epoch": 0.6508734146925101,
+      "grad_norm": 0.14564360148371303,
+      "learning_rate": 6.543145765108106e-05,
+      "loss": 1.0853,
+      "step": 680
+    },
+    {
+      "epoch": 0.6556592486240728,
+      "grad_norm": 0.14887365645849163,
+      "learning_rate": 6.3866657812583e-05,
+      "loss": 1.0787,
+      "step": 685
+    },
+    {
+      "epoch": 0.6604450825556353,
+      "grad_norm": 0.14533659914404762,
+      "learning_rate": 6.231196926429913e-05,
+      "loss": 1.073,
+      "step": 690
+    },
+    {
+      "epoch": 0.6652309164871979,
+      "grad_norm": 0.2354314895944445,
+      "learning_rate": 6.076782705890257e-05,
+      "loss": 1.0815,
+      "step": 695
+    },
+    {
+      "epoch": 0.6700167504187605,
+      "grad_norm": 0.14132233475416703,
+      "learning_rate": 5.9234663297854876e-05,
+      "loss": 1.0555,
+      "step": 700
+    },
+    {
+      "epoch": 0.674802584350323,
+      "grad_norm": 0.14913316600220797,
+      "learning_rate": 5.7712907010490036e-05,
+      "loss": 1.0785,
+      "step": 705
+    },
+    {
+      "epoch": 0.6795884182818857,
+      "grad_norm": 0.15328072297180578,
+      "learning_rate": 5.620298403395805e-05,
+      "loss": 1.0857,
+      "step": 710
+    },
+    {
+      "epoch": 0.6843742522134482,
+      "grad_norm": 0.17603388258774993,
+      "learning_rate": 5.4705316894061765e-05,
+      "loss": 1.0898,
+      "step": 715
+    },
+    {
+      "epoch": 0.6891600861450108,
+      "grad_norm": 0.1448443355064005,
+      "learning_rate": 5.322032468702036e-05,
+      "loss": 1.0714,
+      "step": 720
+    },
+    {
+      "epoch": 0.6939459200765733,
+      "grad_norm": 0.4624474555190123,
+      "learning_rate": 5.1748422962192376e-05,
+      "loss": 1.0994,
+      "step": 725
+    },
+    {
+      "epoch": 0.6987317540081359,
+      "grad_norm": 0.14868980834848183,
+      "learning_rate": 5.0290023605791666e-05,
+      "loss": 1.0725,
+      "step": 730
+    },
+    {
+      "epoch": 0.7035175879396985,
+      "grad_norm": 0.15278504704361137,
+      "learning_rate": 4.8845534725628086e-05,
+      "loss": 1.0962,
+      "step": 735
+    },
+    {
+      "epoch": 0.708303421871261,
+      "grad_norm": 0.14605679246576617,
+      "learning_rate": 4.741536053690552e-05,
+      "loss": 1.0947,
+      "step": 740
+    },
+    {
+      "epoch": 0.7130892558028237,
+      "grad_norm": 0.172204603811799,
+      "learning_rate": 4.599990124910918e-05,
+      "loss": 1.0758,
+      "step": 745
+    },
+    {
+      "epoch": 0.7178750897343862,
+      "grad_norm": 0.14357849865669614,
+      "learning_rate": 4.4599552954014145e-05,
+      "loss": 1.0682,
+      "step": 750
+    },
+    {
+      "epoch": 0.7226609236659488,
+      "grad_norm": 0.14980923833672957,
+      "learning_rate": 4.32147075148458e-05,
+      "loss": 1.0814,
+      "step": 755
+    },
+    {
+      "epoch": 0.7274467575975113,
+      "grad_norm": 0.16395768222951593,
+      "learning_rate": 4.1845752456623665e-05,
+      "loss": 1.0583,
+      "step": 760
+    },
+    {
+      "epoch": 0.7322325915290739,
+      "grad_norm": 0.14059821304657993,
+      "learning_rate": 4.049307085771931e-05,
+      "loss": 1.0839,
+      "step": 765
+    },
+    {
+      "epoch": 0.7370184254606366,
+      "grad_norm": 0.1472110334031576,
+      "learning_rate": 3.9157041242658477e-05,
+      "loss": 1.1079,
+      "step": 770
+    },
+    {
+      "epoch": 0.7418042593921991,
+      "grad_norm": 0.14020342123522012,
+      "learning_rate": 3.783803747619741e-05,
+      "loss": 1.0829,
+      "step": 775
+    },
+    {
+      "epoch": 0.7465900933237617,
+      "grad_norm": 0.17437047699695307,
+      "learning_rate": 3.653642865870359e-05,
+      "loss": 1.0808,
+      "step": 780
+    },
+    {
+      "epoch": 0.7513759272553242,
+      "grad_norm": 0.14320013892049976,
+      "learning_rate": 3.525257902286908e-05,
+      "loss": 1.0608,
+      "step": 785
+    },
+    {
+      "epoch": 0.7561617611868868,
+      "grad_norm": 0.14437417000631428,
+      "learning_rate": 3.398684783178648e-05,
+      "loss": 1.0618,
+      "step": 790
+    },
+    {
+      "epoch": 0.7609475951184493,
+      "grad_norm": 0.14321363672597254,
+      "learning_rate": 3.273958927841525e-05,
+      "loss": 1.0659,
+      "step": 795
+    },
+    {
+      "epoch": 0.765733429050012,
+      "grad_norm": 0.14121990349576288,
+      "learning_rate": 3.1511152386467055e-05,
+      "loss": 1.0936,
+      "step": 800
+    },
+    {
+      "epoch": 0.7705192629815746,
+      "grad_norm": 0.16146069783583863,
+      "learning_rate": 3.0301880912737568e-05,
+      "loss": 1.0647,
+      "step": 805
+    },
+    {
+      "epoch": 0.7753050969131371,
+      "grad_norm": 0.1447026626027737,
+      "learning_rate": 2.9112113250911844e-05,
+      "loss": 1.0747,
+      "step": 810
+    },
+    {
+      "epoch": 0.7800909308446997,
+      "grad_norm": 0.14724228311552523,
+      "learning_rate": 2.7942182336870925e-05,
+      "loss": 1.1046,
+      "step": 815
+    },
+    {
+      "epoch": 0.7848767647762622,
+      "grad_norm": 0.14612792897080507,
+      "learning_rate": 2.6792415555525463e-05,
+      "loss": 1.0391,
+      "step": 820
+    },
+    {
+      "epoch": 0.7896625987078248,
+      "grad_norm": 0.14445016139434405,
+      "learning_rate": 2.5663134649202647e-05,
+      "loss": 1.0808,
+      "step": 825
+    },
+    {
+      "epoch": 0.7944484326393874,
+      "grad_norm": 0.14283033243615206,
+      "learning_rate": 2.4554655627612245e-05,
+      "loss": 1.0767,
+      "step": 830
+    },
+    {
+      "epoch": 0.79923426657095,
+      "grad_norm": 0.1428104588189023,
+      "learning_rate": 2.34672886794167e-05,
+      "loss": 1.0884,
+      "step": 835
+    },
+    {
+      "epoch": 0.8040201005025126,
+      "grad_norm": 0.14106416222944104,
+      "learning_rate": 2.2401338085430323e-05,
+      "loss": 1.0891,
+      "step": 840
+    },
+    {
+      "epoch": 0.8088059344340751,
+      "grad_norm": 0.14453431354715718,
+      "learning_rate": 2.135710213347134e-05,
+      "loss": 1.0829,
+      "step": 845
+    },
+    {
+      "epoch": 0.8135917683656377,
+      "grad_norm": 0.1436138017414945,
+      "learning_rate": 2.0334873034891554e-05,
+      "loss": 1.0823,
+      "step": 850
+    },
+    {
+      "epoch": 0.8183776022972002,
+      "grad_norm": 0.14415504753616376,
+      "learning_rate": 1.933493684280574e-05,
+      "loss": 1.0749,
+      "step": 855
+    },
+    {
+      "epoch": 0.8231634362287629,
+      "grad_norm": 0.14188286670890893,
+      "learning_rate": 1.8357573372044834e-05,
+      "loss": 1.0775,
+      "step": 860
+    },
+    {
+      "epoch": 0.8279492701603255,
+      "grad_norm": 0.14043422592547342,
+      "learning_rate": 1.740305612085439e-05,
+      "loss": 1.0852,
+      "step": 865
+    },
+    {
+      "epoch": 0.832735104091888,
+      "grad_norm": 0.14014109535516273,
+      "learning_rate": 1.647165219436113e-05,
+      "loss": 1.0716,
+      "step": 870
+    },
+    {
+      "epoch": 0.8375209380234506,
+      "grad_norm": 0.18266681120520475,
+      "learning_rate": 1.556362222982799e-05,
+      "loss": 1.0711,
+      "step": 875
+    },
+    {
+      "epoch": 0.8423067719550131,
+      "grad_norm": 0.14585487433506303,
+      "learning_rate": 1.4679220323719234e-05,
+      "loss": 1.0561,
+      "step": 880
+    },
+    {
+      "epoch": 0.8470926058865758,
+      "grad_norm": 0.13911103035630754,
+      "learning_rate": 1.3818693960596185e-05,
+      "loss": 1.0707,
+      "step": 885
+    },
+    {
+      "epoch": 0.8518784398181383,
+      "grad_norm": 0.15612123605821987,
+      "learning_rate": 1.2982283943862738e-05,
+      "loss": 1.0494,
+      "step": 890
+    },
+    {
+      "epoch": 0.8566642737497009,
+      "grad_norm": 0.14067555622023134,
+      "learning_rate": 1.217022432838093e-05,
+      "loss": 1.0686,
+      "step": 895
+    },
+    {
+      "epoch": 0.8614501076812635,
+      "grad_norm": 0.1457410414761679,
+      "learning_rate": 1.1382742354974429e-05,
+      "loss": 1.0562,
+      "step": 900
+    },
+    {
+      "epoch": 0.866235941612826,
+      "grad_norm": 0.1398250627278749,
+      "learning_rate": 1.0620058386839393e-05,
+      "loss": 1.0753,
+      "step": 905
+    },
+    {
+      "epoch": 0.8710217755443886,
+      "grad_norm": 0.14690238478434312,
+      "learning_rate": 9.882385847879539e-06,
+      "loss": 1.0539,
+      "step": 910
+    },
+    {
+      "epoch": 0.8758076094759512,
+      "grad_norm": 0.14224902345010998,
+      "learning_rate": 9.169931162983137e-06,
+      "loss": 1.0575,
+      "step": 915
+    },
+    {
+      "epoch": 0.8805934434075138,
+      "grad_norm": 0.14002967562121116,
+      "learning_rate": 8.482893700258643e-06,
+      "loss": 1.0831,
+      "step": 920
+    },
+    {
+      "epoch": 0.8853792773390763,
+      "grad_norm": 0.14652920530592364,
+      "learning_rate": 7.821465715244947e-06,
+      "loss": 1.0844,
+      "step": 925
+    },
+    {
+      "epoch": 0.8901651112706389,
+      "grad_norm": 0.13985808750925746,
+      "learning_rate": 7.185832297111938e-06,
+      "loss": 1.0618,
+      "step": 930
+    },
+    {
+      "epoch": 0.8949509452022015,
+      "grad_norm": 0.15160308510490375,
+      "learning_rate": 6.576171316866608e-06,
+      "loss": 1.0773,
+      "step": 935
+    },
+    {
+      "epoch": 0.899736779133764,
+      "grad_norm": 0.14784429409642344,
+      "learning_rate": 5.9926533775789055e-06,
+      "loss": 1.0951,
+      "step": 940
+    },
+    {
+      "epoch": 0.9045226130653267,
+      "grad_norm": 0.14167088318411009,
+      "learning_rate": 5.435441766641369e-06,
+      "loss": 1.0841,
+      "step": 945
+    },
+    {
+      "epoch": 0.9093084469968892,
+      "grad_norm": 0.14256818695069146,
+      "learning_rate": 4.904692410075973e-06,
+      "loss": 1.0647,
+      "step": 950
+    },
+    {
+      "epoch": 0.9140942809284518,
+      "grad_norm": 0.15531748633710526,
+      "learning_rate": 4.400553828900989e-06,
+      "loss": 1.0757,
+      "step": 955
+    },
+    {
+      "epoch": 0.9188801148600143,
+      "grad_norm": 0.14420681549864126,
+      "learning_rate": 3.923167097569935e-06,
+      "loss": 1.0903,
+      "step": 960
+    },
+    {
+      "epoch": 0.9236659487915769,
+      "grad_norm": 0.14398010788396462,
+      "learning_rate": 3.4726658044943126e-06,
+      "loss": 1.0668,
+      "step": 965
+    },
+    {
+      "epoch": 0.9284517827231396,
+      "grad_norm": 0.14589900176146645,
+      "learning_rate": 3.0491760146611926e-06,
+      "loss": 1.0845,
+      "step": 970
+    },
+    {
+      "epoch": 0.9332376166547021,
+      "grad_norm": 0.13882750982702796,
+      "learning_rate": 2.652816234356159e-06,
+      "loss": 1.0382,
+      "step": 975
+    },
+    {
+      "epoch": 0.9380234505862647,
+      "grad_norm": 0.14112035905216325,
+      "learning_rate": 2.283697378001315e-06,
+      "loss": 1.0825,
+      "step": 980
+    },
+    {
+      "epoch": 0.9428092845178272,
+      "grad_norm": 0.13934480624047157,
+      "learning_rate": 1.9419227371178627e-06,
+      "loss": 1.0679,
+      "step": 985
+    },
+    {
+      "epoch": 0.9475951184493898,
+      "grad_norm": 0.14117739445269173,
+      "learning_rate": 1.6275879514217052e-06,
+      "loss": 1.0772,
+      "step": 990
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 0.14031209854381504,
+      "learning_rate": 1.3407809820603856e-06,
+      "loss": 1.0767,
+      "step": 995
+    },
+    {
+      "epoch": 0.957166786312515,
+      "grad_norm": 0.14091355128035063,
+      "learning_rate": 1.0815820869985893e-06,
+      "loss": 1.0635,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9619526202440776,
+      "grad_norm": 0.14100298765660577,
+      "learning_rate": 8.50063798559475e-07,
+      "loss": 1.0861,
+      "step": 1005
+    },
+    {
+      "epoch": 0.9667384541756401,
+      "grad_norm": 0.1412224316948679,
+      "learning_rate": 6.462909031276443e-07,
+      "loss": 1.0633,
+      "step": 1010
+    },
+    {
+      "epoch": 0.9715242881072027,
+      "grad_norm": 0.1385906964183353,
+      "learning_rate": 4.7032042301985434e-07,
+      "loss": 1.0726,
+      "step": 1015
+    },
+    {
+      "epoch": 0.9763101220387652,
+      "grad_norm": 0.151976410727331,
+      "learning_rate": 3.222016005282824e-07,
+      "loss": 1.0645,
+      "step": 1020
+    },
+    {
+      "epoch": 0.9810959559703278,
+      "grad_norm": 0.14304003914264313,
+      "learning_rate": 2.0197588414094804e-07,
+      "loss": 1.0785,
+      "step": 1025
+    },
+    {
+      "epoch": 0.9858817899018905,
+      "grad_norm": 0.1395525833943131,
+      "learning_rate": 1.0967691694302451e-07,
+      "loss": 1.0582,
+      "step": 1030
+    },
+    {
+      "epoch": 0.990667623833453,
+      "grad_norm": 0.14208441339069042,
+      "learning_rate": 4.5330527202480654e-08,
+      "loss": 1.0763,
+      "step": 1035
+    },
+    {
+      "epoch": 0.9954534577650156,
+      "grad_norm": 0.13931375530799342,
+      "learning_rate": 8.95472114241791e-09,
+      "loss": 1.0444,
+      "step": 1040
+    },
+    {
+      "epoch": 0.9992821249102656,
+      "eval_loss": 1.077100157737732,
+      "eval_runtime": 3923.6787,
+      "eval_samples_per_second": 3.43,
+      "eval_steps_per_second": 0.858,
+      "step": 1044
+    },
+    {
+      "epoch": 0.9992821249102656,
+      "step": 1044,
+      "total_flos": 2155604625850368.0,
+      "train_loss": 1.091635976486279,
+      "train_runtime": 24351.9745,
+      "train_samples_per_second": 2.746,
+      "train_steps_per_second": 0.043
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1044,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2155604625850368.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}