Locutusque committed 17 days ago

Commit

8a86c42

verified ·

1 Parent(s): 1958717

Upload folder using huggingface_hub

Browse files

Files changed (26) hide show

.gitattributes +1 -0
chat_template.jinja +4 -0
config.json +26 -0
generation_config.json +7 -0
optimizer.bin +3 -0
pytorch_model-00001-of-00005.bin +3 -0
pytorch_model-00002-of-00005.bin +3 -0
pytorch_model-00003-of-00005.bin +3 -0
pytorch_model-00004-of-00005.bin +3 -0
pytorch_model-00005-of-00005.bin +3 -0
pytorch_model.bin.index.json +371 -0
pytorch_model_fsdp.bin +3 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
rng_state_2.pth +3 -0
rng_state_3.pth +3 -0
rng_state_4.pth +3 -0
rng_state_5.pth +3 -0
rng_state_6.pth +3 -0
rng_state_7.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +30 -0
tokenizer.json +3 -0
tokenizer_config.json +0 -0
trainer_state.json +755 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "architectures": [
+    "FSDPMistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.55.3",
+  "use_cache": false,
+  "vocab_size": 131074
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "transformers_version": "4.55.3"
+}

optimizer.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:398bd8ea94c8228b85fee1356702734430a4a15b25822c75f9109b6eaf061452
+size 48991538020

pytorch_model-00001-of-00005.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3880743a863323ba26c3bc841df4d166ffdb13012c395ba110f5aeab3d610ff9
+size 4865557157

pytorch_model-00002-of-00005.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1332590fd05e118295c926aa4eb6180278ce402fd67c25d900483f71b5babcbb
+size 4907548255

pytorch_model-00003-of-00005.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:164611b4b0e21b7e4ad3903d4c93b50f3fbdbe59313e302c2879f36708431a7a
+size 4907548319

pytorch_model-00004-of-00005.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df373bf5bc038a596ef55969f4899360237873a363ee5cbf71f07fa2aac30355
+size 4907548319

pytorch_model-00005-of-00005.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b10176998a0d82b4157c70854372ada1fba90cbee8668d45c2b248d87a0efcb
+size 4907530809

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,371 @@

+{
+  "metadata": {
+    "total_parameters": 12247802880,
+    "total_size": 24495605760
+  },
+  "weight_map": {
+    "lm_head.weight": "pytorch_model-00005-of-00005.bin",
+    "model.embed_tokens.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.10.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.10.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.11.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.12.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.13.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.15.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.15.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.15.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.16.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.16.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.16.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.17.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.17.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.17.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.18.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.18.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.18.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.19.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.19.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.19.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.20.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.20.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.20.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.21.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.21.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.21.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.22.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.22.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.22.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.23.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.23.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.23.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.24.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.24.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.24.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
+    "model.layers.25.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.25.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.25.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.26.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.26.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.26.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.27.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.27.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.27.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.28.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.28.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.28.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.29.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.29.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.29.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.30.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.30.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.30.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.31.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.31.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.31.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.32.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.32.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.32.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.33.input_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.33.mlp.down_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.33.mlp.up_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
+    "model.layers.34.input_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.34.mlp.down_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.34.mlp.up_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.35.input_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.35.mlp.down_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.35.mlp.gate_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.35.mlp.up_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.36.input_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.36.mlp.down_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.36.mlp.up_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.37.input_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.37.mlp.down_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.37.mlp.up_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.38.input_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.38.mlp.down_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.38.mlp.up_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.39.input_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.39.mlp.down_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.39.mlp.up_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00005-of-00005.bin",
+    "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.6.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.6.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.6.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
+    "model.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.7.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.7.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.8.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.8.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.9.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.9.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.9.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
+    "model.norm.weight": "pytorch_model-00005-of-00005.bin"
+  }
+}

pytorch_model_fsdp.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0497ed1d2c3b3e7b9761b524199e149dce64fe8f58e4d309dff1df953a4c3f6e
+size 24495729540

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f469fb6a869fe76761e1194ed0a7948ca397689bbc8ac0a9ea85a077fd50929
+size 16389

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77e31efd49e7c2510fff79f966c879db58740a4187714c13003ffa53d0d441c5
+size 16389

rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:755aba68d004de8b7239e4451f96d8aaad4274ed7f03ec57d204f73d7b768a54
+size 16389

rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1e57948798e97ec4bd65e4f2bab0090fd58ab95e9d421be20702021446d2636
+size 16389

rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d7930f1c6dd64fe161f166b710675bea007029bf2a54e835287c8517c8d61b7e
+size 16389

rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c098b3d19df4c6a4261105183eb9357e2715d784d681c1426e4bb88c847c317
+size 16389

rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c914eacfcdfa6cf1a18175490235b8bf14f4521cc8ebda28a827fb8611e2958d
+size 16389

rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad02907448ab52a0de6407d8cc85b4523850947654c14a8ca1a3772f6c8c9cf8
+size 16389

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:56fa33f71aa3373959348e1b293b953d482c9514481a257d258125930b278585
+size 1465

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a4e4efa2e5c904d27102d0cb88f1efcd446c7abbfeb829864ce12fe1b4a90a0
+size 17078670

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

trainer_state.json ADDED Viewed

	@@ -0,0 +1,755 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9895470383275261,
+  "eval_steps": 500,
+  "global_step": 71,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0,
+      "eval_loss": 1.0074056386947632,
+      "eval_runtime": 98.4091,
+      "eval_samples_per_second": 47.221,
+      "eval_steps_per_second": 0.193,
+      "memory/device_mem_reserved(gib)": 26.24,
+      "memory/max_mem_active(gib)": 22.2,
+      "memory/max_mem_allocated(gib)": 22.2,
+      "step": 0
+    },
+    {
+      "epoch": 0.013937282229965157,
+      "grad_norm": 33.0,
+      "learning_rate": 0.0,
+      "loss": 1.0193,
+      "memory/device_mem_reserved(gib)": 101.63,
+      "memory/max_mem_active(gib)": 96.87,
+      "memory/max_mem_allocated(gib)": 96.87,
+      "step": 1
+    },
+    {
+      "epoch": 0.027874564459930314,
+      "grad_norm": 30.25,
+      "learning_rate": 3.5e-06,
+      "loss": 0.9952,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 2
+    },
+    {
+      "epoch": 0.041811846689895474,
+      "grad_norm": 2.171875,
+      "learning_rate": 7e-06,
+      "loss": 0.9583,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 3
+    },
+    {
+      "epoch": 0.05574912891986063,
+      "grad_norm": 1.578125,
+      "learning_rate": 6.9963728577635466e-06,
+      "loss": 0.9165,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 4
+    },
+    {
+      "epoch": 0.06968641114982578,
+      "grad_norm": 1.015625,
+      "learning_rate": 6.98549894886036e-06,
+      "loss": 0.8888,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 5
+    },
+    {
+      "epoch": 0.08362369337979095,
+      "grad_norm": 0.7421875,
+      "learning_rate": 6.9674008111271575e-06,
+      "loss": 0.8721,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 6
+    },
+    {
+      "epoch": 0.0975609756097561,
+      "grad_norm": 0.671875,
+      "learning_rate": 6.942115955718097e-06,
+      "loss": 0.8577,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 7
+    },
+    {
+      "epoch": 0.11149825783972125,
+      "grad_norm": 0.57421875,
+      "learning_rate": 6.909696789357177e-06,
+      "loss": 0.8596,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 8
+    },
+    {
+      "epoch": 0.1254355400696864,
+      "grad_norm": 0.5703125,
+      "learning_rate": 6.870210505717297e-06,
+      "loss": 0.8492,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 9
+    },
+    {
+      "epoch": 0.13937282229965156,
+      "grad_norm": 0.5625,
+      "learning_rate": 6.8237389461511175e-06,
+      "loss": 0.8379,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 10
+    },
+    {
+      "epoch": 0.15331010452961671,
+      "grad_norm": 0.458984375,
+      "learning_rate": 6.770378430062349e-06,
+      "loss": 0.8453,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 11
+    },
+    {
+      "epoch": 0.1672473867595819,
+      "grad_norm": 0.52734375,
+      "learning_rate": 6.710239555269086e-06,
+      "loss": 0.8091,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 12
+    },
+    {
+      "epoch": 0.18118466898954705,
+      "grad_norm": 0.45703125,
+      "learning_rate": 6.643446968772936e-06,
+      "loss": 0.838,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 13
+    },
+    {
+      "epoch": 0.1951219512195122,
+      "grad_norm": 0.375,
+      "learning_rate": 6.5701391084090805e-06,
+      "loss": 0.7947,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 14
+    },
+    {
+      "epoch": 0.20905923344947736,
+      "grad_norm": 0.359375,
+      "learning_rate": 6.49046791591271e-06,
+      "loss": 0.7993,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 15
+    },
+    {
+      "epoch": 0.2229965156794425,
+      "grad_norm": 0.3828125,
+      "learning_rate": 6.404598521996588e-06,
+      "loss": 0.8075,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 16
+    },
+    {
+      "epoch": 0.23693379790940766,
+      "grad_norm": 0.37109375,
+      "learning_rate": 6.312708904092424e-06,
+      "loss": 0.8114,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 17
+    },
+    {
+      "epoch": 0.2508710801393728,
+      "grad_norm": 0.38671875,
+      "learning_rate": 6.21498951746547e-06,
+      "loss": 0.7909,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 18
+    },
+    {
+      "epoch": 0.26480836236933797,
+      "grad_norm": 0.439453125,
+      "learning_rate": 6.111642900466899e-06,
+      "loss": 0.7892,
+      "memory/device_mem_reserved(gib)": 128.11,
+      "memory/max_mem_active(gib)": 122.89,
+      "memory/max_mem_allocated(gib)": 122.89,
+      "step": 19
+    },
+    {
+      "epoch": 0.2787456445993031,
+      "grad_norm": 0.330078125,
+      "learning_rate": 6.002883254742148e-06,
+      "loss": 0.7954,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 20
+    },
+    {
+      "epoch": 0.2926829268292683,
+      "grad_norm": 0.328125,
+      "learning_rate": 5.88893600126529e-06,
+      "loss": 0.7871,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 21
+    },
+    {
+      "epoch": 0.30662020905923343,
+      "grad_norm": 0.310546875,
+      "learning_rate": 5.770037313119646e-06,
+      "loss": 0.7897,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 22
+    },
+    {
+      "epoch": 0.3205574912891986,
+      "grad_norm": 0.333984375,
+      "learning_rate": 5.646433625993007e-06,
+      "loss": 0.8007,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 23
+    },
+    {
+      "epoch": 0.3344947735191638,
+      "grad_norm": 0.302734375,
+      "learning_rate": 5.518381127402035e-06,
+      "loss": 0.79,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 24
+    },
+    {
+      "epoch": 0.34843205574912894,
+      "grad_norm": 0.345703125,
+      "learning_rate": 5.386145225704515e-06,
+      "loss": 0.7838,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 25
+    },
+    {
+      "epoch": 0.3623693379790941,
+      "grad_norm": 0.306640625,
+      "learning_rate": 5.25e-06,
+      "loss": 0.7863,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 26
+    },
+    {
+      "epoch": 0.37630662020905925,
+      "grad_norm": 0.43359375,
+      "learning_rate": 5.110227632059032e-06,
+      "loss": 0.7719,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 27
+    },
+    {
+      "epoch": 0.3902439024390244,
+      "grad_norm": 0.294921875,
+      "learning_rate": 4.967117821458325e-06,
+      "loss": 0.7827,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 28
+    },
+    {
+      "epoch": 0.40418118466898956,
+      "grad_norm": 0.291015625,
+      "learning_rate": 4.82096718513415e-06,
+      "loss": 0.7893,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 29
+    },
+    {
+      "epoch": 0.4181184668989547,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 4.672078642598451e-06,
+      "loss": 0.7885,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 30
+    },
+    {
+      "epoch": 0.43205574912891986,
+      "grad_norm": 0.26953125,
+      "learning_rate": 4.5207607880918874e-06,
+      "loss": 0.7921,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 31
+    },
+    {
+      "epoch": 0.445993031358885,
+      "grad_norm": 0.279296875,
+      "learning_rate": 4.36732725097515e-06,
+      "loss": 0.7808,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 32
+    },
+    {
+      "epoch": 0.45993031358885017,
+      "grad_norm": 0.259765625,
+      "learning_rate": 4.212096045684219e-06,
+      "loss": 0.787,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 33
+    },
+    {
+      "epoch": 0.4738675958188153,
+      "grad_norm": 0.244140625,
+      "learning_rate": 4.055388912596879e-06,
+      "loss": 0.7653,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 34
+    },
+    {
+      "epoch": 0.4878048780487805,
+      "grad_norm": 0.248046875,
+      "learning_rate": 3.897530651176662e-06,
+      "loss": 0.7802,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 35
+    },
+    {
+      "epoch": 0.5017421602787456,
+      "grad_norm": 0.25390625,
+      "learning_rate": 3.7388484467763488e-06,
+      "loss": 0.7856,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 36
+    },
+    {
+      "epoch": 0.5156794425087108,
+      "grad_norm": 0.302734375,
+      "learning_rate": 3.5796711924963697e-06,
+      "loss": 0.7722,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 37
+    },
+    {
+      "epoch": 0.5296167247386759,
+      "grad_norm": 0.26953125,
+      "learning_rate": 3.42032880750363e-06,
+      "loss": 0.7803,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 38
+    },
+    {
+      "epoch": 0.5435540069686411,
+      "grad_norm": 0.2578125,
+      "learning_rate": 3.261151553223652e-06,
+      "loss": 0.7782,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 39
+    },
+    {
+      "epoch": 0.5574912891986062,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 3.1024693488233373e-06,
+      "loss": 0.7711,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 40
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.26953125,
+      "learning_rate": 2.94461108740312e-06,
+      "loss": 0.7634,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 41
+    },
+    {
+      "epoch": 0.5853658536585366,
+      "grad_norm": 0.296875,
+      "learning_rate": 2.7879039543157825e-06,
+      "loss": 0.7734,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 42
+    },
+    {
+      "epoch": 0.5993031358885017,
+      "grad_norm": 0.26171875,
+      "learning_rate": 2.6326727490248506e-06,
+      "loss": 0.7876,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 43
+    },
+    {
+      "epoch": 0.6132404181184669,
+      "grad_norm": 0.240234375,
+      "learning_rate": 2.4792392119081124e-06,
+      "loss": 0.7603,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 44
+    },
+    {
+      "epoch": 0.627177700348432,
+      "grad_norm": 0.251953125,
+      "learning_rate": 2.3279213574015483e-06,
+      "loss": 0.7671,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 45
+    },
+    {
+      "epoch": 0.6411149825783972,
+      "grad_norm": 0.263671875,
+      "learning_rate": 2.17903281486585e-06,
+      "loss": 0.7783,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 46
+    },
+    {
+      "epoch": 0.6550522648083623,
+      "grad_norm": 0.2734375,
+      "learning_rate": 2.0328821785416767e-06,
+      "loss": 0.7866,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 47
+    },
+    {
+      "epoch": 0.6689895470383276,
+      "grad_norm": 0.251953125,
+      "learning_rate": 1.8897723679409675e-06,
+      "loss": 0.782,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 48
+    },
+    {
+      "epoch": 0.6829268292682927,
+      "grad_norm": 0.333984375,
+      "learning_rate": 1.7500000000000008e-06,
+      "loss": 0.7747,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 49
+    },
+    {
+      "epoch": 0.6968641114982579,
+      "grad_norm": 0.255859375,
+      "learning_rate": 1.6138547742954857e-06,
+      "loss": 0.7854,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 50
+    },
+    {
+      "epoch": 0.710801393728223,
+      "grad_norm": 0.2421875,
+      "learning_rate": 1.4816188725979652e-06,
+      "loss": 0.7655,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 51
+    },
+    {
+      "epoch": 0.7247386759581882,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 1.3535663740069923e-06,
+      "loss": 0.7726,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 52
+    },
+    {
+      "epoch": 0.7386759581881533,
+      "grad_norm": 0.232421875,
+      "learning_rate": 1.229962686880354e-06,
+      "loss": 0.7836,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 125.6,
+      "memory/max_mem_allocated(gib)": 125.6,
+      "step": 53
+    },
+    {
+      "epoch": 0.7526132404181185,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 1.1110639987347114e-06,
+      "loss": 0.7531,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 54
+    },
+    {
+      "epoch": 0.7665505226480837,
+      "grad_norm": 0.24609375,
+      "learning_rate": 9.971167452578519e-07,
+      "loss": 0.7736,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 55
+    },
+    {
+      "epoch": 0.7804878048780488,
+      "grad_norm": 0.259765625,
+      "learning_rate": 8.883570995331009e-07,
+      "loss": 0.7662,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 56
+    },
+    {
+      "epoch": 0.794425087108014,
+      "grad_norm": 0.2578125,
+      "learning_rate": 7.850104825345303e-07,
+      "loss": 0.7646,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 57
+    },
+    {
+      "epoch": 0.8083623693379791,
+      "grad_norm": 0.24609375,
+      "learning_rate": 6.872910959075762e-07,
+      "loss": 0.7673,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 58
+    },
+    {
+      "epoch": 0.8222996515679443,
+      "grad_norm": 0.248046875,
+      "learning_rate": 5.954014780034123e-07,
+      "loss": 0.7748,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 59
+    },
+    {
+      "epoch": 0.8362369337979094,
+      "grad_norm": 0.24609375,
+      "learning_rate": 5.0953208408729e-07,
+      "loss": 0.7784,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 60
+    },
+    {
+      "epoch": 0.8501742160278746,
+      "grad_norm": 0.283203125,
+      "learning_rate": 4.2986089159092006e-07,
+      "loss": 0.7638,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 61
+    },
+    {
+      "epoch": 0.8641114982578397,
+      "grad_norm": 0.244140625,
+      "learning_rate": 3.5655303122706395e-07,
+      "loss": 0.7682,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 62
+    },
+    {
+      "epoch": 0.8780487804878049,
+      "grad_norm": 0.27734375,
+      "learning_rate": 2.897604447309151e-07,
+      "loss": 0.7706,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 63
+    },
+    {
+      "epoch": 0.89198606271777,
+      "grad_norm": 0.2578125,
+      "learning_rate": 2.2962156993765138e-07,
+      "loss": 0.777,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 64
+    },
+    {
+      "epoch": 0.9059233449477352,
+      "grad_norm": 0.255859375,
+      "learning_rate": 1.7626105384888284e-07,
+      "loss": 0.7701,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 65
+    },
+    {
+      "epoch": 0.9198606271777003,
+      "grad_norm": 0.248046875,
+      "learning_rate": 1.2978949428270303e-07,
+      "loss": 0.761,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 66
+    },
+    {
+      "epoch": 0.9337979094076655,
+      "grad_norm": 0.283203125,
+      "learning_rate": 9.030321064282354e-08,
+      "loss": 0.7711,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 67
+    },
+    {
+      "epoch": 0.9477351916376306,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 5.788404428190291e-08,
+      "loss": 0.7776,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 68
+    },
+    {
+      "epoch": 0.9616724738675958,
+      "grad_norm": 0.236328125,
+      "learning_rate": 3.259918887284235e-08,
+      "loss": 0.7955,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 69
+    },
+    {
+      "epoch": 0.975609756097561,
+      "grad_norm": 0.255859375,
+      "learning_rate": 1.4501051139640508e-08,
+      "loss": 0.7704,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 70
+    },
+    {
+      "epoch": 0.9895470383275261,
+      "grad_norm": 0.263671875,
+      "learning_rate": 3.627142236453551e-09,
+      "loss": 0.7826,
+      "memory/device_mem_reserved(gib)": 133.93,
+      "memory/max_mem_active(gib)": 128.58,
+      "memory/max_mem_allocated(gib)": 128.58,
+      "step": 71
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 71,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 20,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4926199650791719e+19,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34986bd2d4c1c5f807bce3da001e912d8c59dccd1e6e7de6a0b5b14d109d5557
+size 8017