SystemAdmin123 committed on Feb 6

Commit

b560235

verified ·

1 Parent(s): 30a3e99

Training in progress, step 50, checkpoint

Browse files

Files changed (17) hide show

last-checkpoint/config.json +16 -19
last-checkpoint/generation_config.json +2 -3
last-checkpoint/model-00001-of-00003.safetensors +1 -1
last-checkpoint/model-00002-of-00003.safetensors +1 -1
last-checkpoint/model-00003-of-00003.safetensors +1 -1
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state_0.pth +2 -2
last-checkpoint/rng_state_1.pth +2 -2
last-checkpoint/rng_state_2.pth +2 -2
last-checkpoint/rng_state_3.pth +2 -2
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/special_tokens_map.json +1 -1
last-checkpoint/tokenizer.json +2 -2
last-checkpoint/tokenizer.model +2 -2
last-checkpoint/tokenizer_config.json +25 -4
last-checkpoint/trainer_state.json +34 -217
last-checkpoint/training_args.bin +2 -2

last-checkpoint/config.json CHANGED Viewed

@@ -1,31 +1,28 @@
 {
-  "_name_or_path": "trl-internal-testing/tiny-random-LlamaForCausalLM",
   "architectures": [
-    "LlamaForCausalLM"
   ],
-  "attention_bias": false,
   "attention_dropout": 0.0,
-  "bos_token_id": 0,
-  "eos_token_id": 2,
-  "head_dim": 4,
   "hidden_act": "silu",
-  "hidden_size": 16,
   "initializer_range": 0.02,
-  "intermediate_size": 64,
-  "max_position_embeddings": 2048,
-  "mlp_bias": false,
-  "model_type": "llama",
-  "num_attention_heads": 4,
-  "num_hidden_layers": 2,
-  "num_key_value_heads": 4,
-  "pad_token_id": -1,
-  "pretraining_tp": 1,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
   "rope_theta": 10000.0,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.48.1",
   "use_cache": false,
-  "vocab_size": 32000
 }

 {
+  "_name_or_path": "unsloth/OpenHermes-2.5-Mistral-7B",
   "architectures": [
+    "MistralForCausalLM"
   ],
   "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "head_dim": 128,
   "hidden_act": "silu",
+  "hidden_size": 4096,
   "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "rms_norm_eps": 1e-05,
   "rope_theta": 10000.0,
+  "sliding_window": 4096,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.48.1",
   "use_cache": false,
+  "vocab_size": 32002
 }

last-checkpoint/generation_config.json CHANGED Viewed

@@ -1,8 +1,7 @@
 {
   "_from_model_config": true,
-  "bos_token_id": 0,
   "do_sample": true,
-  "eos_token_id": 1,
-  "pad_token_id": 2,
   "transformers_version": "4.48.1"
 }

 {
   "_from_model_config": true,
+  "bos_token_id": 1,
   "do_sample": true,
+  "eos_token_id": 32000,
   "transformers_version": "4.48.1"
 }

last-checkpoint/model-00001-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:720105d2b88abf15d23994fec61854b0a1497a10d1d208715bd59904ab5936ff
 size 4943178720

 version https://git-lfs.github.com/spec/v1
+oid sha256:d9e33d86b7643ca1dcb0ff586538eca0faa1c5ceb646f55ab91b4c3f84b9791f
 size 4943178720

last-checkpoint/model-00002-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:22f56733ac237ee5fada526e2a06e89d311e7ce1cb761f9b8f6986910b678378
 size 4999819336

 version https://git-lfs.github.com/spec/v1
+oid sha256:09671ff7f4231f95a58974cee08101fdbd61e0e44a2da6afaa3f08be535bc56c
 size 4999819336

last-checkpoint/model-00003-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:31711beccc0b973ef7bf69d348d844233b185f6888f0520ae136ada35d74f983
 size 4540532728

 version https://git-lfs.github.com/spec/v1
+oid sha256:867e8292011b488643369bc98aece5a8d9682c4d0825739096988e70f10694e1
 size 4540532728

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:45076cd7d7f407afe1d46ae5fb6859b5620dc1fca8a858a0851ce56bd11f8878
-size 2162798

 version https://git-lfs.github.com/spec/v1
+oid sha256:ea5954872831a1199bf6baadd994d97241ff683269e1fa383981e05c79f3d256
+size 14710155092

last-checkpoint/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4faa065a55913b65f4b0549e4d93d87e8865c0f6ec216f40a3de4d251a15322a
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:d608751b30705d9fdab765f269290cd17ed21a1697e9fcb49bf7ebeaac38aebb
+size 15024

last-checkpoint/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:10ae8864af9d168bc9a94e5c5625da874d35a133304d7d7414b10c80148467d4
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:e84ec92ba6fcaca80f594e6a478d1a67e74e4c4df966b365126aa5fced1503ad
+size 15024

last-checkpoint/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ece1ed46b8aa193251efdc1d8393b3bb872b53f6ba93c31cc3efc627b34d74be
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:ff2e96f7570e4637dff53935f12387cbc820714a50eb737472244c44d20994b4
+size 15024

last-checkpoint/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3018d94a8b9b3b95a3578032d80b8d3f31c01fab9a615c48039128422aba13ef
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:67a7f9fde521312c299d8fef03e73900bbba79d87446cea5f97a33fc79bebea8
+size 15024

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:74485e67705dc36efbfb69b1e54f842e1ff07894d01bb0e36d6d2526a318b300
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:1b5bd1b09d58ee3e65553e6f9772dc7c5ca98238b1cade33cccd500df2328864
 size 1064

last-checkpoint/special_tokens_map.json CHANGED Viewed

@@ -7,7 +7,7 @@
     "single_word": false
   },
   "eos_token": {
-    "content": "</s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,

     "single_word": false
   },
   "eos_token": {
+    "content": "<|im_end|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,

last-checkpoint/tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d606f46a8aa6f29f0a0abdec7c3ffddefc9f9bfe26919532d209a0a850e25029
-size 3619013

 version https://git-lfs.github.com/spec/v1
+oid sha256:04222cd76979c181cd3f72c3bf6982fe2a09d9f4b8f23d82902efde18f1d0668
+size 3506125

last-checkpoint/tokenizer.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
-size 499723

 version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

last-checkpoint/tokenizer_config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "add_bos_token": true,
   "add_eos_token": false,
-  "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
@@ -26,17 +26,38 @@
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
   "bos_token": "<s>",
-  "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response: ' + message['content'] + eos_token}}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "</s>",
   "extra_special_tokens": {},
   "legacy": true,
-  "model_max_length": 2048,
   "pad_token": "<unk>",
   "tokenizer_class": "LlamaTokenizer",
   "unk_token": "<unk>",
   "use_default_system_prompt": true,
   "use_fast": true

 {
   "add_bos_token": true,
   "add_eos_token": false,
+  "add_prefix_space": true,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "32000": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
+  "additional_special_tokens": [],
   "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
   "extra_special_tokens": {},
   "legacy": true,
+  "model_max_length": 32768,
   "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
   "tokenizer_class": "LlamaTokenizer",
+  "trust_remote_code": false,
   "unk_token": "<unk>",
   "use_default_system_prompt": true,
   "use_fast": true

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,245 +1,62 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 100.0,
-  "eval_steps": 200,
-  "global_step": 300,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.3333333333333333,
-      "eval_loss": 10.375551223754883,
-      "eval_runtime": 0.9886,
-      "eval_samples_per_second": 1518.364,
-      "eval_steps_per_second": 6.069,
       "step": 1
     },
     {
-      "epoch": 3.3333333333333335,
-      "grad_norm": 0.1015625,
-      "learning_rate": 0.00019945218953682734,
-      "loss": 10.3718,
       "step": 10
     },
     {
-      "epoch": 6.666666666666667,
-      "grad_norm": 0.162109375,
-      "learning_rate": 0.00019781476007338058,
-      "loss": 10.3553,
       "step": 20
     },
     {
-      "epoch": 10.0,
-      "grad_norm": 0.291015625,
-      "learning_rate": 0.00019510565162951537,
-      "loss": 10.3251,
       "step": 30
     },
     {
-      "epoch": 13.333333333333334,
-      "grad_norm": 0.330078125,
-      "learning_rate": 0.0001913545457642601,
-      "loss": 10.2723,
       "step": 40
     },
     {
-      "epoch": 16.666666666666668,
-      "grad_norm": 0.328125,
-      "learning_rate": 0.00018660254037844388,
-      "loss": 10.2096,
       "step": 50
-    },
-    {
-      "epoch": 20.0,
-      "grad_norm": 0.328125,
-      "learning_rate": 0.00018090169943749476,
-      "loss": 10.1499,
-      "step": 60
-    },
-    {
-      "epoch": 23.333333333333332,
-      "grad_norm": 0.326171875,
-      "learning_rate": 0.00017431448254773944,
-      "loss": 10.0935,
-      "step": 70
-    },
-    {
-      "epoch": 26.666666666666668,
-      "grad_norm": 0.333984375,
-      "learning_rate": 0.00016691306063588583,
-      "loss": 10.0398,
-      "step": 80
-    },
-    {
-      "epoch": 30.0,
-      "grad_norm": 0.337890625,
-      "learning_rate": 0.00015877852522924732,
-      "loss": 9.9895,
-      "step": 90
-    },
-    {
-      "epoch": 33.333333333333336,
-      "grad_norm": 0.341796875,
-      "learning_rate": 0.00015000000000000001,
-      "loss": 9.9424,
-      "step": 100
-    },
-    {
-      "epoch": 36.666666666666664,
-      "grad_norm": 0.34375,
-      "learning_rate": 0.00014067366430758004,
-      "loss": 9.8995,
-      "step": 110
-    },
-    {
-      "epoch": 40.0,
-      "grad_norm": 0.345703125,
-      "learning_rate": 0.00013090169943749476,
-      "loss": 9.859,
-      "step": 120
-    },
-    {
-      "epoch": 43.333333333333336,
-      "grad_norm": 0.34765625,
-      "learning_rate": 0.00012079116908177593,
-      "loss": 9.8216,
-      "step": 130
-    },
-    {
-      "epoch": 46.666666666666664,
-      "grad_norm": 0.3515625,
-      "learning_rate": 0.00011045284632676536,
-      "loss": 9.7872,
-      "step": 140
-    },
-    {
-      "epoch": 50.0,
-      "grad_norm": 0.357421875,
-      "learning_rate": 0.0001,
-      "loss": 9.7569,
-      "step": 150
-    },
-    {
-      "epoch": 53.333333333333336,
-      "grad_norm": 0.35546875,
-      "learning_rate": 8.954715367323468e-05,
-      "loss": 9.7325,
-      "step": 160
-    },
-    {
-      "epoch": 56.666666666666664,
-      "grad_norm": 0.359375,
-      "learning_rate": 7.920883091822408e-05,
-      "loss": 9.712,
-      "step": 170
-    },
-    {
-      "epoch": 60.0,
-      "grad_norm": 0.359375,
-      "learning_rate": 6.909830056250527e-05,
-      "loss": 9.697,
-      "step": 180
-    },
-    {
-      "epoch": 63.333333333333336,
-      "grad_norm": 0.361328125,
-      "learning_rate": 5.9326335692419995e-05,
-      "loss": 9.6841,
-      "step": 190
-    },
-    {
-      "epoch": 66.66666666666667,
-      "grad_norm": 0.361328125,
-      "learning_rate": 5.000000000000002e-05,
-      "loss": 9.6746,
-      "step": 200
-    },
-    {
-      "epoch": 66.66666666666667,
-      "eval_loss": 9.681697845458984,
-      "eval_runtime": 0.8666,
-      "eval_samples_per_second": 1732.155,
-      "eval_steps_per_second": 6.924,
-      "step": 200
-    },
-    {
-      "epoch": 70.0,
-      "grad_norm": 0.36328125,
-      "learning_rate": 4.12214747707527e-05,
-      "loss": 9.6678,
-      "step": 210
-    },
-    {
-      "epoch": 73.33333333333333,
-      "grad_norm": 0.36328125,
-      "learning_rate": 3.308693936411421e-05,
-      "loss": 9.6641,
-      "step": 220
-    },
-    {
-      "epoch": 76.66666666666667,
-      "grad_norm": 0.36328125,
-      "learning_rate": 2.5685517452260567e-05,
-      "loss": 9.6616,
-      "step": 230
-    },
-    {
-      "epoch": 80.0,
-      "grad_norm": 0.36328125,
-      "learning_rate": 1.9098300562505266e-05,
-      "loss": 9.6605,
-      "step": 240
-    },
-    {
-      "epoch": 83.33333333333333,
-      "grad_norm": 0.365234375,
-      "learning_rate": 1.339745962155613e-05,
-      "loss": 9.6596,
-      "step": 250
-    },
-    {
-      "epoch": 86.66666666666667,
-      "grad_norm": 0.36328125,
-      "learning_rate": 8.645454235739903e-06,
-      "loss": 9.6597,
-      "step": 260
-    },
-    {
-      "epoch": 90.0,
-      "grad_norm": 0.36328125,
-      "learning_rate": 4.8943483704846475e-06,
-      "loss": 9.6595,
-      "step": 270
-    },
-    {
-      "epoch": 93.33333333333333,
-      "grad_norm": 0.361328125,
-      "learning_rate": 2.1852399266194314e-06,
-      "loss": 9.6595,
-      "step": 280
-    },
-    {
-      "epoch": 96.66666666666667,
-      "grad_norm": 0.3671875,
-      "learning_rate": 5.478104631726711e-07,
-      "loss": 9.659,
-      "step": 290
-    },
-    {
-      "epoch": 100.0,
-      "grad_norm": 0.36328125,
-      "learning_rate": 0.0,
-      "loss": 9.6596,
-      "step": 300
     }
   ],
   "logging_steps": 10,
-  "max_steps": 300,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 100,
-  "save_steps": 40,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -247,13 +64,13 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 490990259404800.0,
-  "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null
 }

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.47619047619047616,
+  "eval_steps": 100,
+  "global_step": 50,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.009523809523809525,
+      "eval_loss": 1.873080849647522,
+      "eval_runtime": 27.6901,
+      "eval_samples_per_second": 54.207,
+      "eval_steps_per_second": 6.789,
       "step": 1
     },
     {
+      "epoch": 0.09523809523809523,
+      "grad_norm": 25.375,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 2.6924,
       "step": 10
     },
     {
+      "epoch": 0.19047619047619047,
+      "grad_norm": 2560.0,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 4.615,
       "step": 20
     },
     {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 42.25,
+      "learning_rate": 0.0002,
+      "loss": 9.3887,
       "step": 30
     },
     {
+      "epoch": 0.38095238095238093,
+      "grad_norm": 1152.0,
+      "learning_rate": 0.00019984815164333163,
+      "loss": 8.7566,
       "step": 40
     },
     {
+      "epoch": 0.47619047619047616,
+      "grad_norm": 62.5,
+      "learning_rate": 0.00019939306773179497,
+      "loss": 8.555,
       "step": 50
     }
   ],
   "logging_steps": 10,
+  "max_steps": 600,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 6,
+  "save_steps": 50,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": false
       },
       "attributes": {}
     }
   },
+  "total_flos": 3.495035542700032e+16,
+  "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
 }

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c5805d33db3bd7bbd9181584d8158e2b8616e3f18d56ec44b12ac78d4f859a79
-size 6840

 version https://git-lfs.github.com/spec/v1
+oid sha256:c7626f1764e5176ea7bd748263d627f8cb6423b299d8a596d8f39c9c0a75e460
+size 6776