medication-stucture / running_log.txt
[INFO|2025-06-25 12:12:48] tokenization_utils_base.py:2023 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.json
[INFO|2025-06-25 12:12:48] tokenization_utils_base.py:2023 >> loading file tokenizer.model from cache at None
[INFO|2025-06-25 12:12:48] tokenization_utils_base.py:2023 >> loading file added_tokens.json from cache at None
[INFO|2025-06-25 12:12:48] tokenization_utils_base.py:2023 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/special_tokens_map.json
[INFO|2025-06-25 12:12:48] tokenization_utils_base.py:2023 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer_config.json
[INFO|2025-06-25 12:12:48] tokenization_utils_base.py:2023 >> loading file chat_template.jinja from cache at None
[INFO|2025-06-25 12:12:48] tokenization_utils_base.py:2299 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
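
The tokenizer files above come straight from the Hub cache. A minimal sketch of reproducing this load step with transformers, pinning the same snapshot that appears in the cache paths (note the repo is gated, so a Hub token with accepted license terms is assumed):

from transformers import AutoTokenizer

# Load the Llama 3.1 8B Instruct tokenizer; the revision matches the snapshot
# hash in the cache paths logged above.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    revision="0e9e39f249a16976918f6564b8830bc894c89659",
)
print(len(tokenizer))  # 128256, matching vocab_size in the config dumps below
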
[INFO|2025-06-25 12:12:50] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/config.json
[INFO|2025-06-25 12:12:50] configuration_utils.py:770 >> Model config LlamaConfig {
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": [
128001,
128008,
128009
],
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.52.4",
"use_cache": true,
"vocab_size": 128256
}
[INFO|2025-06-25 12:12:51] tokenization_utils_base.py:2023 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer.json
[INFO|2025-06-25 12:12:51] tokenization_utils_base.py:2023 >> loading file tokenizer.model from cache at None
[INFO|2025-06-25 12:12:51] tokenization_utils_base.py:2023 >> loading file added_tokens.json from cache at None
[INFO|2025-06-25 12:12:51] tokenization_utils_base.py:2023 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/special_tokens_map.json
[INFO|2025-06-25 12:12:51] tokenization_utils_base.py:2023 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer_config.json
[INFO|2025-06-25 12:12:51] tokenization_utils_base.py:2023 >> loading file chat_template.jinja from cache at None
[INFO|2025-06-25 12:12:51] tokenization_utils_base.py:2299 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-06-25 12:12:51] logging.py:143 >> Add pad token: <|eot_id|>
[INFO|2025-06-25 12:12:51] logging.py:143 >> Add <|eom_id|> to stop words.
[INFO|2025-06-25 12:12:51] logging.py:143 >> Loading dataset thomas_train.json...
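
The three LLaMA-Factory lines above cover padding and data setup. A minimal sketch of the equivalent manual steps, assuming the same pad-token choice; the schema of thomas_train.json is defined by the LLaMA-Factory dataset config and is not visible in this log:

from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

# Reuse <|eot_id|> as the padding token, as the log line above does; Llama 3.1
# ships without a dedicated pad token, so an existing special token is reused.
tokenizer.pad_token = "<|eot_id|>"

# Load the instruction data from the JSON file named in the log.
dataset = load_dataset("json", data_files="thomas_train.json", split="train")
print(len(dataset))  # 1,754 examples, per the trainer summary further down
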
[INFO|2025-06-25 12:12:54] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/config.json
[INFO|2025-06-25 12:12:54] configuration_utils.py:770 >> Model config LlamaConfig {
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": [
128001,
128008,
128009
],
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.52.4",
"use_cache": true,
"vocab_size": 128256
}
[INFO|2025-06-25 12:12:54] logging.py:143 >> KV cache is disabled during training.
[INFO|2025-06-25 12:12:54] modeling_utils.py:1151 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/model.safetensors.index.json
[INFO|2025-06-25 12:12:54] modeling_utils.py:2241 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
[INFO|2025-06-25 12:12:54] configuration_utils.py:1135 >> Generate config GenerationConfig {
"bos_token_id": 128000,
"eos_token_id": [
128001,
128008,
128009
],
"use_cache": false
}
[INFO|2025-06-25 12:12:57] modeling_utils.py:5131 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
[INFO|2025-06-25 12:12:57] modeling_utils.py:5139 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Meta-Llama-3.1-8B-Instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
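
A minimal sketch of the corresponding model load, assuming the dtype and attention backend the log reports ("default dtype torch.bfloat16", "Using torch SDPA"):

import torch
from transformers import AutoModelForCausalLM

# Load the base model in bfloat16 with the SDPA attention backend.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
)
model.config.use_cache = False  # "KV cache is disabled during training" (see above)
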
[INFO|2025-06-25 12:12:57] configuration_utils.py:1090 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/generation_config.json
[INFO|2025-06-25 12:12:57] configuration_utils.py:1135 >> Generate config GenerationConfig {
"bos_token_id": 128000,
"do_sample": true,
"eos_token_id": [
128001,
128008,
128009
],
"temperature": 0.6,
"top_p": 0.9
}
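
The GenerationConfig above is the checkpoint's default sampling setup (nucleus sampling, temperature 0.6, top_p 0.9). A minimal sketch of applying it at inference time, reusing the tokenizer and model objects from the sketches above; the prompt is purely illustrative:

# Apply the checkpoint's default sampling parameters at inference time.
messages = [{"role": "user", "content": "Hello!"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(
    inputs,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    max_new_tokens=128,
)
print(tokenizer.decode(outputs[0, inputs.shape[-1]:], skip_special_tokens=True))
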
[INFO|2025-06-25 12:12:58] logging.py:143 >> Gradient checkpointing enabled.
[INFO|2025-06-25 12:12:58] logging.py:143 >> Using torch SDPA for faster training and inference.
[INFO|2025-06-25 12:12:58] logging.py:143 >> Upcasting trainable params to float32.
[INFO|2025-06-25 12:12:58] logging.py:143 >> Fine-tuning method: LoRA
[INFO|2025-06-25 12:12:58] logging.py:143 >> Found linear modules: v_proj,o_proj,k_proj,up_proj,down_proj,gate_proj,q_proj
[INFO|2025-06-25 12:12:58] logging.py:143 >> trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605
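
The target modules and the trainable-parameter count pin down the LoRA rank: each rank adds 81,920 parameters per layer across the seven projections (q/k/v/o: 8,192 + 5,120 + 5,120 + 8,192; gate/up/down: 3 x 18,432), so 20,971,520 = 8 x 81,920 x 32 layers, i.e. rank 8. A minimal peft sketch under that rank; lora_alpha and lora_dropout are assumptions, not recorded in this log:

from peft import LoraConfig, get_peft_model

# r=8 reproduces the logged 20,971,520 trainable parameters for these targets;
# lora_alpha and lora_dropout are assumed values.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605
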
[INFO|2025-06-25 12:12:58] trainer.py:756 >> Using auto half precision backend
[INFO|2025-06-25 12:12:58] trainer.py:2409 >> ***** Running training *****
[INFO|2025-06-25 12:12:58] trainer.py:2410 >> Num examples = 1,754
[INFO|2025-06-25 12:12:58] trainer.py:2411 >> Num Epochs = 3
[INFO|2025-06-25 12:12:58] trainer.py:2412 >> Instantaneous batch size per device = 2
[INFO|2025-06-25 12:12:58] trainer.py:2415 >> Total train batch size (w. parallel, distributed & accumulation) = 16
[INFO|2025-06-25 12:12:58] trainer.py:2416 >> Gradient Accumulation steps = 8
[INFO|2025-06-25 12:12:58] trainer.py:2417 >> Total optimization steps = 330
[INFO|2025-06-25 12:12:58] trainer.py:2418 >> Number of trainable parameters = 20,971,520
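
The trainer summary fixes the schedule: an effective batch of 16 (per-device batch 2 x gradient accumulation 8 on a single device), so ceil(1754 / 16) = 110 optimizer steps per epoch and 330 steps over 3 epochs. A minimal TrainingArguments sketch consistent with those numbers; values marked "inferred" are read off the learning-rate curve and checkpoint cadence below rather than confirmed by the log:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25",
    num_train_epochs=3,              # logged
    per_device_train_batch_size=2,   # logged
    gradient_accumulation_steps=8,   # logged
    learning_rate=5e-5,              # inferred from the logged learning rates
    lr_scheduler_type="cosine",      # inferred (see the note near the end of the run)
    logging_steps=10,                # inferred: 33 loss lines over 330 steps
    save_steps=100,                  # inferred: checkpoints at 100, 200, 300, 330
    bf16=True,                       # matches the bfloat16 default dtype
    gradient_checkpointing=True,     # logged: "Gradient checkpointing enabled."
)

A Trainer built with these arguments, the peft-wrapped model, and the tokenized dataset would reproduce the 330-step run summarized above.
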
[INFO|2025-06-25 12:13:29] logging.py:143 >> {'loss': 0.4811, 'learning_rate': 4.9982e-05, 'epoch': 0.05, 'throughput': 3654.89}
[INFO|2025-06-25 12:13:59] logging.py:143 >> {'loss': 0.2124, 'learning_rate': 4.9908e-05, 'epoch': 0.09, 'throughput': 3657.56}
[INFO|2025-06-25 12:14:29] logging.py:143 >> {'loss': 0.1393, 'learning_rate': 4.9778e-05, 'epoch': 0.14, 'throughput': 3652.34}
[INFO|2025-06-25 12:15:00] logging.py:143 >> {'loss': 0.1043, 'learning_rate': 4.9592e-05, 'epoch': 0.18, 'throughput': 3651.69}
[INFO|2025-06-25 12:15:30] logging.py:143 >> {'loss': 0.0953, 'learning_rate': 4.9350e-05, 'epoch': 0.23, 'throughput': 3653.26}
[INFO|2025-06-25 12:16:00] logging.py:143 >> {'loss': 0.0867, 'learning_rate': 4.9053e-05, 'epoch': 0.27, 'throughput': 3653.94}
[INFO|2025-06-25 12:16:30] logging.py:143 >> {'loss': 0.0625, 'learning_rate': 4.8702e-05, 'epoch': 0.32, 'throughput': 3655.41}
[INFO|2025-06-25 12:17:00] logging.py:143 >> {'loss': 0.0727, 'learning_rate': 4.8297e-05, 'epoch': 0.36, 'throughput': 3656.75}
[INFO|2025-06-25 12:17:31] logging.py:143 >> {'loss': 0.0727, 'learning_rate': 4.7839e-05, 'epoch': 0.41, 'throughput': 3656.82}
[INFO|2025-06-25 12:18:01] logging.py:143 >> {'loss': 0.0676, 'learning_rate': 4.7329e-05, 'epoch': 0.46, 'throughput': 3658.59}
[INFO|2025-06-25 12:18:31] logging.py:143 >> {'loss': 0.0635, 'learning_rate': 4.6769e-05, 'epoch': 0.50, 'throughput': 3658.13}
[INFO|2025-06-25 12:19:02] logging.py:143 >> {'loss': 0.0586, 'learning_rate': 4.6159e-05, 'epoch': 0.55, 'throughput': 3658.10}
[INFO|2025-06-25 12:19:32] logging.py:143 >> {'loss': 0.0572, 'learning_rate': 4.5502e-05, 'epoch': 0.59, 'throughput': 3657.87}
[INFO|2025-06-25 12:20:02] logging.py:143 >> {'loss': 0.0540, 'learning_rate': 4.4798e-05, 'epoch': 0.64, 'throughput': 3657.89}
[INFO|2025-06-25 12:20:32] logging.py:143 >> {'loss': 0.0470, 'learning_rate': 4.4049e-05, 'epoch': 0.68, 'throughput': 3657.81}
[INFO|2025-06-25 12:21:02] logging.py:143 >> {'loss': 0.0471, 'learning_rate': 4.3257e-05, 'epoch': 0.73, 'throughput': 3657.54}
[INFO|2025-06-25 12:21:32] logging.py:143 >> {'loss': 0.0408, 'learning_rate': 4.2423e-05, 'epoch': 0.78, 'throughput': 3657.55}
[INFO|2025-06-25 12:22:03] logging.py:143 >> {'loss': 0.0398, 'learning_rate': 4.1551e-05, 'epoch': 0.82, 'throughput': 3657.75}
[INFO|2025-06-25 12:22:33] logging.py:143 >> {'loss': 0.0369, 'learning_rate': 4.0640e-05, 'epoch': 0.87, 'throughput': 3658.09}
[INFO|2025-06-25 12:23:03] logging.py:143 >> {'loss': 0.0408, 'learning_rate': 3.9695e-05, 'epoch': 0.91, 'throughput': 3658.50}
[INFO|2025-06-25 12:23:03] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-100
[INFO|2025-06-25 12:23:04] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/config.json
[INFO|2025-06-25 12:23:04] configuration_utils.py:770 >> Model config LlamaConfig {
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": [
128001,
128008,
128009
],
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.52.4",
"use_cache": true,
"vocab_size": 128256
}
[INFO|2025-06-25 12:23:04] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-100/chat_template.jinja
[INFO|2025-06-25 12:23:04] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-100/tokenizer_config.json
[INFO|2025-06-25 12:23:04] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-100/special_tokens_map.json
[INFO|2025-06-25 12:23:34] logging.py:143 >> {'loss': 0.0441, 'learning_rate': 3.8716e-05, 'epoch': 0.96, 'throughput': 3653.95}
[INFO|2025-06-25 12:24:02] logging.py:143 >> {'loss': 0.0356, 'learning_rate': 3.7706e-05, 'epoch': 1.00, 'throughput': 3654.53}
[INFO|2025-06-25 12:24:32] logging.py:143 >> {'loss': 0.0358, 'learning_rate': 3.6667e-05, 'epoch': 1.05, 'throughput': 3655.17}
[INFO|2025-06-25 12:25:02] logging.py:143 >> {'loss': 0.0388, 'learning_rate': 3.5601e-05, 'epoch': 1.09, 'throughput': 3655.29}
[INFO|2025-06-25 12:25:32] logging.py:143 >> {'loss': 0.0323, 'learning_rate': 3.4512e-05, 'epoch': 1.14, 'throughput': 3655.54}
[INFO|2025-06-25 12:26:02] logging.py:143 >> {'loss': 0.0310, 'learning_rate': 3.3401e-05, 'epoch': 1.18, 'throughput': 3655.62}
[INFO|2025-06-25 12:26:33] logging.py:143 >> {'loss': 0.0279, 'learning_rate': 3.2271e-05, 'epoch': 1.23, 'throughput': 3655.42}
[INFO|2025-06-25 12:27:03] logging.py:143 >> {'loss': 0.0285, 'learning_rate': 3.1125e-05, 'epoch': 1.27, 'throughput': 3655.57}
[INFO|2025-06-25 12:27:33] logging.py:143 >> {'loss': 0.0292, 'learning_rate': 2.9965e-05, 'epoch': 1.32, 'throughput': 3655.94}
[INFO|2025-06-25 12:28:03] logging.py:143 >> {'loss': 0.0299, 'learning_rate': 2.8793e-05, 'epoch': 1.36, 'throughput': 3656.65}
[INFO|2025-06-25 12:28:33] logging.py:143 >> {'loss': 0.0345, 'learning_rate': 2.7613e-05, 'epoch': 1.41, 'throughput': 3657.12}
[INFO|2025-06-25 12:29:03] logging.py:143 >> {'loss': 0.0338, 'learning_rate': 2.6427e-05, 'epoch': 1.46, 'throughput': 3657.61}
[INFO|2025-06-25 12:29:33] logging.py:143 >> {'loss': 0.0363, 'learning_rate': 2.5238e-05, 'epoch': 1.50, 'throughput': 3658.09}
[INFO|2025-06-25 12:30:03] logging.py:143 >> {'loss': 0.0294, 'learning_rate': 2.4048e-05, 'epoch': 1.55, 'throughput': 3658.22}
[INFO|2025-06-25 12:30:33] logging.py:143 >> {'loss': 0.0334, 'learning_rate': 2.2861e-05, 'epoch': 1.59, 'throughput': 3658.33}
[INFO|2025-06-25 12:31:04] logging.py:143 >> {'loss': 0.0296, 'learning_rate': 2.1678e-05, 'epoch': 1.64, 'throughput': 3658.36}
[INFO|2025-06-25 12:31:34] logging.py:143 >> {'loss': 0.0319, 'learning_rate': 2.0503e-05, 'epoch': 1.68, 'throughput': 3658.35}
[INFO|2025-06-25 12:32:04] logging.py:143 >> {'loss': 0.0324, 'learning_rate': 1.9338e-05, 'epoch': 1.73, 'throughput': 3658.56}
[INFO|2025-06-25 12:32:34] logging.py:143 >> {'loss': 0.0337, 'learning_rate': 1.8185e-05, 'epoch': 1.78, 'throughput': 3658.65}
[INFO|2025-06-25 12:33:04] logging.py:143 >> {'loss': 0.0356, 'learning_rate': 1.7049e-05, 'epoch': 1.82, 'throughput': 3658.30}
[INFO|2025-06-25 12:33:04] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-200
[INFO|2025-06-25 12:33:05] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/config.json
[INFO|2025-06-25 12:33:05] configuration_utils.py:770 >> Model config LlamaConfig {
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": [
128001,
128008,
128009
],
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.52.4",
"use_cache": true,
"vocab_size": 128256
}
[INFO|2025-06-25 12:33:05] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-200/chat_template.jinja
[INFO|2025-06-25 12:33:05] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-200/tokenizer_config.json
[INFO|2025-06-25 12:33:05] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-200/special_tokens_map.json
[INFO|2025-06-25 12:33:36] logging.py:143 >> {'loss': 0.0259, 'learning_rate': 1.5930e-05, 'epoch': 1.87, 'throughput': 3655.21}
[INFO|2025-06-25 12:34:06] logging.py:143 >> {'loss': 0.0372, 'learning_rate': 1.4832e-05, 'epoch': 1.91, 'throughput': 3655.11}
[INFO|2025-06-25 12:34:36] logging.py:143 >> {'loss': 0.0261, 'learning_rate': 1.3756e-05, 'epoch': 1.96, 'throughput': 3655.18}
[INFO|2025-06-25 12:35:04] logging.py:143 >> {'loss': 0.0256, 'learning_rate': 1.2707e-05, 'epoch': 2.00, 'throughput': 3655.42}
[INFO|2025-06-25 12:35:34] logging.py:143 >> {'loss': 0.0240, 'learning_rate': 1.1685e-05, 'epoch': 2.05, 'throughput': 3655.45}
[INFO|2025-06-25 12:36:04] logging.py:143 >> {'loss': 0.0220, 'learning_rate': 1.0693e-05, 'epoch': 2.09, 'throughput': 3655.85}
[INFO|2025-06-25 12:36:34] logging.py:143 >> {'loss': 0.0223, 'learning_rate': 9.7338e-06, 'epoch': 2.14, 'throughput': 3656.19}
[INFO|2025-06-25 12:37:05] logging.py:143 >> {'loss': 0.0251, 'learning_rate': 8.8091e-06, 'epoch': 2.18, 'throughput': 3656.49}
[INFO|2025-06-25 12:37:35] logging.py:143 >> {'loss': 0.0223, 'learning_rate': 7.9211e-06, 'epoch': 2.23, 'throughput': 3656.84}
[INFO|2025-06-25 12:38:05] logging.py:143 >> {'loss': 0.0255, 'learning_rate': 7.0717e-06, 'epoch': 2.27, 'throughput': 3657.06}
[INFO|2025-06-25 12:38:35] logging.py:143 >> {'loss': 0.0211, 'learning_rate': 6.2630e-06, 'epoch': 2.32, 'throughput': 3657.38}
[INFO|2025-06-25 12:39:05] logging.py:143 >> {'loss': 0.0235, 'learning_rate': 5.4967e-06, 'epoch': 2.36, 'throughput': 3657.62}
[INFO|2025-06-25 12:39:35] logging.py:143 >> {'loss': 0.0249, 'learning_rate': 4.7746e-06, 'epoch': 2.41, 'throughput': 3657.94}
[INFO|2025-06-25 12:40:05] logging.py:143 >> {'loss': 0.0228, 'learning_rate': 4.0983e-06, 'epoch': 2.46, 'throughput': 3658.24}
[INFO|2025-06-25 12:40:35] logging.py:143 >> {'loss': 0.0304, 'learning_rate': 3.4693e-06, 'epoch': 2.50, 'throughput': 3658.54}
[INFO|2025-06-25 12:41:06] logging.py:143 >> {'loss': 0.0230, 'learning_rate': 2.8892e-06, 'epoch': 2.55, 'throughput': 3658.85}
[INFO|2025-06-25 12:41:36] logging.py:143 >> {'loss': 0.0263, 'learning_rate': 2.3591e-06, 'epoch': 2.59, 'throughput': 3659.12}
[INFO|2025-06-25 12:42:06] logging.py:143 >> {'loss': 0.0259, 'learning_rate': 1.8803e-06, 'epoch': 2.64, 'throughput': 3659.33}
[INFO|2025-06-25 12:42:35] logging.py:143 >> {'loss': 0.0217, 'learning_rate': 1.4539e-06, 'epoch': 2.68, 'throughput': 3659.64}
[INFO|2025-06-25 12:43:06] logging.py:143 >> {'loss': 0.0259, 'learning_rate': 1.0808e-06, 'epoch': 2.73, 'throughput': 3659.99}
[INFO|2025-06-25 12:43:06] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-300
[INFO|2025-06-25 12:43:06] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/config.json
[INFO|2025-06-25 12:43:06] configuration_utils.py:770 >> Model config LlamaConfig {
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": [
128001,
128008,
128009
],
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.52.4",
"use_cache": true,
"vocab_size": 128256
}
[INFO|2025-06-25 12:43:06] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-300/chat_template.jinja
[INFO|2025-06-25 12:43:06] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-300/tokenizer_config.json
[INFO|2025-06-25 12:43:06] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-300/special_tokens_map.json
[INFO|2025-06-25 12:43:37] logging.py:143 >> {'loss': 0.0259, 'learning_rate': 7.6192e-07, 'epoch': 2.78, 'throughput': 3658.22}
[INFO|2025-06-25 12:44:07] logging.py:143 >> {'loss': 0.0279, 'learning_rate': 4.9794e-07, 'epoch': 2.82, 'throughput': 3658.34}
[INFO|2025-06-25 12:44:37] logging.py:143 >> {'loss': 0.0221, 'learning_rate': 2.8946e-07, 'epoch': 2.87, 'throughput': 3658.39}
[INFO|2025-06-25 12:45:07] logging.py:143 >> {'loss': 0.0246, 'learning_rate': 1.3695e-07, 'epoch': 2.91, 'throughput': 3658.39}
[INFO|2025-06-25 12:45:37] logging.py:143 >> {'loss': 0.0189, 'learning_rate': 4.0772e-08, 'epoch': 2.96, 'throughput': 3658.32}
[INFO|2025-06-25 12:46:05] logging.py:143 >> {'loss': 0.0228, 'learning_rate': 1.1329e-09, 'epoch': 3.00, 'throughput': 3658.49}
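
The learning-rate column is consistent with warmup-free cosine decay from a 5e-5 peak over the 330 optimization steps, lr(t) = 0.5 * 5e-5 * (1 + cos(pi * t / 330)); at the last logged step (t = 329) this gives about 1.1329e-09, matching the line just above. A short check under that assumed schedule:

import math

# Assumed cosine schedule (peak 5e-5, 330 steps, no warmup) that reproduces
# the logged learning rates.
def cosine_lr(step, peak=5e-5, total_steps=330):
    return 0.5 * peak * (1 + math.cos(math.pi * step / total_steps))

print(f"{cosine_lr(329):.4e}")  # 1.1329e-09, matching the final logged learning_rate
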
[INFO|2025-06-25 12:46:05] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-330
[INFO|2025-06-25 12:46:06] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/config.json
[INFO|2025-06-25 12:46:06] configuration_utils.py:770 >> Model config LlamaConfig {
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": [
128001,
128008,
128009
],
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.52.4",
"use_cache": true,
"vocab_size": 128256
}
[INFO|2025-06-25 12:46:06] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-330/chat_template.jinja
[INFO|2025-06-25 12:46:06] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-330/tokenizer_config.json
[INFO|2025-06-25 12:46:06] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/checkpoint-330/special_tokens_map.json
[INFO|2025-06-25 12:46:07] trainer.py:2676 >>
Training completed. Do not forget to share your model on huggingface.co/models =)
[INFO|2025-06-25 12:46:07] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25
[INFO|2025-06-25 12:46:07] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/config.json
[INFO|2025-06-25 12:46:07] configuration_utils.py:770 >> Model config LlamaConfig {
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": [
128001,
128008,
128009
],
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.52.4",
"use_cache": true,
"vocab_size": 128256
}
[INFO|2025-06-25 12:46:08] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/chat_template.jinja
[INFO|2025-06-25 12:46:08] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/tokenizer_config.json
[INFO|2025-06-25 12:46:08] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25/special_tokens_map.json
[WARNING|2025-06-25 12:46:08] logging.py:148 >> No metric eval_loss to plot.
[WARNING|2025-06-25 12:46:08] logging.py:148 >> No metric eval_accuracy to plot.
[INFO|2025-06-25 12:46:08] modelcard.py:450 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
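
After the run, the adapter saved under saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25 can be attached to the base model for inference. A minimal sketch, assuming the peft adapter files were written to that directory by the final save above:

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

adapter_dir = "saves/Llama-3.1-8B-Instruct/lora/thomas_trained_2025-06-25"

# Reload the base model and attach the trained LoRA adapter; the tokenizer files
# were saved alongside the adapter, per the log lines above.
base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct", torch_dtype=torch.bfloat16
)
model = PeftModel.from_pretrained(base, adapter_dir)
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

# Optionally fold the adapter into the base weights for deployment.
merged = model.merge_and_unload()
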