robinhad committed
Commit d71ef19 · verified · 1 Parent(s): 88e4bf0

Upload folder using huggingface_hub

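The commit message indicates the folder was pushed with `huggingface_hub`. A minimal sketch of how such an upload is typically done — the local folder mirrors the config's `output_dir`, while the repo id is a placeholder, not taken from this commit:

```python
# Minimal sketch: pushing a local training output folder to the Hub.
# The folder path and repo_id below are placeholders for illustration only.
from huggingface_hub import upload_folder

upload_folder(
    folder_path="./outputs/gemma-3-12b-it-reasoning-tok-27b",  # Axolotl output_dir from the config
    repo_id="your-org/your-model-repo",                        # hypothetical target repo
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```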
README.md CHANGED
@@ -7,7 +7,7 @@ tags:
  datasets:
  - le-llm/openthoughts-113k
  model-index:
- - name: outputs/gemma-3-12b-it-reasoning-tok
+ - name: outputs/gemma-3-12b-it-reasoning-tok-27b
  results: []
  ---
 
@@ -22,7 +22,7 @@ axolotl version: `0.9.2`
  base_model: google/gemma-3-12b-it
 
  #load_in_4bit: true
-
+ #auto_resume_from_checkpoints: true
  # gemma3 doesn't seem to play nice with ddp
  ddp_find_unused_parameters: true
 
@@ -52,26 +52,21 @@ chat_template: gemma3
 
  dataset_prepared_path: last_run_prepared_reasoning
  # val_set_size: 0.01
- output_dir: ./outputs/gemma-3-12b-it-reasoning-tok
+ output_dir: ./outputs/gemma-3-12b-it-reasoning-tok-27b
 
  #adapter: qlora
  #lora_model_dir:
-
- sequence_len: 16384 # 2048
- sample_packing: false
+ sequence_len: 32768 # 16384 # 2048
+ sample_packing: false # true
  pad_to_sequence_len: true
  train_on_inputs: true
-
- #adapter: lora
- #lora_r: 32
- #lora_alpha: 16
- #lora_dropout: 0.05
- #lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
+ tensor_parallel_size: 8
+ # tiled_mlp: true
+ #context_parallel_size: 8
+ # dp_shard_size: 4
 
  plugins:
  - axolotl.integrations.liger.LigerPlugin
- #- axolotl.integrations.spectrum.SpectrumPlugin
  liger_rope: true
  liger_rms_norm: true
  liger_glu_activation: true
@@ -80,6 +75,7 @@ liger_fused_linear_cross_entropy: true
 
 
  # spectrum
+ #- axolotl.integrations.spectrum.SpectrumPlugin
  #spectrum_top_fraction: 0.5
  #spectrum_model_name: google/gemma-3-12b-it
 
@@ -89,19 +85,19 @@ wandb_watch:
  wandb_name:
  wandb_log_model:
 
- gradient_accumulation_steps: 1
- micro_batch_size: 3
+ gradient_accumulation_steps: 2
+ micro_batch_size: 2
  num_epochs: 1
  optimizer: adamw_torch_fused # muon #adamw_bnb_8bit
  lr_scheduler: warmup_stable_decay
  learning_rate: 5e-5
- lr_scheduler_kwargs: {"num_decay_steps": 100}
+ lr_scheduler_kwargs: {"num_decay_steps": 150}
 
- bf16: true
- fp16:
+ bf16: auto
+ # fp16:
  tf32: false # TODO: double check precision impact
 
- deepspeed: deepspeed_configs/zero3_bf16.json
+ deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json # deepspeed_configs/zero3_bf16.json
 
  # TODO: When using FSDP full shard, instead of using `gradient_checkpointing` in TrainingArguments, please use `activation_checkpointing` in `fsdp_config`. The former introduces a redundant AllGather operation in backward pass. Reference: https://github.com/huggingface/transformers/issues/30404
  #fsdp:
@@ -112,17 +108,39 @@ deepspeed: deepspeed_configs/zero3_bf16.json
  # fsdp_state_dict_type: FULL_STATE_DICT
  # fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
 
- gradient_checkpointing: true
- gradient_checkpointing_kwargs:
-   use_reentrant: false
+ #fp8: true
+ #fp8_enable_fsdp_float8_all_gather: true
+ #torch_compile: true
+
+ #fsdp:
+ # - full_shard
+ # - auto_wrap
+ #fsdp_config:
+ # fsdp_version: 2
+ # fsdp_offload_params: false
+ # fsdp_cpu_ram_efficient_loading: false
+ # fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+ # fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
+ # fsdp_state_dict_type: FULL_STATE_DICT
+ # fsdp_sharding_strategy: FULL_SHARD
+ # fsdp_reshard_after_forward: true
+ # # fsdp_activation_checkpointing: true
+
+ gradient_checkpointing: true # required for activation offloading
+ activation_offloading: legacy
+
+ #gradient_checkpointing: true
+ #gradient_checkpointing_kwargs:
+ # use_reentrant: false
+ #activation_offloading: true
  logging_steps: 1
  flash_attention: true # not recommended for gemma3 due to soft logit capping, but it should be fixed in the latest flash attention
  #eager_attention:
- #torch_compile: True
+ # torch_compile: True
 
 
 
- warmup_steps: 100 #0.4
+ warmup_steps: 150 #0.4
  evals_per_epoch: 1
  save_steps: 100
  save_total_limit: 6
@@ -133,7 +151,7 @@ weight_decay: 0.0
 
  </details><br>
 
- # outputs/gemma-3-12b-it-reasoning-tok
+ # outputs/gemma-3-12b-it-reasoning-tok-27b
 
  This model is a fine-tuned version of [google/gemma-3-12b-it](https://huggingface.co/google/gemma-3-12b-it) on the le-llm/openthoughts-113k dataset.
 
@@ -155,16 +173,17 @@ More information needed
 
  The following hyperparameters were used during training:
  - learning_rate: 5e-05
- - train_batch_size: 3
- - eval_batch_size: 3
+ - train_batch_size: 2
+ - eval_batch_size: 2
  - seed: 42
  - distributed_type: multi-GPU
- - num_devices: 56
- - total_train_batch_size: 168
- - total_eval_batch_size: 168
+ - num_devices: 32
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 128
+ - total_eval_batch_size: 64
  - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
  - lr_scheduler_type: warmup_stable_decay
- - lr_scheduler_warmup_steps: 100
+ - lr_scheduler_warmup_steps: 150
  - num_epochs: 1.0
 
  ### Training results
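A quick sanity check on the updated hyperparameters: the reported totals follow directly from the per-device batch size, gradient accumulation, and device count. A small sketch of the arithmetic, using only values from the card above:

```python
# Effective batch sizes implied by the updated model card.
micro_batch_size = 2             # per-device train batch size
gradient_accumulation_steps = 2
num_devices = 32

total_train_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
total_eval_batch_size = micro_batch_size * num_devices  # no accumulation at eval time

assert total_train_batch_size == 128
assert total_eval_batch_size == 64
print(total_train_batch_size, total_eval_batch_size)  # 128 64
```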
model-00001-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:93f3f63d80c6e0c8c5495546ba8eb94a098d1fd8848e8b2a7cc32d8b8cc6671d
+ oid sha256:ce40c3c774c95bca9abafbd12212577004abc4401a942a2fc7e4e44a4a3c148d
  size 4979902192
model-00002-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a3bd9e145baf953923e147b68836a1b7cc31276ac80728aa37e84aeb2cef2c56
+ oid sha256:f4bd49355a9b407b5d4b413c32b3b3572a3a28ac9e1131b483290f756e65837a
  size 4931296592
model-00003-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:60c31d998b6562c6b90fa6cb03706c047c1aef9d390c4a70e8bd6f2e54307bc3
+ oid sha256:4663cc2a72025b5f0f955902a7933c953aefd188943038c9293ba7be46bb2bd6
  size 4931296656
model-00004-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4be8159be0be8e783b50bf8e3ca3f2e0bf2619c79b75b4471f24fd8102437906
+ oid sha256:ab6135dbc381aafdc2b73a2971f0a9382279f174ec7c16d6848df6e0a3234019
  size 4931296656
model-00005-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3503c3be34c78b1c434025273afac1a3ef134f0df915d7baa5e54e23743ef831
+ oid sha256:b62aa58bdd0a1b766e3ee21c159368b00687ef682ce4aa4c44903e0b8daaac32
  size 4601000928
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:55cd9ea88d84dca67b853b40ce6bae4b7b4bd11c5cbc8d6db0a40166b7fe5e49
- size 10168
+ oid sha256:8e77e45e68ab1ccf0b68724307eeb7a0a46b5db16886ba2e5fd85fe7b802343f
+ size 10424
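For completeness, a minimal inference sketch for a checkpoint like this one. It assumes the fine-tuned weights sit at the Axolotl `output_dir` from the config (substitute the published Hub repo id if loading remotely); depending on the `transformers` version and how the Gemma 3 checkpoint was saved, the multimodal `Gemma3ForConditionalGeneration` class with `AutoProcessor` may be needed instead of `AutoModelForCausalLM`:

```python
# Minimal sketch, assuming the checkpoint is in the local output_dir named in the config;
# the path is a placeholder — replace it with the published Hub repo id if loading remotely.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./outputs/gemma-3-12b-it-reasoning-tok-27b"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, device_map="auto"
)

# Build a chat-formatted prompt and generate a short completion.
messages = [{"role": "user", "content": "Explain why the sky is blue in one sentence."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```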