robinhad committed
Commit d71ef19 · verified · 1 Parent(s): 88e4bf0

Upload folder using huggingface_hub

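The commit message indicates the folder was pushed with `huggingface_hub`. A minimal sketch of how such an upload is typically done — the local folder mirrors the config's `output_dir`, while the repo id is a placeholder, not taken from this commit:

```python
# Minimal sketch: pushing a local training output folder to the Hub.
# The folder path and repo_id below are placeholders for illustration only.
from huggingface_hub import upload_folder

upload_folder(
    folder_path="./outputs/gemma-3-12b-it-reasoning-tok-27b",  # Axolotl output_dir from the config
    repo_id="your-org/your-model-repo",                        # hypothetical target repo
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```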
README.md CHANGED
@@ -7,7 +7,7 @@ tags:
  datasets:
  - le-llm/openthoughts-113k
  model-index:
- - name: outputs/gemma-3-12b-it-reasoning-tok
+ - name: outputs/gemma-3-12b-it-reasoning-tok-27b
  results: []
  ---
 
@@ -22,7 +22,7 @@ axolotl version: `0.9.2`
  base_model: google/gemma-3-12b-it
 
  #load_in_4bit: true
-
+ #auto_resume_from_checkpoints: true
  # gemma3 doesn't seem to play nice with ddp
  ddp_find_unused_parameters: true
 
@@ -52,26 +52,21 @@ chat_template: gemma3
 
  dataset_prepared_path: last_run_prepared_reasoning
  # val_set_size: 0.01
- output_dir: ./outputs/gemma-3-12b-it-reasoning-tok
+ output_dir: ./outputs/gemma-3-12b-it-reasoning-tok-27b
 
  #adapter: qlora
  #lora_model_dir:
-
- sequence_len: 16384 # 2048
- sample_packing: false
+ sequence_len: 32768 # 16384 # 2048
+ sample_packing: false # true
  pad_to_sequence_len: true
  train_on_inputs: true
-
- #adapter: lora
- #lora_r: 32
- #lora_alpha: 16
- #lora_dropout: 0.05
- #lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
+ tensor_parallel_size: 8
+ # tiled_mlp: true
+ #context_parallel_size: 8
+ # dp_shard_size: 4
 
  plugins:
  - axolotl.integrations.liger.LigerPlugin
- #- axolotl.integrations.spectrum.SpectrumPlugin
  liger_rope: true
  liger_rms_norm: true
  liger_glu_activation: true
@@ -80,6 +75,7 @@ liger_fused_linear_cross_entropy: true
 
 
  # spectrum
+ #- axolotl.integrations.spectrum.SpectrumPlugin
  #spectrum_top_fraction: 0.5
  #spectrum_model_name: google/gemma-3-12b-it
 
@@ -89,19 +85,19 @@ wandb_watch:
  wandb_name:
  wandb_log_model:
 
- gradient_accumulation_steps: 1
- micro_batch_size: 3
+ gradient_accumulation_steps: 2
+ micro_batch_size: 2
  num_epochs: 1
  optimizer: adamw_torch_fused # muon #adamw_bnb_8bit
  lr_scheduler: warmup_stable_decay
  learning_rate: 5e-5
- lr_scheduler_kwargs: {"num_decay_steps": 100}
+ lr_scheduler_kwargs: {"num_decay_steps": 150}
 
- bf16: true
- fp16:
+ bf16: auto
+ # fp16:
  tf32: false # TODO: double check precision impact
 
- deepspeed: deepspeed_configs/zero3_bf16.json
+ deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json # deepspeed_configs/zero3_bf16.json
 
  # TODO: When using FSDP full shard, instead of using `gradient_checkpointing` in TrainingArguments, please use `activation_checkpointing` in `fsdp_config`. The former introduces a redundant AllGather operation in backward pass. Reference: https://github.com/huggingface/transformers/issues/30404
  #fsdp:
@@ -112,17 +108,39 @@ deepspeed: deepspeed_configs/zero3_bf16.json
  # fsdp_state_dict_type: FULL_STATE_DICT
  # fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
 
- gradient_checkpointing: true
- gradient_checkpointing_kwargs:
-   use_reentrant: false
+ #fp8: true
+ #fp8_enable_fsdp_float8_all_gather: true
+ #torch_compile: true
+
+ #fsdp:
+ # - full_shard
+ # - auto_wrap
+ #fsdp_config:
+ # fsdp_version: 2
+ # fsdp_offload_params: false
+ # fsdp_cpu_ram_efficient_loading: false
+ # fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+ # fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
+ # fsdp_state_dict_type: FULL_STATE_DICT
+ # fsdp_sharding_strategy: FULL_SHARD
+ # fsdp_reshard_after_forward: true
+ # # fsdp_activation_checkpointing: true
+
+ gradient_checkpointing: true # required for activation offloading
+ activation_offloading: legacy
+
+ #gradient_checkpointing: true
+ #gradient_checkpointing_kwargs:
+ # use_reentrant: false
+ #activation_offloading: true
  logging_steps: 1
  flash_attention: true # not recommended for gemma3 due to soft logit capping, but it should be fixed in the latest flash attention
  #eager_attention:
- #torch_compile: True
+ # torch_compile: True
 
 
 
- warmup_steps: 100 #0.4
+ warmup_steps: 150 #0.4
  evals_per_epoch: 1
  save_steps: 100
  save_total_limit: 6
@@ -133,7 +151,7 @@ weight_decay: 0.0
 
  </details><br>
 
- # outputs/gemma-3-12b-it-reasoning-tok
+ # outputs/gemma-3-12b-it-reasoning-tok-27b
 
  This model is a fine-tuned version of [google/gemma-3-12b-it](https://huggingface.co/google/gemma-3-12b-it) on the le-llm/openthoughts-113k dataset.
 
@@ -155,16 +173,17 @@ More information needed
 
  The following hyperparameters were used during training:
  - learning_rate: 5e-05
- - train_batch_size: 3
- - eval_batch_size: 3
+ - train_batch_size: 2
+ - eval_batch_size: 2
  - seed: 42
  - distributed_type: multi-GPU
- - num_devices: 56
- - total_train_batch_size: 168
- - total_eval_batch_size: 168
+ - num_devices: 32
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 128
+ - total_eval_batch_size: 64
  - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
  - lr_scheduler_type: warmup_stable_decay
- - lr_scheduler_warmup_steps: 100
+ - lr_scheduler_warmup_steps: 150
  - num_epochs: 1.0
 
  ### Training results
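A quick sanity check on the updated hyperparameters: the reported totals follow directly from the per-device batch size, gradient accumulation, and device count. A small sketch of the arithmetic, using only values from the card above:

```python
# Effective batch sizes implied by the updated model card.
micro_batch_size = 2             # per-device train batch size
gradient_accumulation_steps = 2
num_devices = 32

total_train_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
total_eval_batch_size = micro_batch_size * num_devices  # no accumulation at eval time

assert total_train_batch_size == 128
assert total_eval_batch_size == 64
print(total_train_batch_size, total_eval_batch_size)  # 128 64
```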
model-00001-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:93f3f63d80c6e0c8c5495546ba8eb94a098d1fd8848e8b2a7cc32d8b8cc6671d
+ oid sha256:ce40c3c774c95bca9abafbd12212577004abc4401a942a2fc7e4e44a4a3c148d
  size 4979902192
model-00002-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a3bd9e145baf953923e147b68836a1b7cc31276ac80728aa37e84aeb2cef2c56
+ oid sha256:f4bd49355a9b407b5d4b413c32b3b3572a3a28ac9e1131b483290f756e65837a
  size 4931296592
model-00003-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:60c31d998b6562c6b90fa6cb03706c047c1aef9d390c4a70e8bd6f2e54307bc3
+ oid sha256:4663cc2a72025b5f0f955902a7933c953aefd188943038c9293ba7be46bb2bd6
  size 4931296656
model-00004-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4be8159be0be8e783b50bf8e3ca3f2e0bf2619c79b75b4471f24fd8102437906
+ oid sha256:ab6135dbc381aafdc2b73a2971f0a9382279f174ec7c16d6848df6e0a3234019
  size 4931296656
model-00005-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3503c3be34c78b1c434025273afac1a3ef134f0df915d7baa5e54e23743ef831
+ oid sha256:b62aa58bdd0a1b766e3ee21c159368b00687ef682ce4aa4c44903e0b8daaac32
  size 4601000928
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:55cd9ea88d84dca67b853b40ce6bae4b7b4bd11c5cbc8d6db0a40166b7fe5e49
- size 10168
+ oid sha256:8e77e45e68ab1ccf0b68724307eeb7a0a46b5db16886ba2e5fd85fe7b802343f
+ size 10424
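For completeness, a minimal inference sketch for a checkpoint like this one. It assumes the fine-tuned weights sit at the Axolotl `output_dir` from the config (substitute the published Hub repo id if loading remotely); depending on the `transformers` version and how the Gemma 3 checkpoint was saved, the multimodal `Gemma3ForConditionalGeneration` class with `AutoProcessor` may be needed instead of `AutoModelForCausalLM`:

```python
# Minimal sketch, assuming the checkpoint is in the local output_dir named in the config;
# the path is a placeholder — replace it with the published Hub repo id if loading remotely.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./outputs/gemma-3-12b-it-reasoning-tok-27b"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, device_map="auto"
)

# Build a chat-formatted prompt and generate a short completion.
messages = [{"role": "user", "content": "Explain why the sky is blue in one sentence."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```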