Upload folder using huggingface_hub
- README.md +51 -32
- model-00001-of-00005.safetensors +1 -1
- model-00002-of-00005.safetensors +1 -1
- model-00003-of-00005.safetensors +1 -1
- model-00004-of-00005.safetensors +1 -1
- model-00005-of-00005.safetensors +1 -1
- training_args.bin +2 -2
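The commit title points at the `huggingface_hub` folder-upload API. Below is a minimal, hypothetical sketch of how a commit like this one is typically produced; the repo id and local path are placeholders, not values taken from this repository.

```python
# Hypothetical sketch: push a local training output folder to the Hub.
# repo_id and folder_path are placeholders, not from this repository.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./outputs/gemma-3-12b-it-reasoning-tok-27b",  # output_dir from the config below
    repo_id="your-org/your-model-repo",                        # placeholder repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```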
README.md
CHANGED
@@ -7,7 +7,7 @@ tags:
 datasets:
 - le-llm/openthoughts-113k
 model-index:
-- name: outputs/gemma-3-12b-it-reasoning-tok
+- name: outputs/gemma-3-12b-it-reasoning-tok-27b
   results: []
 ---
 
@@ -22,7 +22,7 @@ axolotl version: `0.9.2`
 base_model: google/gemma-3-12b-it
 
 #load_in_4bit: true
-
+#auto_resume_from_checkpoints: true
 # gemma3 doesn't seem to play nice with ddp
 ddp_find_unused_parameters: true
 
@@ -52,26 +52,21 @@ chat_template: gemma3
 
 dataset_prepared_path: last_run_prepared_reasoning
 # val_set_size: 0.01
-output_dir: ./outputs/gemma-3-12b-it-reasoning-tok
+output_dir: ./outputs/gemma-3-12b-it-reasoning-tok-27b
 
 #adapter: qlora
 #lora_model_dir:
-
-
-sample_packing: false
+sequence_len: 32768 # 16384 # 2048
+sample_packing: false # true
 pad_to_sequence_len: true
 train_on_inputs: true
-
-#
-#
-#
-#lora_dropout: 0.05
-#lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
+tensor_parallel_size: 8
+# tiled_mlp: true
+#context_parallel_size: 8
+# dp_shard_size: 4
 
 plugins:
 - axolotl.integrations.liger.LigerPlugin
-#- axolotl.integrations.spectrum.SpectrumPlugin
 liger_rope: true
 liger_rms_norm: true
 liger_glu_activation: true
@@ -80,6 +75,7 @@ liger_fused_linear_cross_entropy: true
 
 
 # spectrum
+#- axolotl.integrations.spectrum.SpectrumPlugin
 #spectrum_top_fraction: 0.5
 #spectrum_model_name: google/gemma-3-12b-it
 
@@ -89,19 +85,19 @@ wandb_watch:
 wandb_name:
 wandb_log_model:
 
-gradient_accumulation_steps:
-micro_batch_size:
+gradient_accumulation_steps: 2
+micro_batch_size: 2
 num_epochs: 1
 optimizer: adamw_torch_fused # muon #adamw_bnb_8bit
 lr_scheduler: warmup_stable_decay
 learning_rate: 5e-5
-lr_scheduler_kwargs: {"num_decay_steps":
+lr_scheduler_kwargs: {"num_decay_steps": 150}
 
-bf16:
-fp16:
+bf16: auto
+# fp16:
 tf32: false # TODO: double check precision impact
 
-deepspeed: deepspeed_configs/zero3_bf16.json
+deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json # deepspeed_configs/zero3_bf16.json
 
 # TODO: When using FSDP full shard, instead of using `gradient_checkpointing` in TrainingArguments, please use `activation_checkpointing` in `fsdp_config`. The former introduces a redundant AllGather operation in backward pass. Reference: https://github.com/huggingface/transformers/issues/30404
 #fsdp:
@@ -112,17 +108,39 @@ deepspeed: deepspeed_configs/zero3_bf16.json
 # fsdp_state_dict_type: FULL_STATE_DICT
 # fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
 
-
-
-
+#fp8: true
+#fp8_enable_fsdp_float8_all_gather: true
+#torch_compile: true
+
+#fsdp:
+# - full_shard
+# - auto_wrap
+#fsdp_config:
+# fsdp_version: 2
+# fsdp_offload_params: false
+# fsdp_cpu_ram_efficient_loading: false
+# fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+# fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer
+# fsdp_state_dict_type: FULL_STATE_DICT
+# fsdp_sharding_strategy: FULL_SHARD
+# fsdp_reshard_after_forward: true
+# # fsdp_activation_checkpointing: true
+
+gradient_checkpointing: true # required for activation offloading
+activation_offloading: legacy
+
+#gradient_checkpointing: true
+#gradient_checkpointing_kwargs:
+# use_reentrant: false
+#activation_offloading: true
 logging_steps: 1
 flash_attention: true # not recommended for gemma3 due to soft logit capping, but it should be fixed in the lates flash attention
 #eager_attention:
-#torch_compile: True
+# torch_compile: True
 
 
 
-warmup_steps:
+warmup_steps: 150 #0.4
 evals_per_epoch: 1
 save_steps: 100
 save_total_limit: 6
@@ -133,7 +151,7 @@ weight_decay: 0.0
 
 </details><br>
 
-# outputs/gemma-3-12b-it-reasoning-tok
+# outputs/gemma-3-12b-it-reasoning-tok-27b
 
 This model is a fine-tuned version of [google/gemma-3-12b-it](https://huggingface.co/google/gemma-3-12b-it) on the le-llm/openthoughts-113k dataset.
 
@@ -155,16 +173,17 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 5e-05
-- train_batch_size:
-- eval_batch_size:
+- train_batch_size: 2
+- eval_batch_size: 2
 - seed: 42
 - distributed_type: multi-GPU
-- num_devices:
--
--
+- num_devices: 32
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 128
+- total_eval_batch_size: 64
 - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: warmup_stable_decay
-- lr_scheduler_warmup_steps:
+- lr_scheduler_warmup_steps: 150
 - num_epochs: 1.0
 
 ### Training results
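The batch-size totals reported above follow directly from the per-device settings in the config: with micro_batch_size 2, gradient_accumulation_steps 2, and 32 devices, the effective train batch is 2 × 2 × 32 = 128, and the eval batch (no accumulation) is 2 × 32 = 64. A quick sanity check:

```python
# Sanity check of the effective batch sizes reported in the model card.
micro_batch_size = 2             # per-device train batch size
gradient_accumulation_steps = 2
num_devices = 32
eval_batch_size = 2              # per-device eval batch size

total_train_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
total_eval_batch_size = eval_batch_size * num_devices

assert total_train_batch_size == 128
assert total_eval_batch_size == 64
print(total_train_batch_size, total_eval_batch_size)  # 128 64
```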
model-00001-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ce40c3c774c95bca9abafbd12212577004abc4401a942a2fc7e4e44a4a3c148d
 size 4979902192
model-00002-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f4bd49355a9b407b5d4b413c32b3b3572a3a28ac9e1131b483290f756e65837a
 size 4931296592
model-00003-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4663cc2a72025b5f0f955902a7933c953aefd188943038c9293ba7be46bb2bd6
 size 4931296656
model-00004-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ab6135dbc381aafdc2b73a2971f0a9382279f174ec7c16d6848df6e0a3234019
 size 4931296656
model-00005-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b62aa58bdd0a1b766e3ee21c159368b00687ef682ce4aa4c44903e0b8daaac32
 size 4601000928
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:8e77e45e68ab1ccf0b68724307eeb7a0a46b5db16886ba2e5fd85fe7b802343f
+size 10424
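Each of the entries above is a Git LFS pointer file: the commit only rewrites the `oid sha256:` (and, for training_args.bin, the `size`) lines, while the actual tensors live in LFS storage. A minimal sketch for checking that a downloaded shard matches its pointer; the local path is a placeholder and the expected values are taken from the model-00001 pointer in this commit.

```python
# Verify a downloaded shard against the oid/size recorded in its LFS pointer.
import hashlib
import os

path = "model-00001-of-00005.safetensors"  # placeholder: local copy of the resolved file
expected_oid = "ce40c3c774c95bca9abafbd12212577004abc4401a942a2fc7e4e44a4a3c148d"
expected_size = 4979902192

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("pointer matches downloaded file")
```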