leonardlin committed 30 days ago

Commit

35d1e5e

verified ·

1 Parent(s): 493a3d8

Upload folder using huggingface_hub

Browse files

Files changed (23) hide show

.gitattributes +1 -0
README.md +278 -0
added_tokens.json +3 -0
config.json +60 -0
generation_config.json +13 -0
model-00001-of-00012.safetensors +3 -0
model-00002-of-00012.safetensors +3 -0
model-00003-of-00012.safetensors +3 -0
model-00004-of-00012.safetensors +3 -0
model-00005-of-00012.safetensors +3 -0
model-00006-of-00012.safetensors +3 -0
model-00007-of-00012.safetensors +3 -0
model-00008-of-00012.safetensors +3 -0
model-00009-of-00012.safetensors +3 -0
model-00010-of-00012.safetensors +3 -0
model-00011-of-00012.safetensors +3 -0
model-00012-of-00012.safetensors +3 -0
model.safetensors.index.json +0 -0
special_tokens_map.json +33 -0
tokenizer.json +3 -0
tokenizer.model +3 -0
tokenizer_config.json +0 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,278 @@

+---
+library_name: transformers
+license: gemma
+base_model: google/gemma-3-27b-it
+tags:
+- generated_from_trainer
+datasets:
+- shisa-ai/shisa-v2-best-of-n-athenev2-tulu70b-llama33-only-no-sysprompt
+- shisa-ai/shisa-v2-roleplaying-sft
+- shisa-ai/translation_set_april_6
+- shisa-ai/rewild-set-deepseek-subset
+- shisa-ai/magpie-ultra-set
+- shisa-ai/magpie-advanced-questions-set
+- shisa-ai/japan-magpie-set
+- shisa-ai/shisa-v2-instruction-following-sft
+model-index:
+- name: outputs/ablation-196-finalsft2-shisa-v2-gemma3-27b
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.8.0.dev0`
+```yaml
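+# train w/ shisa-ai/shisa-v1-athenev2-reannotated-filtered
+# NOTE(review): the dataset named above does not appear in the `datasets` list below - this header looks stale; confirm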
+base_model: google/gemma-3-27b-it
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+datasets:
+  - path: shisa-ai/shisa-v2-best-of-n-athenev2-tulu70b-llama33-only-no-sysprompt
+    type: chat_template
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+  - path: shisa-ai/shisa-v2-roleplaying-sft
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: role
+      content: content
+    roles:
+      system:
+        - system
+      assistant:
+        - gpt
+        - model
+        - assistant
+      user:
+        - human
+        - user
+    roles_to_train: ["assistant"]
+  - path: shisa-ai/translation_set_april_6
+    split: train[:25%]
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: role
+      content: content
+    roles:
+      system:
+        - system
+      assistant:
+        - gpt
+        - model
+        - assistant
+      user:
+        - human
+        - user
+    roles_to_train: ["assistant"]
+  - path: shisa-ai/rewild-set-deepseek-subset
+    split: train[:25%]
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: role
+      content: content
+    roles:
+      system:
+        - system
+      assistant:
+        - gpt
+        - model
+        - assistant
+      user:
+        - human
+        - user
+    roles_to_train: ["assistant"]
+  - path: shisa-ai/magpie-ultra-set
+    split: train[:8%]
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: role
+      content: content
+    roles:
+      system:
+        - system
+      assistant:
+        - gpt
+        - model
+        - assistant
+      user:
+        - human
+        - user
+    roles_to_train: ["assistant"]
+  - path: shisa-ai/magpie-advanced-questions-set
+    split: train[:8%]
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: role
+      content: content
+    roles:
+      system:
+        - system
+      assistant:
+        - gpt
+        - model
+        - assistant
+      user:
+        - human
+        - user
+    roles_to_train: ["assistant"]
+  - path: shisa-ai/japan-magpie-set
+    split: train
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: role
+      content: content
+    roles:
+      system:
+        - system
+      assistant:
+        - gpt
+        - model
+        - assistant
+      user:
+        - human
+        - user
+    roles_to_train: ["assistant"]
+  - path: shisa-ai/shisa-v2-instruction-following-sft
+    split: train[:50%]
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: role
+      content: content
+    roles:
+      system:
+        - system
+      assistant:
+        - gpt
+        - model
+        - assistant
+      user:
+        - human
+        - user
+    roles_to_train: ["assistant"]
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.05
+output_dir: ./outputs/ablation-196-finalsft2-shisa-v2-gemma3-27b
+sequence_len: 8192
+sample_packing: true
+pad_to_sequence_len: true
+# marginal difference
+neftune_noise_alpha: 5
+use_wandb: true
+wandb_project: shisa-v2
+wandb_entity: augmxnt
+wandb_name: ablation-196-finalsft2-shisa-v2-gemma3-27b
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+num_epochs: 3
+optimizer: paged_adamw_8bit
+lr_scheduler: linear
+learning_rate: 5.4e-6
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+warmup_steps: 100
+evals_per_epoch: 2
+eval_table_size:
+saves_per_epoch: 0
+save_total_limit: 1 # Only store a single checkpoint
+debug:
+deepspeed: zero3_bf16.json
+weight_decay: 1e-4
+fsdp:
+fsdp_config:
+special_tokens:
+```
+</details><br>
+# outputs/ablation-196-finalsft2-shisa-v2-gemma3-27b
+This model is a fine-tuned version of [google/gemma-3-27b-it](https://huggingface.co/google/gemma-3-27b-it) trained on the following datasets: shisa-ai/shisa-v2-best-of-n-athenev2-tulu70b-llama33-only-no-sysprompt, shisa-ai/shisa-v2-roleplaying-sft, shisa-ai/translation_set_april_6, shisa-ai/rewild-set-deepseek-subset, shisa-ai/magpie-ultra-set, shisa-ai/magpie-advanced-questions-set, shisa-ai/japan-magpie-set, and shisa-ai/shisa-v2-instruction-following-sft.
+It achieves the following results on the evaluation set:
+- Loss: 0.5417
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5.4e-06
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 64
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 128
+- total_eval_batch_size: 64
+- optimizer: paged_adamw_8bit (OptimizerNames.PAGED_ADAMW_8BIT) with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 100
+- num_epochs: 3.0
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 11.3287       | 0.0027 | 1    | 5.4644          |
+| 1.1409        | 0.4993 | 182  | 0.5644          |
+| 1.0588        | 0.9986 | 364  | 0.5344          |
+| 0.9207        | 1.4966 | 546  | 0.5322          |
+| 0.8979        | 1.9959 | 728  | 0.5245          |
+| 0.7673        | 2.4938 | 910  | 0.5432          |
+| 0.7521        | 2.9931 | 1092 | 0.5417          |
+### Framework versions
+- Transformers 4.50.0
+- Pytorch 2.6.0+cu124
+- Datasets 3.4.1
+- Tokenizers 0.21.1

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

config.json ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+  "architectures": [
+    "Gemma3ForConditionalGeneration"
+  ],
+  "boi_token_index": 255999,
+  "eoi_token_index": 256000,
+  "eos_token_id": 1,
+  "image_token_index": 262144,
+  "initializer_range": 0.02,
+  "mm_tokens_per_image": 256,
+  "model_type": "gemma3",
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_logit_softcapping": null,
+    "cache_implementation": "hybrid",
+    "final_logit_softcapping": null,
+    "head_dim": 128,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 5376,
+    "initializer_range": 0.02,
+    "intermediate_size": 21504,
+    "max_position_embeddings": 131072,
+    "model_type": "gemma3_text",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 62,
+    "num_key_value_heads": 16,
+    "query_pre_attn_scalar": 168,
+    "rms_norm_eps": 1e-06,
+    "rope_local_base_freq": 10000.0,
+    "rope_scaling": {
+      "factor": 8.0,
+      "rope_type": "linear"
+    },
+    "rope_theta": 1000000.0,
+    "sliding_window": 1024,
+    "sliding_window_pattern": 6,
+    "torch_dtype": "bfloat16",
+    "use_cache": false,
+    "vocab_size": 262208
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.50.0",
+  "use_cache": false,
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 896,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "patch_size": 14,
+    "torch_dtype": "bfloat16",
+    "vision_use_head": false
+  }
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "do_sample": true,
+  "eos_token_id": [
+    1,
+    106
+  ],
+  "pad_token_id": 0,
+  "top_k": 64,
+  "top_p": 0.95,
+  "transformers_version": "4.50.0"
+}

model-00001-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b7cfe65620decaff9d737059e37e8496249b11ecf5a80381506e516cef336cd9
+size 4854573696

model-00002-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e782dd22381a0675ffa4bedf91b121fb7af4f471c391cf5c80f769cadd35bfef
+size 4954792944

model-00003-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edc1111415f73c3acc038d8f4a50cbdb180577b7762f0a2cd8f304ad3303093a
+size 4954792976

model-00004-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eacbc025f831dad7c95da476be6e10d2db5b843295710db47771594221165356
+size 4954793016

model-00005-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f11f7c13d8f08e4c707e5ca5d5f103da0f70bde2e6953184f25b81729c937af7
+size 4954793016

model-00006-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:93336fdc39591d2e43fd1ee9d67180155f4b29028ee45629500d9d105d85c299
+size 4954793016

model-00007-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aba120fe6156ccd6f35db21283cf1b990848eaae13a5e0f21c3b9a2ae1e794d0
+size 4954793016

model-00008-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53253af4d3cd151692386e7ed23012710a7fa00cdd7e6ebcf9c7b6a7db35ec23
+size 4954793016

model-00009-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:561ef3fbe7ece9bcf5b0c62043f2a65be529ebcb6ccc51ed77a3900a8f115d98
+size 4954793016

model-00010-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:067608f9cf13317a73fd04228f65aa4d393a9650f7feddcdd1dd8ad1c7221475
+size 4954793016

model-00011-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:73c47430843465d4f024d671ce885658bd6c56a4a9cacec36f7ab6179622ae53
+size 4954793016

model-00012-of-00012.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b180613904855f75c07bb31c977ee6034c2f8bf499aedd1fa98ed51e22659385
+size 462476696

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b7bf9420411a51edd2de797f377ab25c216d62b2e8d5b778e4679622cbf313c8
+size 8632