Sanchit Gandhi committed
Commit d4e2300 · 1 Parent(s): ad7f612

Upload weights and config

config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.36.2",
+  "use_cache": true,
+  "vocab_size": 32000
+}
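For reference, this is the stock `mistralai/Mistral-7B-v0.1` configuration with `num_hidden_layers` reduced from 32 to 6. A minimal sketch, assuming standard `transformers` APIs and that the files from this commit sit in the working directory:

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Load the 6-layer Mistral config added in this commit ("./" is illustrative).
config = AutoConfig.from_pretrained("./")
assert config.model_type == "mistral" and config.num_hidden_layers == 6

# Instantiate the reduced architecture; the teacher-initialised weights live in
# the sharded safetensors files listed further down.
model = AutoModelForCausalLM.from_config(config)
```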
config_initialization.yaml ADDED
@@ -0,0 +1,4 @@
+# Model arguments
+model_name_or_path: mistralai/Mistral-7B-v0.1
+num_hidden_layers: 6
+output_dir: ./
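These fields mirror the `InitializationArguments` dataclass defined in `run_initialization.py` below. As a sketch of parsing them explicitly, using `transformers`' `HfArgumentParser.parse_yaml_file` (which the script's `H4ArgumentParser` from the alignment-handbook builds on):

```python
from transformers import HfArgumentParser

# InitializationArguments is defined in run_initialization.py in this repo;
# importing it requires the alignment-handbook package to be installed.
from run_initialization import InitializationArguments

parser = HfArgumentParser(InitializationArguments)
(init_args,) = parser.parse_yaml_file("config_initialization.yaml")
print(init_args.model_name_or_path, init_args.num_hidden_layers, init_args.output_dir)
```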
generation_config.json ADDED
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.36.2"
+}
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0a08fed380573869d72b988d38d769ef100b55471b1712967f51af3b7a0fa3e
+size 4987196936
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:324fa08cb238f0dcb0eba8f0f51b6b77e2997cc3456558a312b89bbf98ae23a6
+size 1296089984
model.safetensors.index.json ADDED
@@ -0,0 +1,64 @@
+{
+  "metadata": {
+    "total_size": 6283280384
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors"
+  }
+}
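The index maps every parameter of the 6-layer student onto one of the two shards above (about 6.3 GB in float32). A small sanity-check sketch, assuming the `safetensors` package and the shards downloaded locally:

```python
import json

from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

# Collect the tensor names actually stored in each shard file.
shard_keys = {}
for shard in set(index["weight_map"].values()):
    with safe_open(shard, framework="pt") as handle:
        shard_keys[shard] = set(handle.keys())

# Every weight named in the index should resolve to a tensor in its shard.
for weight, shard in index["weight_map"].items():
    assert weight in shard_keys[shard], f"{weight} missing from {shard}"
```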
run_initialization.py ADDED
@@ -0,0 +1,127 @@
+import copy
+import logging
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from alignment import H4ArgumentParser, ModelArguments, get_kbit_device_map, get_quantization_config
+from huggingface_hub import upload_folder
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class InitializationArguments(ModelArguments):
+    output_dir: str = field(
+        default="./checkpoint",
+        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
+    )
+    num_hidden_layers: int = field(
+        default=6,
+        metadata={"help": "The number of hidden layers in the Transformer decoder."},
+    )
+    push_to_hub: Optional[bool] = field(
+        default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
+    )
+    hub_model_id: Optional[str] = field(
+        default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
+    )
+    low_cpu_mem_usage: Optional[bool] = field(
+        default=True,
+        metadata={
+            "help": "Create the teacher model as an empty shell, and only materialize its parameters when the pretrained weights are loaded. "
+            "Significantly benefits loading time and RAM consumption."
+        },
+    )
+
+
+def main():
+    parser = H4ArgumentParser([InitializationArguments])
+    model_args = parser.parse()
+
+    logger.info(f"Model parameters {model_args}")
+
+    logger.info("*** Load pretrained teacher model ***")
+    torch_dtype = (
+        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
+    )
+    quantization_config = get_quantization_config(model_args)
+
+    model_kwargs = dict(
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        torch_dtype=torch_dtype,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+        low_cpu_mem_usage=model_args.low_cpu_mem_usage,
+    )
+
+    teacher_model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
+    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+    generation_config = teacher_model.generation_config
+    teacher_config = teacher_model.config
+
+    logger.info("*** Teacher model loaded! ***")
+
+    student_config = copy.deepcopy(teacher_config)
+    student_config.num_hidden_layers = model_args.num_hidden_layers
+    teacher_hidden_layers = teacher_config.num_hidden_layers
+
+    decoder_mapping = np.linspace(0, teacher_hidden_layers - 1, student_config.num_hidden_layers, dtype=int)
+    decoder_mapping[-1] = teacher_hidden_layers - 1
+
+    decoder_map = {}
+    for student_layer, teacher_layer in enumerate(decoder_mapping):
+        decoder_map[teacher_layer] = student_layer
+
+    # init the student params from the teacher model
+    logger.info("*** Load and initialise student model ***")
+    student_model = AutoModelForCausalLM.from_config(student_config)
+    missing_keys, unexpected_keys = student_model.load_state_dict(teacher_model.state_dict(), strict=False)
+    if len(missing_keys) > 0:
+        raise RuntimeError(
+            f"Error(s) in loading state_dict for {student_model.__class__.__name__}. \n"
+            f"Missing key(s) in state_dict: {missing_keys}"
+        )
+    if student_config.num_hidden_layers == teacher_hidden_layers:
+        decoder_keys = [key for key in unexpected_keys if "model.layers" in key]
+        if len(decoder_keys) > 0:
+            raise RuntimeError(
+                f"Error(s) in loading state_dict for {student_model.__class__.__name__}. \n"
+                f"Unexpected key(s) in state_dict: {decoder_keys}"
+            )
+
+    for layer in range(teacher_hidden_layers):
+        if layer in decoder_map:
+            # re-introduce pre-defined layers from the teacher
+            student_model.model.layers[decoder_map[layer]].load_state_dict(
+                teacher_model.model.layers[layer].state_dict()
+            )
+
+    logger.info("*** Student model loaded! ***")
+
+    # remove the teacher params and model
+    del teacher_model
+
+    # save the converted weights and model
+    if model_args.output_dir is not None:
+        student_model.save_pretrained(model_args.output_dir)
+        # we also need to correctly save the processor and generation config
+        tokenizer.save_pretrained(model_args.output_dir)
+        generation_config.save_pretrained(model_args.output_dir)
+
+    if model_args.push_to_hub:
+        repo_id = model_args.hub_model_id or model_args.output_dir
+        upload_folder(
+            repo_id=repo_id,
+            folder_path=model_args.output_dir,
+            commit_description="Uploading initialised weights and configs",
+        )
+
+
+if __name__ == "__main__":
+    main()
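The heart of the script is the layer-selection rule: teacher layers are sampled at evenly spaced indices with `np.linspace`, and the last teacher layer is always kept. A standalone illustration for the 32-layer Mistral-7B teacher and the 6-layer student in this repo:

```python
import numpy as np

teacher_hidden_layers = 32  # Mistral-7B-v0.1
student_hidden_layers = 6

# Evenly spaced indices (cast to int), with the final teacher layer forced in.
decoder_mapping = np.linspace(0, teacher_hidden_layers - 1, student_hidden_layers, dtype=int)
decoder_mapping[-1] = teacher_hidden_layers - 1

print(decoder_mapping.tolist())  # [0, 6, 12, 18, 24, 31]
```

So student layers 0-5 are initialised from teacher layers 0, 6, 12, 18, 24 and 31 respectively, while the non-layer parameters (embeddings, final norm, LM head) are copied over by the non-strict `load_state_dict` call.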
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
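The tokenizer files are saved unchanged from the teacher checkpoint by the script above. A minimal loading sketch, again assuming the files are in the working directory:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./")

# <s>, </s> and <unk> as declared in special_tokens_map.json / tokenizer_config.json.
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token)
print(len(tokenizer))  # 32000, matching vocab_size in config.json
```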