Sanchit Gandhi committed
Commit d4e2300 · 1 Parent(s): ad7f612

Upload weights and config

config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.36.2",
+  "use_cache": true,
+  "vocab_size": 32000
+}
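For reference, this is the stock `mistralai/Mistral-7B-v0.1` configuration with `num_hidden_layers` reduced from 32 to 6. A minimal sketch, assuming standard `transformers` APIs and that the files from this commit sit in the working directory:

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Load the 6-layer Mistral config added in this commit ("./" is illustrative).
config = AutoConfig.from_pretrained("./")
assert config.model_type == "mistral" and config.num_hidden_layers == 6

# Instantiate the reduced architecture; the teacher-initialised weights live in
# the sharded safetensors files listed further down.
model = AutoModelForCausalLM.from_config(config)
```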
config_initialization.yaml ADDED
@@ -0,0 +1,4 @@
+# Model arguments
+model_name_or_path: mistralai/Mistral-7B-v0.1
+num_hidden_layers: 6
+output_dir: ./
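These fields mirror the `InitializationArguments` dataclass defined in `run_initialization.py` below. As a sketch of parsing them explicitly, using `transformers`' `HfArgumentParser.parse_yaml_file` (which the script's `H4ArgumentParser` from the alignment-handbook builds on):

```python
from transformers import HfArgumentParser

# InitializationArguments is defined in run_initialization.py in this repo;
# importing it requires the alignment-handbook package to be installed.
from run_initialization import InitializationArguments

parser = HfArgumentParser(InitializationArguments)
(init_args,) = parser.parse_yaml_file("config_initialization.yaml")
print(init_args.model_name_or_path, init_args.num_hidden_layers, init_args.output_dir)
```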
generation_config.json ADDED
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.36.2"
+}
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0a08fed380573869d72b988d38d769ef100b55471b1712967f51af3b7a0fa3e
+size 4987196936
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:324fa08cb238f0dcb0eba8f0f51b6b77e2997cc3456558a312b89bbf98ae23a6
+size 1296089984
model.safetensors.index.json ADDED
@@ -0,0 +1,64 @@
+{
+  "metadata": {
+    "total_size": 6283280384
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors"
+  }
+}
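The index maps every parameter of the 6-layer student onto one of the two shards above (about 6.3 GB in float32). A small sanity-check sketch, assuming the `safetensors` package and the shards downloaded locally:

```python
import json

from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

# Collect the tensor names actually stored in each shard file.
shard_keys = {}
for shard in set(index["weight_map"].values()):
    with safe_open(shard, framework="pt") as handle:
        shard_keys[shard] = set(handle.keys())

# Every weight named in the index should resolve to a tensor in its shard.
for weight, shard in index["weight_map"].items():
    assert weight in shard_keys[shard], f"{weight} missing from {shard}"
```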
run_initialization.py ADDED
@@ -0,0 +1,127 @@
+import copy
+import logging
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from alignment import H4ArgumentParser, ModelArguments, get_kbit_device_map, get_quantization_config
+from huggingface_hub import upload_folder
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class InitializationArguments(ModelArguments):
+    output_dir: str = field(
+        default="./checkpoint",
+        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
+    )
+    num_hidden_layers: int = field(
+        default=6,
+        metadata={"help": "The number of hidden layers in the Transformer decoder."},
+    )
+    push_to_hub: Optional[bool] = field(
+        default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
+    )
+    hub_model_id: Optional[str] = field(
+        default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
+    )
+    low_cpu_mem_usage: Optional[bool] = field(
+        default=True,
+        metadata={
+            "help": "Create the teacher model as an empty shell, and only materialize its parameters when the pretrained weights are loaded. "
+            "Significantly benefits loading time and RAM consumption."
+        },
+    )
+
+
+def main():
+    parser = H4ArgumentParser([InitializationArguments])
+    model_args = parser.parse()
+
+    logger.info(f"Model parameters {model_args}")
+
+    logger.info("*** Load pretrained teacher model ***")
+    torch_dtype = (
+        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
+    )
+    quantization_config = get_quantization_config(model_args)
+
+    model_kwargs = dict(
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        torch_dtype=torch_dtype,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+        low_cpu_mem_usage=model_args.low_cpu_mem_usage,
+    )
+
+    teacher_model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
+    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+    generation_config = teacher_model.generation_config
+    teacher_config = teacher_model.config
+
+    logger.info("*** Teacher model loaded! ***")
+
+    student_config = copy.deepcopy(teacher_config)
+    student_config.num_hidden_layers = model_args.num_hidden_layers
+    teacher_hidden_layers = teacher_config.num_hidden_layers
+
+    decoder_mapping = np.linspace(0, teacher_hidden_layers - 1, student_config.num_hidden_layers, dtype=int)
+    decoder_mapping[-1] = teacher_hidden_layers - 1
+
+    decoder_map = {}
+    for student_layer, teacher_layer in enumerate(decoder_mapping):
+        decoder_map[teacher_layer] = student_layer
+
+    # init the student params from the teacher model
+    logger.info("*** Load and initialise student model ***")
+    student_model = AutoModelForCausalLM.from_config(student_config)
+    missing_keys, unexpected_keys = student_model.load_state_dict(teacher_model.state_dict(), strict=False)
+    if len(missing_keys) > 0:
+        raise RuntimeError(
+            f"Error(s) in loading state_dict for {student_model.__class__.__name__}. \n"
+            f"Missing key(s) in state_dict: {missing_keys}"
+        )
+    if student_config.num_hidden_layers == teacher_hidden_layers:
+        decoder_keys = [key for key in unexpected_keys if "model.layers" in key]
+        if len(decoder_keys) > 0:
+            raise RuntimeError(
+                f"Error(s) in loading state_dict for {student_model.__class__.__name__}. \n"
+                f"Unexpected key(s) in state_dict: {decoder_keys}"
+            )
+
+    for layer in range(teacher_hidden_layers):
+        if layer in decoder_map:
+            # re-introduce pre-defined layers from the teacher
+            student_model.model.layers[decoder_map[layer]].load_state_dict(
+                teacher_model.model.layers[layer].state_dict()
+            )
+
+    logger.info("*** Student model loaded! ***")
+
+    # remove the teacher params and model
+    del teacher_model
+
+    # save the converted weights and model
+    if model_args.output_dir is not None:
+        student_model.save_pretrained(model_args.output_dir)
+        # we also need to correctly save the processor and generation config
+        tokenizer.save_pretrained(model_args.output_dir)
+        generation_config.save_pretrained(model_args.output_dir)
+
+    if model_args.push_to_hub:
+        repo_id = model_args.hub_model_id or model_args.output_dir
+        upload_folder(
+            repo_id=repo_id,
+            folder_path=model_args.output_dir,
+            commit_description="Uploading initialised weights and configs",
+        )
+
+
+if __name__ == "__main__":
+    main()
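The heart of the script is the layer-selection rule: teacher layers are sampled at evenly spaced indices with `np.linspace`, and the last teacher layer is always kept. A standalone illustration for the 32-layer Mistral-7B teacher and the 6-layer student in this repo:

```python
import numpy as np

teacher_hidden_layers = 32  # Mistral-7B-v0.1
student_hidden_layers = 6

# Evenly spaced indices (cast to int), with the final teacher layer forced in.
decoder_mapping = np.linspace(0, teacher_hidden_layers - 1, student_hidden_layers, dtype=int)
decoder_mapping[-1] = teacher_hidden_layers - 1

print(decoder_mapping.tolist())  # [0, 6, 12, 18, 24, 31]
```

So student layers 0-5 are initialised from teacher layers 0, 6, 12, 18, 24 and 31 respectively, while the non-layer parameters (embeddings, final norm, LM head) are copied over by the non-strict `load_state_dict` call.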
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
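The tokenizer files are saved unchanged from the teacher checkpoint by the script above. A minimal loading sketch, again assuming the files are in the working directory:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./")

# <s>, </s> and <unk> as declared in special_tokens_map.json / tokenizer_config.json.
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token)
print(len(tokenizer))  # 32000, matching vocab_size in config.json
```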