farzadab committed
Commit a619ea3 · verified · parent: 7b6c9af

Update code

Files changed (1)
  1. config.json +192 -1
config.json CHANGED
@@ -1 +1,192 @@
- {"_name_or_path": "fixie-ai/ultravox-v0.2", "architectures": ["UltravoxModel"], "audio_config": {"_name_or_path": "openai/whisper-small", "activation_dropout": 0.0, "activation_function": "gelu", "apply_spec_augment": false, "architectures": ["WhisperForConditionalGeneration"], "attention_dropout": 0.0, "begin_suppress_tokens": [220, 50257], "bos_token_id": 50257, "d_model": 768, "decoder_attention_heads": 12, "decoder_ffn_dim": 3072, "decoder_layerdrop": 0.0, "decoder_layers": 12, "decoder_start_token_id": 50258, "dropout": 0.0, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072, "encoder_layerdrop": 0.0, "encoder_layers": 12, "eos_token_id": 50257, "forced_decoder_ids": [[1, 50259], [2, 50359], [3, 50363]], "init_std": 0.02, "is_encoder_decoder": true, "max_length": 448, "max_source_positions": 1500, "max_target_positions": 448, "median_filter_width": 7, "model_type": "whisper", "num_hidden_layers": 12, "num_mel_bins": 80, "pad_token_id": 50257, "scale_embedding": false, "suppress_tokens": [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], "torch_dtype": "float32", "use_cache": true, "vocab_size": 51865}, "audio_model_id": "openai/whisper-small", "audio_token_index": 128256, "auto_map": {"AutoConfig": "ultravox_config.UltravoxConfig", "AutoModel": "ultravox_model.UltravoxModel"}, "custom_pipelines": {"ultravox-pipeline": {"impl": "ultravox_pipeline.UltravoxPipeline", "pt": ["AutoModel"], "tf": [], "type": "multimodal"}}, "hidden_size": 4096, "ignore_index": -100, "initializer_range": 0.02, "model_type": "ultravox", "norm_init": 0.4, "projector_act": "swiglu", "stack_factor": 8, "text_config": {"_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", "architectures": ["LlamaForCausalLM"], "bos_token_id": 128000, "eos_token_id": 128009, "intermediate_size": 14336, "max_position_embeddings": 8192, "model_type": "llama", "num_key_value_heads": 8, "rms_norm_eps": 1e-05, "rope_theta": 500000.0, "torch_dtype": "bfloat16", "vocab_size": 128256}, "text_model_id": null, "torch_dtype": "bfloat16", "transformers_version": "4.41.1", "vocab_size": 128256}
+{
+  "_name_or_path": "fixie-ai/ultravox-v0.2",
+  "architectures": [
+    "UltravoxModel"
+  ],
+  "audio_config": {
+    "_name_or_path": "openai/whisper-small",
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "apply_spec_augment": false,
+    "architectures": [
+      "WhisperForConditionalGeneration"
+    ],
+    "attention_dropout": 0.0,
+    "begin_suppress_tokens": [
+      220,
+      50257
+    ],
+    "bos_token_id": 50257,
+    "d_model": 768,
+    "decoder_attention_heads": 12,
+    "decoder_ffn_dim": 3072,
+    "decoder_layerdrop": 0.0,
+    "decoder_layers": 12,
+    "decoder_start_token_id": 50258,
+    "dropout": 0.0,
+    "encoder_attention_heads": 12,
+    "encoder_ffn_dim": 3072,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 12,
+    "eos_token_id": 50257,
+    "forced_decoder_ids": [
+      [
+        1,
+        50259
+      ],
+      [
+        2,
+        50359
+      ],
+      [
+        3,
+        50363
+      ]
+    ],
+    "init_std": 0.02,
+    "is_encoder_decoder": true,
+    "max_length": 448,
+    "max_source_positions": 1500,
+    "max_target_positions": 448,
+    "median_filter_width": 7,
+    "model_type": "whisper",
+    "num_hidden_layers": 12,
+    "num_mel_bins": 80,
+    "pad_token_id": 50257,
+    "scale_embedding": false,
+    "suppress_tokens": [
+      1,
+      2,
+      7,
+      8,
+      9,
+      10,
+      14,
+      25,
+      26,
+      27,
+      28,
+      29,
+      31,
+      58,
+      59,
+      60,
+      61,
+      62,
+      63,
+      90,
+      91,
+      92,
+      93,
+      359,
+      503,
+      522,
+      542,
+      873,
+      893,
+      902,
+      918,
+      922,
+      931,
+      1350,
+      1853,
+      1982,
+      2460,
+      2627,
+      3246,
+      3253,
+      3268,
+      3536,
+      3846,
+      3961,
+      4183,
+      4667,
+      6585,
+      6647,
+      7273,
+      9061,
+      9383,
+      10428,
+      10929,
+      11938,
+      12033,
+      12331,
+      12562,
+      13793,
+      14157,
+      14635,
+      15265,
+      15618,
+      16553,
+      16604,
+      18362,
+      18956,
+      20075,
+      21675,
+      22520,
+      26130,
+      26161,
+      26435,
+      28279,
+      29464,
+      31650,
+      32302,
+      32470,
+      36865,
+      42863,
+      47425,
+      49870,
+      50254,
+      50258,
+      50360,
+      50361,
+      50362
+    ],
+    "torch_dtype": "float32",
+    "use_cache": true,
+    "vocab_size": 51865
+  },
+  "audio_model_id": "openai/whisper-small",
+  "audio_token_index": 128256,
+  "auto_map": {
+    "AutoConfig": "ultravox_config.UltravoxConfig",
+    "AutoModel": "ultravox_model.UltravoxModel"
+  },
+  "custom_pipelines": {
+    "ultravox-pipeline": {
+      "impl": "ultravox_pipeline.UltravoxPipeline",
+      "pt": [
+        "AutoModel"
+      ],
+      "tf": [],
+      "type": "multimodal"
+    }
+  },
+  "hidden_size": 4096,
+  "ignore_index": -100,
+  "initializer_range": 0.02,
+  "model_type": "ultravox",
+  "norm_init": 0.4,
+  "projector_act": "swiglu",
+  "stack_factor": 8,
+  "text_config": {
+    "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "architectures": [
+      "LlamaForCausalLM"
+    ],
+    "bos_token_id": 128000,
+    "eos_token_id": 128009,
+    "intermediate_size": 14336,
+    "max_position_embeddings": 8192,
+    "model_type": "llama",
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-05,
+    "rope_theta": 500000.0,
+    "torch_dtype": "bfloat16",
+    "vocab_size": 128256
+  },
+  "text_model_id": null,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.41.1",
+  "vocab_size": 128256
+}
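
The `auto_map` and `custom_pipelines` entries register the `UltravoxConfig`/`UltravoxModel` classes and the `ultravox-pipeline` task from Python files shipped in the repo rather than from the transformers library, so loading goes through the remote-code path. A minimal usage sketch, assuming the repo ships `ultravox_config.py`, `ultravox_model.py`, and `ultravox_pipeline.py` alongside this config:

```python
import transformers

# trust_remote_code=True is required because the config's auto_map points
# at classes defined in the repo's own Python files, not in transformers.
# The repo id below is taken from "_name_or_path" in the config.
pipe = transformers.pipeline(
    model="fixie-ai/ultravox-v0.2",
    trust_remote_code=True,
)
```

With only one entry under `custom_pipelines`, `pipeline()` can resolve the `ultravox-pipeline` task from the config without an explicit task argument.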