sujitvasanth committed
Commit 3e9b5d8 · verified · 1 Parent(s): 695a6a1

Upload 15 files

QUANTISED FP16 BNB version (approx. 8 GB)
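
Usage sketch: the checkpoint ships its own config, model and tokenizer code (see auto_map in config.json below), so it loads through AutoModelForCausalLM with trust_remote_code=True; the quantization settings are embedded in the checkpoint, so no extra BitsAndBytesConfig is needed. The repo id below is a placeholder, substitute the actual repo or a local path.

    # Minimal loading sketch; "sujitvasanth/OpenCUA-7B-bnb-4bit" is a placeholder repo id.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo = "sujitvasanth/OpenCUA-7B-bnb-4bit"  # placeholder; replace with the real repo or local path
    tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        torch_dtype=torch.float16,  # matches bnb_4bit_compute_dtype in config.json
        device_map="auto",
        trust_remote_code=True,     # pulls in configuration_opencua.py / modeling_opencua.py / tokenization_opencua.py
    )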

args.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "path_to_vm": "/home/sujit/VirtualBox VMs/ubuntu24.04/ubuntu24.04.vbox",
+   "headless": true,
+   "action_space": "pyautogui",
+   "observation_type": "screenshot",
+   "sleep_after_execution": 3.0,
+   "max_steps": 50,
+   "test_config_base_dir": "evaluation_examples",
+   "model": "/home/sujit/OpenCUA-7B",
+   "temperature": 0,
+   "top_p": 0.9,
+   "max_tokens": 2048,
+   "stop_token": null,
+   "cot_level": "l2",
+   "history_type": "action_history",
+   "coordinate_type": "qwen25",
+   "max_image_history_length": 3,
+   "domain": "all",
+   "test_all_meta_path": "evaluation_examples/test_small.json",
+   "result_dir": "./results",
+   "num_envs": 1,
+   "log_level": "INFO",
+   "region": "us-east-1",
+   "provider_name": "virtualbox",
+   "client_password": "password",
+   "screen_width": 1920,
+   "screen_height": 1080
+ }
bnb_snapshot.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "already_quantized": true
+ }
config.json ADDED
@@ -0,0 +1,260 @@
+ {
+   "vision_config": {
+     "return_dict": true,
+     "output_hidden_states": false,
+     "output_attentions": false,
+     "torchscript": false,
+     "torch_dtype": null,
+     "use_bfloat16": false,
+     "tf_legacy_loss": false,
+     "pruned_heads": {},
+     "tie_word_embeddings": true,
+     "chunk_size_feed_forward": 0,
+     "is_encoder_decoder": false,
+     "is_decoder": false,
+     "cross_attention_hidden_size": null,
+     "add_cross_attention": false,
+     "tie_encoder_decoder": false,
+     "max_length": 20,
+     "min_length": 0,
+     "do_sample": false,
+     "early_stopping": false,
+     "num_beams": 1,
+     "num_beam_groups": 1,
+     "diversity_penalty": 0.0,
+     "temperature": 1.0,
+     "top_k": 50,
+     "top_p": 1.0,
+     "typical_p": 1.0,
+     "repetition_penalty": 1.0,
+     "length_penalty": 1.0,
+     "no_repeat_ngram_size": 0,
+     "encoder_no_repeat_ngram_size": 0,
+     "bad_words_ids": null,
+     "num_return_sequences": 1,
+     "output_scores": false,
+     "return_dict_in_generate": false,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "remove_invalid_values": false,
+     "exponential_decay_length_penalty": null,
+     "suppress_tokens": null,
+     "begin_suppress_tokens": null,
+     "architectures": null,
+     "finetuning_task": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "tokenizer_class": null,
+     "prefix": null,
+     "bos_token_id": null,
+     "pad_token_id": null,
+     "eos_token_id": null,
+     "sep_token_id": null,
+     "decoder_start_token_id": null,
+     "task_specific_params": null,
+     "problem_type": null,
+     "_name_or_path": "",
+     "_attn_implementation_autoset": false,
+     "in_chans": 3,
+     "spatial_patch_size": 14,
+     "depth": 32,
+     "hidden_size": 1280,
+     "hidden_act": "silu",
+     "intermediate_size": 3420,
+     "num_heads": 16,
+     "in_channels": 3,
+     "patch_size": 14,
+     "spatial_merge_size": 2,
+     "temporal_patch_size": 2,
+     "tokens_per_second": 2,
+     "window_size": 112,
+     "fullatt_block_indexes": [
+       7,
+       15,
+       23,
+       31
+     ],
+     "out_hidden_size": 3584,
+     "model_type": "qwen2_5_vl"
+   },
+   "text_config": {
+     "vocab_size": 152064,
+     "max_position_embeddings": 32768,
+     "hidden_size": 3584,
+     "intermediate_size": 18944,
+     "num_hidden_layers": 28,
+     "num_attention_heads": 28,
+     "use_sliding_window": false,
+     "sliding_window": 4096,
+     "max_window_layers": 28,
+     "num_key_value_heads": 4,
+     "hidden_act": "silu",
+     "initializer_range": 0.02,
+     "rms_norm_eps": 1e-05,
+     "use_cache": true,
+     "rope_theta": 1000000.0,
+     "rope_scaling": null,
+     "attention_dropout": 0.0,
+     "return_dict": true,
+     "output_hidden_states": false,
+     "output_attentions": false,
+     "torchscript": false,
+     "torch_dtype": "bfloat16",
+     "use_bfloat16": false,
+     "tf_legacy_loss": false,
+     "pruned_heads": {},
+     "tie_word_embeddings": false,
+     "chunk_size_feed_forward": 0,
+     "is_encoder_decoder": false,
+     "is_decoder": false,
+     "cross_attention_hidden_size": null,
+     "add_cross_attention": false,
+     "tie_encoder_decoder": false,
+     "max_length": 20,
+     "min_length": 0,
+     "do_sample": false,
+     "early_stopping": false,
+     "num_beams": 1,
+     "num_beam_groups": 1,
+     "diversity_penalty": 0.0,
+     "temperature": 1.0,
+     "top_k": 50,
+     "top_p": 1.0,
+     "typical_p": 1.0,
+     "repetition_penalty": 1.0,
+     "length_penalty": 1.0,
+     "no_repeat_ngram_size": 0,
+     "encoder_no_repeat_ngram_size": 0,
+     "bad_words_ids": null,
+     "num_return_sequences": 1,
+     "output_scores": false,
+     "return_dict_in_generate": false,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "remove_invalid_values": false,
+     "exponential_decay_length_penalty": null,
+     "suppress_tokens": null,
+     "begin_suppress_tokens": null,
+     "architectures": null,
+     "finetuning_task": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "tokenizer_class": null,
+     "prefix": null,
+     "bos_token_id": 151643,
+     "pad_token_id": 152063,
+     "eos_token_id": 151644,
+     "sep_token_id": null,
+     "decoder_start_token_id": null,
+     "task_specific_params": null,
+     "problem_type": null,
+     "_name_or_path": "",
+     "_attn_implementation_autoset": false,
+     "head_dim": 128,
+     "k_proj_bias": true,
+     "model_type": "qwen2",
+     "pretraining_sequence_length": 128000,
+     "q_proj_bias": true,
+     "v_proj_bias": true
+   },
+   "ignore_index": -100,
+   "media_placeholder_token_id": 151664,
+   "return_dict": true,
+   "output_hidden_states": false,
+   "output_attentions": false,
+   "torchscript": false,
+   "torch_dtype": "bfloat16",
+   "use_bfloat16": false,
+   "tf_legacy_loss": false,
+   "pruned_heads": {},
+   "tie_word_embeddings": false,
+   "chunk_size_feed_forward": 0,
+   "is_encoder_decoder": false,
+   "is_decoder": false,
+   "cross_attention_hidden_size": null,
+   "add_cross_attention": false,
+   "tie_encoder_decoder": false,
+   "max_length": 20,
+   "min_length": 0,
+   "do_sample": false,
+   "early_stopping": false,
+   "num_beams": 1,
+   "num_beam_groups": 1,
+   "diversity_penalty": 0.0,
+   "temperature": 1.0,
+   "top_k": 50,
+   "top_p": 1.0,
+   "typical_p": 1.0,
+   "repetition_penalty": 1.0,
+   "length_penalty": 1.0,
+   "no_repeat_ngram_size": 0,
+   "encoder_no_repeat_ngram_size": 0,
+   "bad_words_ids": null,
+   "num_return_sequences": 1,
+   "output_scores": false,
+   "return_dict_in_generate": false,
+   "forced_bos_token_id": null,
+   "forced_eos_token_id": null,
+   "remove_invalid_values": false,
+   "exponential_decay_length_penalty": null,
+   "suppress_tokens": null,
+   "begin_suppress_tokens": null,
+   "architectures": [
+     "OpenCUAForConditionalGeneration"
+   ],
+   "finetuning_task": null,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1"
+   },
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1
+   },
+   "tokenizer_class": null,
+   "prefix": null,
+   "bos_token_id": null,
+   "pad_token_id": 0,
+   "eos_token_id": null,
+   "sep_token_id": null,
+   "decoder_start_token_id": null,
+   "task_specific_params": null,
+   "problem_type": null,
+   "_name_or_path": "/home/sujit/OpenCUA-7B",
+   "_attn_implementation_autoset": false,
+   "transformers_version": "4.49.0",
+   "auto_map": {
+     "AutoConfig": "configuration_opencua.OpenCUAConfig",
+     "AutoModel": "modeling_opencua.OpenCUAForConditionalGeneration",
+     "AutoModelForCausalLM": "modeling_opencua.OpenCUAForConditionalGeneration"
+   },
+   "model_type": "opencua",
+   "vocab_size": 152064,
+   "quantization_config": {
+     "_load_in_4bit": true,
+     "_load_in_8bit": false,
+     "bnb_4bit_compute_dtype": "float16",
+     "bnb_4bit_quant_storage": "uint8",
+     "bnb_4bit_quant_type": "nf4",
+     "bnb_4bit_use_double_quant": true,
+     "llm_int8_enable_fp32_cpu_offload": false,
+     "llm_int8_has_fp16_weight": false,
+     "llm_int8_skip_modules": null,
+     "llm_int8_threshold": 6.0,
+     "load_in_4bit": true,
+     "load_in_8bit": false,
+     "quant_method": "bitsandbytes"
+   }
+ }
configuration_opencua.py ADDED
@@ -0,0 +1,38 @@
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+
+
+ class OpenCUAConfig(PretrainedConfig):
+     """OpenCUA-2.5-7B model configuration.
+
+     Args:
+         vision_config: Configuration for the vision model (Qwen2_5_VLVisionConfig).
+         text_config: Configuration for the text model (Qwen2Config).
+         pad_token_id: The token ID to use for padding.
+     """
+
+     model_type = "opencua"
+
+     def __init__(
+         self,
+         vision_config: dict | Qwen2_5_VLVisionConfig | None = None,
+         text_config: dict | Qwen2Config | None = None,
+         ignore_index: int = -100,
+         media_placeholder_token_id: int = 151664,
+         pad_token_id: int = 0,
+         **kwargs
+     ):
+         if isinstance(vision_config, dict):
+             vision_config = Qwen2_5_VLVisionConfig(**vision_config)
+         self.vision_config = vision_config
+
+         if isinstance(text_config, dict):
+             text_config = Qwen2Config(**text_config)
+         self.text_config = text_config
+
+         self.ignore_index = ignore_index
+         self.media_placeholder_token_id = media_placeholder_token_id
+
+         super().__init__(pad_token_id=pad_token_id, **kwargs)
+
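
A minimal sketch of how this config class composes its two sub-configs; the dict values below are illustrative placeholders, not the shipped settings.

    # Illustrative only: build an OpenCUAConfig from plain dicts, as __init__ above allows.
    from configuration_opencua import OpenCUAConfig

    cfg = OpenCUAConfig(
        vision_config={"hidden_size": 1280, "out_hidden_size": 3584},  # wrapped into Qwen2_5_VLVisionConfig
        text_config={"hidden_size": 3584, "num_hidden_layers": 28},    # wrapped into Qwen2Config
    )
    print(type(cfg.vision_config).__name__, cfg.media_placeholder_token_id)  # Qwen2_5_VLVisionConfig 151664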
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_attn_implementation": "sdpa",
+   "attn_implementation": "sdpa",
+   "eos_token_id": 151644,
+   "max_length": 32768,
+   "transformers_version": "4.49.0"
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1bcbdd077a2e97bf65a6beaff1fee1327aec3665b6a5b8fc68131bfe2ab9cb0
+ size 4809638813
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9be0baa3e20f5a2bcf0f0dee07c9586336a48fcacb297944fc5e449bec9ba3d
+ size 1089994896
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_opencua.py ADDED
@@ -0,0 +1,449 @@
+ # ------------------------------------------------------------------------------
+ # OpenCUA‑7B Model
+ #
+ # This implementation is adapted from the Qwen2‑VL reference code in
+ # Hugging Face Transformers v4.53.0:
+ # https://github.com/huggingface/transformers/tree/v4.53.0/src/transformers/models/qwen2_5_vl
+ #
+ # Checkpoint used for weight initialisation:
+ # "Qwen/Qwen2.5-VL-7B-Instruct" – https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
+ #
+ # Key modifications
+ # -----------------
+ # • Replaced Multimodal Rotary Position Embedding (M‑RoPE) with 1‑D RoPE for
+ #   compatibility with OpenCUA training settings.
+ # • Wrapped vision encoder and language model into a single
+ #   `OpenCUAForConditionalGeneration` class.
+ # • Simplified weight initialisation — this file targets inference / fine‑tuning,
+ #   not training from scratch.
+ #
+ # Copyright (c) 2025 XLANG Lab, The University of Hong Kong
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the “Software”), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+ #
+ # ------------------------------------------------------------------------------
+ # Prohibited Uses & Additional Disclaimer
+ # ---------------------------------------
+ # • The Software may **not** be used for any purpose or activity that violates
+ #   applicable laws or regulations in any jurisdiction.
+ # • The authors, contributors, and copyright holders are **not responsible**
+ #   for any illegal, unethical, or harmful use of the Software, nor for any
+ #   direct or indirect damages resulting from such use.
+ # • Use of the “OpenCUA” name, logo, or trademarks does **not** imply any
+ #   endorsement or affiliation unless a separate written permission is obtained.
+
+ import torch
+ import torch.nn as nn
+ from transformers.cache_utils import Cache
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
+
+ from .configuration_opencua import OpenCUAConfig
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VisionTransformerPretrainedModel
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM
+
+
+ class OpenCUAPreTrainedModel(PreTrainedModel):
+     config_class = OpenCUAConfig
+     base_model_prefix = "model"
+     _no_split_modules = ["Qwen2_5_VisionTransformerPretrainedModel"]
+     _skip_keys_device_placement = "past_key_values"
+     _supports_flash_attn_2 = True
+
+     def _init_weights(self, module):
+         # important: this ported version of Llava isn't meant for training from scratch - only
+         # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
+         # https://github.com/haotian-liu/LLaVA/tree/main/llava should serve for that purpose
+         std = (
+             self.config.initializer_range
+             if hasattr(self.config, "initializer_range")
+             else self.config.text_config.initializer_range
+         )
+
+         if hasattr(module, "class_embedding"):
+             module.class_embedding.data.normal_(mean=0.0, std=std)
+
+         if isinstance(module, (nn.Linear, nn.Conv2d)):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+
+     @property
+     def _supports_sdpa(self):
+         """
+         Retrieve language_model's attribute to check whether the model supports
+         SDPA or not.
+         """
+         return self.language_model._supports_sdpa
+
+
+ class OpenCUAForConditionalGeneration(OpenCUAPreTrainedModel):
+
+     def __init__(self, config: OpenCUAConfig):
+         super().__init__(config)
+         self.vision_tower = Qwen2_5_VisionTransformerPretrainedModel(config.vision_config)
+         self.language_model = Qwen2ForCausalLM(config.text_config)
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.language_model.get_input_embeddings()
+
+     def set_input_embeddings(self, value):
+         self.language_model.set_input_embeddings(value)
+
+     def get_output_embeddings(self):
+         return self.language_model.get_output_embeddings()
+
+     def set_output_embeddings(self, new_embeddings):
+         self.language_model.set_output_embeddings(new_embeddings)
+
+     def set_decoder(self, decoder):
+         self.language_model.set_decoder(decoder)
+
+     def get_decoder(self):
+         return self.language_model.get_decoder()
+
+     def tie_weights(self):
+         return self.language_model.tie_weights()
+
+     def resize_token_embeddings(self, new_num_tokens: int | None = None, pad_to_multiple_of=None) -> nn.Embedding:
+         model_embeds = self.language_model.resize_token_embeddings(
+             new_num_tokens, pad_to_multiple_of)
+         # update vocab size
+         self.config.text_config.vocab_size = model_embeds.num_embeddings
+         self.vocab_size = model_embeds.num_embeddings
+         return model_embeds
+
+     def _merge_input_ids_with_image_features(
+             self,
+             image_features: torch.Tensor,
+             feature_lengths: list[int],
+             inputs_embeds: torch.Tensor,
+             input_ids: torch.Tensor,
+             attention_mask: torch.Tensor,
+             labels: torch.Tensor | None = None):
+         """
+         Args:
+             image_features (:obj:`torch.Tensor` of shape :obj:`(num_image_tokens, embed_dim)`):
+                 The image features to merge with the input embeddings.
+             feature_lengths: the length of each image feature.
+             inputs_embeds (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length, embed_dim)`):
+                 The input embeddings.
+             input_ids (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+                 The input ids.
+             attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+                 The attention mask.
+             labels (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, *optional*):
+                 The labels.
+         """
+
+         image_token_index: int = self.config.media_placeholder_token_id
+         pad_token_id: int = self.config.pad_token_id
+         ignore_index: int = self.config.ignore_index
+
+         _, embed_dim = image_features.shape
+
+         batch_size, sequence_length = input_ids.shape
+         left_padding = not torch.sum(
+             input_ids[:, -1] == torch.tensor(pad_token_id))
+
+         # 1. Create a mask to know where special image tokens are
+         _token_occupation_table = torch.ones_like(input_ids.flatten())
+         _token_occupation_table[input_ids.flatten() == image_token_index] = \
+             torch.tensor(feature_lengths,
+                          dtype=torch.long, device=input_ids.device)
+         _token_occupation_table = _token_occupation_table.reshape(
+             input_ids.shape)
+
+         max_embed_dim = _token_occupation_table.sum(-1).max().item()
+         assert max_embed_dim >= sequence_length, (
+             f"The maximum embedding dimension ({max_embed_dim}) is less than the sequence length ({sequence_length})"
+         )
+         batch_indices, non_image_indices = torch.where(input_ids != image_token_index)
+
+         # 2. Compute the positions where text should be written
+         # Calculate new positions for text tokens in merged image-text sequence.
+         new_token_positions = torch.cumsum(_token_occupation_table, -1) - 1
+         nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
+         if left_padding:
+             new_token_positions += nb_image_pad[:, None]  # offset for left padding
+         text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
+
+         # 3. Create the full embedding, already padded to the maximum position
+         final_embedding = torch.zeros(
+             batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+         )
+         final_attention_mask = torch.zeros(
+             batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
+         )
+         if labels is not None:
+             final_labels = torch.full(
+                 (batch_size, max_embed_dim), ignore_index, dtype=input_ids.dtype, device=input_ids.device
+             )
+         # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
+         # set the corresponding tensors into their correct target device.
+         target_device = inputs_embeds.device
+         batch_indices, non_image_indices, text_to_overwrite = (
+             batch_indices.to(target_device),
+             non_image_indices.to(target_device),
+             text_to_overwrite.to(target_device),
+         )
+         attention_mask = attention_mask.to(target_device)
+
+         # 4. Fill the embeddings based on the mask.
+         final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
+         final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
+         if labels is not None:
+             final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]
+
+         # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
+         image_to_overwrite = torch.full(
+             (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
+         )
+         image_to_overwrite[batch_indices, text_to_overwrite] = False
+         image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+
+         if image_to_overwrite.sum() != image_features.shape[:-1].numel():
+             raise ValueError(
+                 f"The inputs provided to the model are wrong. The number of image tokens is {image_to_overwrite.sum()} while"
+                 f" the number of image features given to the model is {image_features.shape[:-1].numel()}. "
+                 "This prevents correct indexing and breaks batch generation."
+             )
+
+         final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+         final_attention_mask |= image_to_overwrite
+         position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
+
+         # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
+         batch_indices, pad_indices = torch.where(input_ids == pad_token_id)
+         indices_to_mask = new_token_positions[batch_indices, pad_indices]
+
+         final_embedding[batch_indices, indices_to_mask] = 0
+
+         if labels is None:
+             final_labels = None
+
+         return final_embedding, final_attention_mask, final_labels, position_ids
+
+     def _extract_image_features(self,
+                                 pixel_values: torch.FloatTensor | list[torch.FloatTensor],
+                                 grid_thws: torch.FloatTensor,
+                                 ):
+         """
+         Args:
+             pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(sum_num_image_tokens, channels)`):
+                 The pixel values of the images processed by the image processor.
+             grid_thws: (B, 3)
+
+         Returns:
+             selected_image_feature (:obj:`torch.FloatTensor` of shape :obj:`(num_image_tokens, embed_dim)`):
+                 The selected image features to use as input to the projector head.
+
+         """
+
+         assert len(grid_thws.shape) == 2 and grid_thws.shape[1] == 3, f"grid_thws must be a 2D tensor with shape (batched, 3), but got {grid_thws.shape}"
+         if isinstance(pixel_values, list):
+             pixel_values = torch.cat(pixel_values, dim=0)
+         image_features_ = self.vision_tower(pixel_values, grid_thw=grid_thws)
+         image_features_list = []
+         start_idx = 0
+         for i, grid_thw in enumerate(grid_thws):
+             end_idx = start_idx + (grid_thw[0] * grid_thw[1] * grid_thw[2]) // 4
+             image_features_list.append(image_features_[start_idx:end_idx, :])
+             start_idx = end_idx
+
+         selected_image_feature = torch.cat(image_features_list, dim=0)
+         feature_lengths = [x.size(0) for x in image_features_list]
+         return selected_image_feature, feature_lengths
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor | None = None,
+         pixel_values: torch.FloatTensor | list[torch.FloatTensor] | None = None,
+         grid_thws: torch.Tensor = None,
+         attention_mask: torch.Tensor | None = None,
+         position_ids: torch.LongTensor | None = None,
+         past_key_values: list[torch.FloatTensor] | None = None,
+         inputs_embeds: torch.FloatTensor | None = None,
+         labels: torch.LongTensor | None = None,
+         use_cache: bool | None = None,
+         output_attentions: bool | None = None,
+         output_hidden_states: bool | None = None,
+         return_dict: bool | None = None,
+     ) -> tuple | LlavaCausalLMOutputWithPast:
+         r"""
+         Args:
+             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                 (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+         """
+
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         if inputs_embeds is None:
+             # 1. Extract the input embeddings
+             inputs_embeds = self.get_input_embeddings()(input_ids)
+             # 2. Merge text and images
+             if pixel_values is not None and len(pixel_values) > 0 and input_ids.shape[1] != 1:
+                 image_feature, feature_lengths = self._extract_image_features(
+                     pixel_values, grid_thws)
+
+                 inputs_embeds = inputs_embeds.to(image_feature.dtype)  # num_tokens, embed_dim
+                 inputs_embeds, attention_mask, labels, position_ids = \
+                     self._merge_input_ids_with_image_features(
+                         image_feature, feature_lengths, inputs_embeds, input_ids, attention_mask, labels
+                     )
+             # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+             # generation with cache
+             elif past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                 # Retrieve the first layer to inspect the logits and mask out the hidden states
+                 # that are set to 0
+                 first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
+
+                 # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                 batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
+
+                 # Get the target length
+                 target_length = input_ids.shape[1]
+                 past_length = first_layer_past_key_value.shape[-1]
+
+                 extended_attention_mask = torch.ones(
+                     (attention_mask.shape[0], past_length),
+                     dtype=attention_mask.dtype,
+                     device=attention_mask.device,
+                 )
+
+                 # Filter out only the tokens that can be un-attended, this can happen
+                 # if one uses Llava + Fused modules where the cache on the
+                 # first iteration is already big enough, or if one passes custom cache
+                 valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
+                 new_batch_index = batch_index[valid_indices]
+                 new_non_attended_tokens = non_attended_tokens[valid_indices]
+
+                 # Zero-out the places where we don't need to attend
+                 extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
+
+                 attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
+                 position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+
+         outputs = self.language_model(
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         logits = outputs[0]
+
+         loss = None
+         if labels is not None:
+             # Shift so that tokens < n predict n
+             if attention_mask is not None:
+                 shift_attention_mask = attention_mask[..., 1:]
+                 shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
+                 shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
+             else:
+                 shift_logits = logits[..., :-1, :].contiguous()
+                 shift_labels = labels[..., 1:].contiguous()
+             # Flatten the tokens
+             loss_fct = nn.CrossEntropyLoss()
+             loss = loss_fct(
+                 shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
+             )
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return (loss,) + output if loss is not None else output
+
+         return LlavaCausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     def prepare_inputs_for_generation(
+         self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, grid_thws=None, attention_mask=None, **kwargs
+     ):
+         if past_key_values is not None:
+             if isinstance(past_key_values, Cache):
+                 cache_length = past_key_values.get_seq_length()
+                 past_length = past_key_values.seen_tokens
+             else:
+                 cache_length = past_length = past_key_values[0][0].shape[2]
+
+             # Keep only the unprocessed tokens:
+             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+             #     some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+             #     input)
+             if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                 input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
+             # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+             #     input_ids based on the past_length.
+             elif past_length < input_ids.shape[1]:
+                 input_ids = input_ids[:, past_length:]
+             # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+             elif self.config.media_placeholder_token_id in input_ids:
+                 input_ids = input_ids[:, input_ids.shape[1] - 1:]
+             # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
+             # older attention values, as their corresponding values are not part of the input.
+             if cache_length < past_length and attention_mask is not None:
+                 attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]):]
+
+         position_ids = kwargs.get("position_ids", None)
+         if attention_mask is not None and position_ids is None:
+             # create position_ids on the fly for batch generation
+             position_ids = attention_mask.long().cumsum(-1) - 1
+             position_ids.masked_fill_(attention_mask == 0, 1)
+             if past_key_values:
+                 position_ids = position_ids[:, -input_ids.shape[1]:]
+
+         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+         if inputs_embeds is not None and past_key_values is None:
+             model_inputs = {"inputs_embeds": inputs_embeds}
+         else:
+             model_inputs = {"input_ids": input_ids}
+
+         model_inputs.update(
+             {
+                 "position_ids": position_ids,
+                 "past_key_values": past_key_values,
+                 "use_cache": kwargs.get("use_cache"),
+                 "attention_mask": attention_mask,
+                 "pixel_values": pixel_values,
+                 "grid_thws": grid_thws,
+             }
+         )
+         return model_inputs
+
+     def _reorder_cache(self, *args, **kwargs):
+         return self.language_model._reorder_cache(*args, **kwargs)
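
A small worked example of the per-image token accounting used in _extract_image_features above: each image contributes (t * h * w) // 4 vision features, because 2 x 2 spatial patches are merged (spatial_merge_size = 2 in config.json). The grid below is illustrative, not taken from the checkpoint.

    # Illustrative grid_thw: 1 temporal step, 80 x 120 patches of 14 px (roughly a 1120 x 1680 image).
    t, h, w = 1, 80, 120
    feature_length = (t * h * w) // 4
    print(feature_length)  # 2400 vision features replace one <|media_placeholder|> token in the sequence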
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "min_pixels": 3136,
+   "max_pixels": 12845056,
+   "patch_size": 14,
+   "temporal_patch_size": 2,
+   "merge_size": 2,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "image_processor_type": "Qwen2VLImageProcessor"
+ }
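
The preprocessor is the stock Qwen2VLImageProcessor; min_pixels/max_pixels bound the resized area (3136 = 56 x 56, 12845056 = 3584 x 3584). A hedged loading sketch, reusing the placeholder repo id from above:

    from PIL import Image
    from transformers import AutoImageProcessor

    processor = AutoImageProcessor.from_pretrained("sujitvasanth/OpenCUA-7B-bnb-4bit")  # placeholder id
    batch = processor(images=Image.open("screenshot.png"), return_tensors="pt")
    print(batch["pixel_values"].shape, batch["image_grid_thw"])  # flattened patches and the (t, h, w) grid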
quantization_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "quant_method": "bitsandbytes",
+   "_load_in_8bit": false,
+   "_load_in_4bit": true,
+   "llm_int8_threshold": 6.0,
+   "llm_int8_skip_modules": null,
+   "llm_int8_enable_fp32_cpu_offload": false,
+   "llm_int8_has_fp16_weight": false,
+   "bnb_4bit_quant_type": "nf4",
+   "bnb_4bit_use_double_quant": true,
+   "bnb_4bit_compute_dtype": "fp16",
+   "bnb_4bit_quant_storage": "uint8",
+   "load_in_4bit": true,
+   "load_in_8bit": false,
+   "is_pre_quantized": true
+ }
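
For reference, a roughly equivalent BitsAndBytesConfig; it is not needed to load this repo, since the weights are already quantized and the settings above are read from the checkpoint.

    import torch
    from transformers import BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,  # "fp16" above
    )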
special_tokens_map.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "additional_special_tokens": [
+     "<|im_end|>",
+     "<|im_user|>",
+     "<|im_assistant|>",
+     "<|reserved_token_0|>",
+     "<|start_header_id|>",
+     "<|end_header_id|>",
+     "<|reserved_token_1|>",
+     "[EOT]",
+     "<|im_system|>",
+     "<|reserved_token_2|>",
+     "<|reserved_token_3|>",
+     "<|reserved_token_4|>",
+     "<|reserved_token_5|>",
+     "<|reserved_token_6|>",
+     "<|reserved_token_7|>",
+     "<|im_middle|>",
+     "<|media_begin|>",
+     "<|media_content|>",
+     "<|media_end|>",
+     "<|media_placeholder|>"
+   ],
+   "bos_token": {
+     "content": "[BOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[EOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tiktoken.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2b1b8dfb5cc5f024bafc373121c6aba3f66f9a5a0269e243470a1de16a33186
+ size 2561218
tokenization_opencua.py ADDED
@@ -0,0 +1,367 @@
+ import os
+ import tiktoken
+
+ from logging import getLogger
+ from pathlib import Path
+ from typing import (
+     cast,
+     Tuple,
+     Dict,
+     Iterator,
+     List,
+     Union,
+     Optional,
+ )
+ from shutil import copyfile
+ from tiktoken.load import load_tiktoken_bpe
+ from tokenizers import AddedToken
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+
+
+
+ logger = getLogger(__name__)
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
+
+ class TikTokenTokenizer(PreTrainedTokenizer):
+     """
+     Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
+
+     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+     this superclass for more information regarding those methods.
+
+     Args:
+         vocab_file (`str`):
+             The path to the Tiktoken model file.
+         bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>"`):
+             The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+         eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`):
+             The end of sequence token.
+         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead. The second to last item in special_tokens.
+         pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_250|>"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         additional_special_tokens (list of `str`, *optional*):
+             A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
+             skipped when decoding if `skip_special_tokens` is set to `True`.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+
+     model_input_names = ["input_ids", "attention_mask"]
+
+     special_tokens: Dict[str, int]
+
+     num_reserved_special_tokens = 256
+
+     pat_str = "|".join(
+         [
+             r"""[\p{Han}]+""",
+             r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+             r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+             r"""\p{N}{1,3}""",
+             r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
+             r"""\s*[\r\n]+""",
+             r"""\s+(?!\S)""",
+             r"""\s+""",
+         ]
+     )
+
+     def __init__(
+         self,
+         vocab_file,
+         bos_token: Union[str, AddedToken]="[BOS]",
+         eos_token: Union[str, AddedToken]="[EOS]",
+         unk_token: Union[str, AddedToken, None]=None,
+         pad_token: Union[str, AddedToken, None]=None,
+         additional_special_tokens: List[str]=None,
+         added_tokens_decoder: Optional[dict] = None,
+         **kwargs,
+     ):
+         assert os.path.isfile(vocab_file), vocab_file
+
+         if additional_special_tokens is None:
+             # dumping mode
+             used_special_tokens = [
+                 "<|im_end|>",
+                 "<|im_user|>",
+                 "<|im_assistant|>",
+                 "<|reserved_token_0|>",
+                 "<|start_header_id|>",
+                 "<|end_header_id|>",
+                 "<|reserved_token_1|>",
+                 "[EOT]",
+                 "<|im_system|>",
+                 "<|reserved_token_2|>",
+                 "<|reserved_token_3|>",
+                 "<|reserved_token_4|>",
+                 "<|reserved_token_5|>",
+                 "<|reserved_token_6|>",
+                 "<|reserved_token_7|>",
+                 "<|im_middle|>",
+                 "<|media_begin|>",
+                 "<|media_content|>",
+                 "<|media_end|>",
+                 "<|media_placeholder|>",
+             ]
+             used_reserved_tokens = 8
+             last_reserved_token_id = self.num_reserved_special_tokens - 4 - len(used_special_tokens) + used_reserved_tokens - 1
+             additional_special_tokens = used_special_tokens + [
+                 f"<|reserved_token_{i}|>"
+                 for i in range(used_reserved_tokens, last_reserved_token_id + 1)
+             ]
+             # num_reserved_special_tokens = additional_special_tokens + BOS + EOS + unk_token + pad_token
+             assert len(additional_special_tokens) + 4 == self.num_reserved_special_tokens, f"additional_special_tokens num: {len(additional_special_tokens)} is not correct"
+             # we assume that the instance is under initialization and unk_token and pad_token should be automatically inferred
+             if unk_token is not None:
+                 raise ValueError("unk_token should not be set in dumping mode when additional_special_tokens is None")
+             if pad_token is not None:
+                 raise ValueError("pad_token should not be set in dumping mode when additional_special_tokens is None")
+             # last two reserved tokens
+             unk_token = f"[UNK]"
+             pad_token = f"[PAD]"
+
+             logger.info(f"adding unk_token: {unk_token} and pad_token: {pad_token}")
+             self.additional_special_tokens = additional_special_tokens
+             special_tokens = [str(bos_token), str(eos_token)] + additional_special_tokens + [str(unk_token), str(pad_token)]
+
+             self.vocab_file = vocab_file
+             mergeable_ranks = load_tiktoken_bpe(vocab_file)
+             num_base_tokens = len(mergeable_ranks)
+             self.special_tokens = {
+                 token: num_base_tokens + i for i, token in enumerate(special_tokens)
+             }
+         else:
+             self.additional_special_tokens = additional_special_tokens
+             special_tokens_mapping = {
+                 i: added_tokens_decoder[i].content for i in added_tokens_decoder
+             }
+
+             self.vocab_file = vocab_file
+             mergeable_ranks = load_tiktoken_bpe(vocab_file)
+             num_base_tokens = len(mergeable_ranks)
+             self.special_tokens = {
+                 special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
+                 for i in range(
+                     num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
+                 )
+             }
+
+
+
+         self.model = tiktoken.Encoding(
+             name=Path(vocab_file).name,
+             pat_str=self.pat_str,
+             mergeable_ranks=mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         logger.info(f"Reloaded tiktoken model from {vocab_file}")
+
+         self.n_words: int = self.model.n_vocab
+         # BOS / EOS token IDs
+         self.bos_id: int = self.special_tokens[str(bos_token)]
+         self.eos_id: int = self.special_tokens[str(eos_token)]
+
+         logger.info(
+             f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
+         )
+
+         self.pad_id: int = self.special_tokens[str(pad_token)]
+         self.unk_id: int = self.special_tokens[str(unk_token)]
+         self.byte_encoder = bytes_to_unicode()
+         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+
+         self.decoder = {}
+         for i in range(self.n_words):
+             # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
+             decoding = ''.join([
+                 self.byte_encoder[ord(char)] for char in
+                 self.model.decode_single_token_bytes(i).decode('latin-1')
+             ])
+             self.decoder[i] = decoding
+
+         self.encoder = {}
+         for i in range(self.n_words):
+             if i in self.decoder:
+                 self.encoder[self.decoder[i]] = i
+
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             additional_special_tokens=self.additional_special_tokens,
+             **kwargs,
+         )
+         self.all_special_ids_set = set(self.all_special_ids)
+
+     def encode(
+         self,
+         text: str,
+         allow_special_tokens = True,
+         **kwargs
+     ) -> List[int]:
+         """
+         Encodes a string into a list of token IDs.
+
+         Args:
+             text (str): The input string to be encoded.
+
+         Returns:
+             list[int]: A list of token IDs.
+         """
+         # If there are other args, we should call super().encode because there is a lot of code
+         # to handle those args. super().encode finally calls _tokenize and _convert_token_to_id.
+         # NOTE: our encode method is not compatible with the super().encode method,
+         # e.g. split_special_tokens' default is True in our encode method.
+         if len(kwargs) > 0:
+             logger.warning(f"Calling super().encode with {kwargs}")
+             return super().encode(text, **kwargs)
+
+         assert type(text) is str
+
+         # The tiktoken tokenizer can handle <=400k chars without
+         # pyo3_runtime.PanicException.
+         TIKTOKEN_MAX_ENCODE_CHARS = 400_000
+
+         # https://github.com/openai/tiktoken/issues/195
+         # Here we iterate over subsequences and split if we exceed the limit
+         # of max consecutive non-whitespace or whitespace characters.
+         MAX_NO_WHITESPACES_CHARS = 25_000
+
+         texts = self.pre_tokenizer_process(text)
+
+         all_substrs = []
+         for text in texts:
+             substrs = (
+                 substr
+                 for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
+                 for substr in self._split_whitespaces_or_nonwhitespaces(
+                     text[i: i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
+                 )
+             )
+             all_substrs.extend(substrs)
+
+         t: List[int] = []
+         for substr in all_substrs:
+             if allow_special_tokens:
+                 t.extend(
+                     self.model.encode(
+                         substr,
+                         allowed_special="all",
+                     )
+                 )
+             else:
+                 t.extend(
+                     self.model.encode(
+                         substr,
+                         disallowed_special=(),
+                     )
+                 )
+
+         return t
+
+     def decode(
+         self,
+         token_ids: Union[int, List[int]],
+         **kwargs
+     ) -> str:
+         """
+         Decodes a list of token IDs into a string.
+
+         Args:
+             token_ids (List[int]): The list of token IDs to be decoded.
+
+         Returns:
+             str: The decoded string.
+         """
+         # If there are other args, we should call super().decode because there is a lot of code
+         # to handle those args. super().decode finally calls convert_tokens_to_string and _convert_id_to_token.
+         if len(kwargs) > 0:
+             return super().decode(token_ids, **kwargs)
+
+         if type(token_ids) is int:
+             token_ids = [token_ids]
+
+         return self.model.decode(cast(List[int], token_ids))
+
+     @staticmethod
+     def _split_whitespaces_or_nonwhitespaces(
+         s: str, max_consecutive_slice_len: int
+     ) -> Iterator[str]:
+         """
+         Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+         consecutive whitespaces or consecutive non-whitespaces.
+         """
+         current_slice_len = 0
+         current_slice_is_space = s[0].isspace() if len(s) > 0 else False
+         slice_start = 0
+
+         for i in range(len(s)):
+             is_now_space = s[i].isspace()
+
+             if current_slice_is_space ^ is_now_space:
+                 current_slice_len = 1
+                 current_slice_is_space = is_now_space
+             else:
+                 current_slice_len += 1
+                 if current_slice_len > max_consecutive_slice_len:
+                     yield s[slice_start:i]
+                     slice_start = i
+                     current_slice_len = 1
+         yield s[slice_start:]
+
+     def pre_tokenizer_process(self, text: str) -> List[str]:
+         """
+         Pre-tokenizes the input text into a list of chunks.
+         This method is used to split the input text into smaller chunks for internal processing.
+         """
+         return [text]
+
+
+     """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
+     @property
+     def vocab_size(self) -> int:
+         return self.n_words
+
+     def get_vocab(self) -> Dict[str, int]:
+         return self.encoder
+
+     def _tokenize(self, text: str, **kwargs) -> List[str]:
+         return [
+             self.decoder[t]
+             for t in self.encode(text)
+         ]
+
+     def _convert_token_to_id(self, token: str) -> int:
+         return self.encoder.get(token, self.unk_id)
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self.decoder.get(index)
+
+     @staticmethod
+     def clean_up_tokenization(out_string: str) -> str:
+         return out_string
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         text = ''.join(tokens)
+         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', 'replace')
+         return text
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             raise ValueError(f"vocabulary path ({save_directory}) should be a directory")
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+
+         return (out_vocab_file,)
+
+
+ class TikTokenV3(TikTokenTokenizer):
+     num_reserved_special_tokens = 293 + 128
+     pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
tokenizer_config.json ADDED
@@ -0,0 +1,234 @@
+ {
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|im_user|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|im_assistant|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|reserved_token_0|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|start_header_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|end_header_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|reserved_token_1|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "[EOT]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|im_system|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|reserved_token_2|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|reserved_token_3|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|reserved_token_4|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<|reserved_token_5|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151658": {
+       "content": "<|reserved_token_6|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151659": {
+       "content": "<|reserved_token_7|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151660": {
+       "content": "<|im_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151661": {
+       "content": "<|media_begin|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151662": {
+       "content": "<|media_content|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151663": {
+       "content": "<|media_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151664": {
+       "content": "<|media_placeholder|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "152062": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "152063": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+
+   },
+   "additional_special_tokens": [
+     "<|im_end|>",
+     "<|im_user|>",
+     "<|im_assistant|>",
+     "<|reserved_token_0|>",
+     "<|start_header_id|>",
+     "<|end_header_id|>",
+     "<|reserved_token_1|>",
+     "[EOT]",
+     "<|im_system|>",
+     "<|reserved_token_2|>",
+     "<|reserved_token_3|>",
+     "<|reserved_token_4|>",
+     "<|reserved_token_5|>",
+     "<|reserved_token_6|>",
+     "<|reserved_token_7|>",
+     "<|im_middle|>",
+     "<|media_begin|>",
+     "<|media_content|>",
+     "<|media_end|>",
+     "<|media_placeholder|>"
+   ],
+   "bos_token": "[BOS]",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "[EOS]",
+   "extra_special_tokens": {},
+   "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_begin|>image<|media_content|><|media_placeholder|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "tokenizer_class": "TikTokenV3",
+   "unk_token": "[UNK]",
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_opencua.TikTokenV3",
+       null
+     ]
+   }
+ }
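
A sketch of rendering the chat template above (the repo id is the same placeholder as earlier): image entries in a content list become the <|media_begin|>image<|media_content|><|media_placeholder|><|media_end|> block, text entries are passed through, and add_generation_prompt appends the assistant header.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("sujitvasanth/OpenCUA-7B-bnb-4bit", trust_remote_code=True)  # placeholder id
    messages = [
        {"role": "user", "content": [
            {"type": "image", "image": "screenshot.png"},
            {"type": "text", "text": "Click the Firefox icon."},
        ]},
    ]
    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    print(prompt)  # starts with the default system turn, ends with <|im_assistant|>assistant<|im_middle|>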