shilinxu committed on
Commit
ad92440
·
verified ·
1 Parent(s): b5f73a9

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -2,12 +2,17 @@
2
  "architectures": [
3
  "Qwen2VisionTransformerPretrainedModel"
4
  ],
 
 
 
 
5
  "depth": 32,
6
  "embed_dim": 1280,
7
  "hidden_act": "quick_gelu",
8
  "hidden_size": 1536,
9
  "in_channels": 3,
10
  "in_chans": 3,
 
11
  "mlp_ratio": 4,
12
  "model_type": "qwen2_vl",
13
  "num_heads": 16,
@@ -16,5 +21,5 @@
16
  "spatial_patch_size": 14,
17
  "temporal_patch_size": 2,
18
  "torch_dtype": "bfloat16",
19
- "transformers_version": "4.49.0"
20
  }
 
2
  "architectures": [
3
  "Qwen2VisionTransformerPretrainedModel"
4
  ],
5
+ "auto_map": {
6
+ "AutoModel": "modeling_qwen2_vl.Qwen2VisionTransformerPretrainedModel",
7
+ "AutoConfig": "configuration_qwen2_vl.Qwen2VLVisionConfig"
8
+ },
9
  "depth": 32,
10
  "embed_dim": 1280,
11
  "hidden_act": "quick_gelu",
12
  "hidden_size": 1536,
13
  "in_channels": 3,
14
  "in_chans": 3,
15
+ "initializer_range": 0.02,
16
  "mlp_ratio": 4,
17
  "model_type": "qwen2_vl",
18
  "num_heads": 16,
 
21
  "spatial_patch_size": 14,
22
  "temporal_patch_size": 2,
23
  "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.52.1"
25
  }
configuration_qwen2_vl.py CHANGED
@@ -38,6 +38,7 @@ class Qwen2VLVisionConfig(PretrainedConfig):
38
  patch_size=14,
39
  spatial_merge_size=2,
40
  temporal_patch_size=2,
 
41
  **kwargs,
42
  ):
43
  super().__init__(**kwargs)
@@ -52,196 +53,5 @@ class Qwen2VLVisionConfig(PretrainedConfig):
52
  self.patch_size = patch_size
53
  self.spatial_merge_size = spatial_merge_size
54
  self.temporal_patch_size = temporal_patch_size
55
-
56
-
57
- class Qwen2VLConfig(PretrainedConfig):
58
- r"""
59
- This is the configuration class to store the configuration of a [`Qwen2VLModel`]. It is used to instantiate a
60
- Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
61
- with the defaults will yield a similar configuration to that of
62
- Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
63
-
64
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
65
- documentation from [`PretrainedConfig`] for more information.
66
-
67
-
68
- Args:
69
- vocab_size (`int`, *optional*, defaults to 152064):
70
- Vocabulary size of the Qwen2VL model. Defines the number of different tokens that can be represented by the
71
- `inputs_ids` passed when calling [`Qwen2VLModel`]
72
- hidden_size (`int`, *optional*, defaults to 8192):
73
- Dimension of the hidden representations.
74
- intermediate_size (`int`, *optional*, defaults to 29568):
75
- Dimension of the MLP representations.
76
- num_hidden_layers (`int`, *optional*, defaults to 80):
77
- Number of hidden layers in the Transformer encoder.
78
- num_attention_heads (`int`, *optional*, defaults to 64):
79
- Number of attention heads for each attention layer in the Transformer encoder.
80
- num_key_value_heads (`int`, *optional*, defaults to 8):
81
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
82
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
83
- `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
84
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
85
- by meanpooling all the original heads within that group. For more details checkout [this
86
- paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
87
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
88
- The non-linear activation function (function or string) in the decoder.
89
- max_position_embeddings (`int`, *optional*, defaults to 32768):
90
- The maximum sequence length that this model might ever be used with.
91
- initializer_range (`float`, *optional*, defaults to 0.02):
92
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
93
- rms_norm_eps (`float`, *optional*, defaults to 1e-05):
94
- The epsilon used by the rms normalization layers.
95
- use_cache (`bool`, *optional*, defaults to `True`):
96
- Whether or not the model should return the last key/values attentions (not used by all models). Only
97
- relevant if `config.is_decoder=True`.
98
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
99
- Whether the model's input and output word embeddings should be tied.
100
- rope_theta (`float`, *optional*, defaults to 1000000.0):
101
- The base period of the RoPE embeddings.
102
- use_sliding_window (`bool`, *optional*, defaults to `False`):
103
- Whether to use sliding window attention.
104
- sliding_window (`int`, *optional*, defaults to 4096):
105
- Sliding window attention (SWA) window size. If not specified, will default to `4096`.
106
- max_window_layers (`int`, *optional*, defaults to 80):
107
- The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
108
- attention_dropout (`float`, *optional*, defaults to 0.0):
109
- The dropout ratio for the attention probabilities.
110
- vision_config (`Dict`, *optional*):
111
- The config for the visual encoder initialization.
112
- rope_scaling (`Dict`, *optional*):
113
- Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
114
- and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
115
- accordingly.
116
- Expected contents:
117
- `rope_type` (`str`):
118
- The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
119
- 'llama3'], with 'default' being the original RoPE implementation.
120
- `factor` (`float`, *optional*):
121
- Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
122
- most scaling types, a `factor` of x will enable the model to handle sequences of length x *
123
- original maximum pre-trained length.
124
- `original_max_position_embeddings` (`int`, *optional*):
125
- Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
126
- pretraining.
127
- `attention_factor` (`float`, *optional*):
128
- Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
129
- computation. If unspecified, it defaults to value recommended by the implementation, using the
130
- `factor` field to infer the suggested value.
131
- `beta_fast` (`float`, *optional*):
132
- Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
133
- ramp function. If unspecified, it defaults to 32.
134
- `beta_slow` (`float`, *optional*):
135
- Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
136
- ramp function. If unspecified, it defaults to 1.
137
- `short_factor` (`List[float]`, *optional*):
138
- Only used with 'longrope'. The scaling factor to be applied to short contexts (<
139
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
140
- size divided by the number of attention heads divided by 2
141
- `long_factor` (`List[float]`, *optional*):
142
- Only used with 'longrope'. The scaling factor to be applied to long contexts (<
143
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
144
- size divided by the number of attention heads divided by 2
145
- `low_freq_factor` (`float`, *optional*):
146
- Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
147
- `high_freq_factor` (`float`, *optional*):
148
- Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
149
-
150
- ```python
151
- >>> from transformers import Qwen2VLForConditionalGeneration, Qwen2VLConfig
152
-
153
- >>> # Initializing a Qwen2VL style configuration
154
- >>> configuration = Qwen2VLConfig()
155
-
156
- >>> # Initializing a model from the Qwen2-VL-7B style configuration
157
- >>> model = Qwen2VLForConditionalGeneration(configuration)
158
-
159
- >>> # Accessing the model configuration
160
- >>> configuration = model.config
161
- ```"""
162
-
163
- model_type = "qwen2_vl"
164
- sub_configs = {"vision_config": Qwen2VLVisionConfig}
165
- keys_to_ignore_at_inference = ["past_key_values"]
166
- # Default tensor parallel plan for base model `Qwen2VL`
167
- base_model_tp_plan = {
168
- "layers.*.self_attn.q_proj": "colwise",
169
- "layers.*.self_attn.k_proj": "colwise",
170
- "layers.*.self_attn.v_proj": "colwise",
171
- "layers.*.self_attn.o_proj": "rowwise",
172
- "layers.*.mlp.gate_proj": "colwise",
173
- "layers.*.mlp.up_proj": "colwise",
174
- "layers.*.mlp.down_proj": "rowwise",
175
- }
176
- base_model_pp_plan = {
177
- "embed_tokens": (["input_ids"], ["inputs_embeds"]),
178
- "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
179
- "norm": (["hidden_states"], ["hidden_states"]),
180
- }
181
-
182
- def __init__(
183
- self,
184
- vocab_size=152064,
185
- hidden_size=8192,
186
- intermediate_size=29568,
187
- num_hidden_layers=80,
188
- num_attention_heads=64,
189
- num_key_value_heads=8,
190
- hidden_act="silu",
191
- max_position_embeddings=32768,
192
- initializer_range=0.02,
193
- rms_norm_eps=1e-05,
194
- use_cache=True,
195
- tie_word_embeddings=False,
196
- rope_theta=1000000.0,
197
- use_sliding_window=False,
198
- sliding_window=4096,
199
- max_window_layers=80,
200
- attention_dropout=0.0,
201
- vision_config=None,
202
- rope_scaling=None,
203
- **kwargs,
204
- ):
205
- if isinstance(vision_config, dict):
206
- self.vision_config = self.sub_configs["vision_config"](**vision_config)
207
- elif vision_config is None:
208
- self.vision_config = self.sub_configs["vision_config"]()
209
-
210
- self.vocab_size = vocab_size
211
- self.max_position_embeddings = max_position_embeddings
212
- self.hidden_size = hidden_size
213
- self.intermediate_size = intermediate_size
214
- self.num_hidden_layers = num_hidden_layers
215
- self.num_attention_heads = num_attention_heads
216
- self.use_sliding_window = use_sliding_window
217
- self.sliding_window = sliding_window
218
- self.max_window_layers = max_window_layers
219
-
220
- # for backward compatibility
221
- if num_key_value_heads is None:
222
- num_key_value_heads = num_attention_heads
223
-
224
- self.num_key_value_heads = num_key_value_heads
225
- self.hidden_act = hidden_act
226
  self.initializer_range = initializer_range
227
- self.rms_norm_eps = rms_norm_eps
228
- self.use_cache = use_cache
229
- self.rope_theta = rope_theta
230
- self.attention_dropout = attention_dropout
231
- self.rope_scaling = rope_scaling
232
-
233
- # Validate the correctness of rotary position embeddings parameters
234
- # BC: if there is a 'type' field, move it to 'rope_type'.
235
- # and change type from 'mrope' to 'default' because `mrope` does defeault RoPE calculations
236
- # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
237
- # TODO: @raushan update config in the hub
238
- if self.rope_scaling is not None and "type" in self.rope_scaling:
239
- if self.rope_scaling["type"] == "mrope":
240
- self.rope_scaling["type"] = "default"
241
- self.rope_scaling["rope_type"] = self.rope_scaling["type"]
242
- rope_config_validation(self, ignore_keys={"mrope_section"})
243
-
244
- super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
245
-
246
 
247
- __all__ = ["Qwen2VLConfig"]
 
38
  patch_size=14,
39
  spatial_merge_size=2,
40
  temporal_patch_size=2,
41
+ initializer_range=0.02,
42
  **kwargs,
43
  ):
44
  super().__init__(**kwargs)
 
53
  self.patch_size = patch_size
54
  self.spatial_merge_size = spatial_merge_size
55
  self.temporal_patch_size = temporal_patch_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  self.initializer_range = initializer_range
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
 
image_processing_qwen2_vl.py CHANGED
@@ -24,78 +24,34 @@ from typing import Dict, List, Optional, Union
24
 
25
  import numpy as np
26
 
27
- from ...image_processing_utils import BaseImageProcessor, BatchFeature
28
- from ...image_transforms import (
29
  convert_to_rgb,
30
  resize,
31
  to_channel_dimension_format,
32
  )
33
- from ...image_utils import (
34
  OPENAI_CLIP_MEAN,
35
  OPENAI_CLIP_STD,
36
  ChannelDimension,
37
  ImageInput,
38
  PILImageResampling,
39
- VideoInput,
40
  get_image_size,
41
  infer_channel_dimension_format,
42
  is_scaled_image,
43
- is_valid_image,
44
  make_list_of_images,
45
  to_numpy_array,
46
  valid_images,
47
  validate_preprocess_arguments,
48
  )
49
- from ...utils import TensorType, is_vision_available, logging
 
50
 
51
 
52
  logger = logging.get_logger(__name__)
53
 
54
 
55
- if is_vision_available():
56
- from PIL import Image
57
-
58
-
59
- def make_batched_images(images) -> List[List[ImageInput]]:
60
- """
61
- Accepts images in list or nested list format, and makes a list of images for preprocessing.
62
-
63
- Args:
64
- images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
65
- The input image.
66
-
67
- Returns:
68
- list: A list of images.
69
- """
70
- if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
71
- return [img for img_list in images for img in img_list]
72
-
73
- elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
74
- return images
75
-
76
- elif is_valid_image(images):
77
- return [images]
78
-
79
- raise ValueError(f"Could not make batched images from {images}")
80
-
81
-
82
- # Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
83
- def make_batched_videos(videos) -> List[VideoInput]:
84
- if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
85
- return videos
86
-
87
- elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
88
- if isinstance(videos[0], Image.Image):
89
- return [videos]
90
- elif len(videos[0].shape) == 4:
91
- return [list(video) for video in videos]
92
-
93
- elif is_valid_image(videos) and len(videos.shape) == 4:
94
- return [list(videos)]
95
-
96
- raise ValueError(f"Could not make batched video from {videos}")
97
-
98
-
99
  def smart_resize(
100
  height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
101
  ):
@@ -109,7 +65,7 @@ def smart_resize(
109
 
110
  """
111
  if height < factor or width < factor:
112
- raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
113
  elif max(height, width) / min(height, width) > 200:
114
  raise ValueError(
115
  f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
@@ -134,6 +90,8 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
134
  Args:
135
  do_resize (`bool`, *optional*, defaults to `True`):
136
  Whether to resize the image's (height, width) dimensions.
 
 
137
  resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
138
  Resampling filter to use when resizing the image.
139
  do_rescale (`bool`, *optional*, defaults to `True`):
@@ -153,7 +111,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
153
  max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
154
  The max pixels of the image to resize the image.
155
  patch_size (`int`, *optional*, defaults to 14):
156
- The spacial patch size of the vision encoder.
157
  temporal_patch_size (`int`, *optional*, defaults to 2):
158
  The temporal patch size of the vision encoder.
159
  merge_size (`int`, *optional*, defaults to 2):
@@ -165,6 +123,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
165
  def __init__(
166
  self,
167
  do_resize: bool = True,
 
168
  resample: PILImageResampling = PILImageResampling.BICUBIC,
169
  do_rescale: bool = True,
170
  rescale_factor: Union[int, float] = 1 / 255,
@@ -172,14 +131,27 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
172
  image_mean: Optional[Union[float, List[float]]] = None,
173
  image_std: Optional[Union[float, List[float]]] = None,
174
  do_convert_rgb: bool = True,
175
- min_pixels: int = 56 * 56,
176
- max_pixels: int = 28 * 28 * 1280,
177
  patch_size: int = 14,
178
  temporal_patch_size: int = 2,
179
  merge_size: int = 2,
180
  **kwargs,
181
  ) -> None:
182
  super().__init__(**kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  self.do_resize = do_resize
184
  self.resample = resample
185
  self.do_rescale = do_rescale
@@ -187,25 +159,27 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
187
  self.do_normalize = do_normalize
188
  self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
189
  self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
190
- self.min_pixels = min_pixels
191
- self.max_pixels = max_pixels
192
  self.patch_size = patch_size
193
  self.temporal_patch_size = temporal_patch_size
194
  self.merge_size = merge_size
195
- self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
196
  self.do_convert_rgb = do_convert_rgb
197
 
198
  def _preprocess(
199
  self,
200
  images: Union[ImageInput, VideoInput],
201
- do_resize: bool = None,
 
202
  resample: PILImageResampling = None,
203
- do_rescale: bool = None,
204
- rescale_factor: float = None,
205
- do_normalize: bool = None,
206
  image_mean: Optional[Union[float, List[float]]] = None,
207
  image_std: Optional[Union[float, List[float]]] = None,
208
- do_convert_rgb: bool = None,
 
 
 
209
  data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
210
  input_data_format: Optional[Union[str, ChannelDimension]] = None,
211
  ):
@@ -219,6 +193,8 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
219
  Optional list of dictionaries containing additional information about vision inputs.
220
  do_resize (`bool`, *optional*, defaults to `self.do_resize`):
221
  Whether to resize the image.
 
 
222
  resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
223
  Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
224
  do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
@@ -231,6 +207,12 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
231
  Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
232
  image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
233
  Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
 
 
 
 
 
 
234
  do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
235
  Whether to convert the image to RGB.
236
  data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
@@ -269,9 +251,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
269
  resized_height, resized_width = smart_resize(
270
  height,
271
  width,
272
- factor=self.patch_size * self.merge_size,
273
- min_pixels=self.min_pixels,
274
- max_pixels=self.max_pixels,
275
  )
276
  image = resize(
277
  image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
@@ -291,26 +273,28 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
291
  patches = np.array(processed_images)
292
  if data_format == ChannelDimension.LAST:
293
  patches = patches.transpose(0, 3, 1, 2)
294
- if patches.shape[0] % self.temporal_patch_size != 0:
295
- repeats = np.repeat(patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0)
 
 
296
  patches = np.concatenate([patches, repeats], axis=0)
297
  channel = patches.shape[1]
298
- grid_t = patches.shape[0] // self.temporal_patch_size
299
- grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
300
  patches = patches.reshape(
301
  grid_t,
302
- self.temporal_patch_size,
303
  channel,
304
- grid_h // self.merge_size,
305
- self.merge_size,
306
- self.patch_size,
307
- grid_w // self.merge_size,
308
- self.merge_size,
309
- self.patch_size,
310
  )
311
  patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
312
  flatten_patches = patches.reshape(
313
- grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
314
  )
315
 
316
  return flatten_patches, (grid_t, grid_h, grid_w)
@@ -319,15 +303,20 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
319
  self,
320
  images: ImageInput,
321
  videos: VideoInput = None,
322
- do_resize: bool = None,
323
- size: Dict[str, int] = None,
 
 
324
  resample: PILImageResampling = None,
325
- do_rescale: bool = None,
326
- rescale_factor: float = None,
327
- do_normalize: bool = None,
328
  image_mean: Optional[Union[float, List[float]]] = None,
329
  image_std: Optional[Union[float, List[float]]] = None,
330
- do_convert_rgb: bool = None,
 
 
 
331
  return_tensors: Optional[Union[str, TensorType]] = None,
332
  data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
333
  input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -359,6 +348,16 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
359
  image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
360
  Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
361
  `True`.
 
 
 
 
 
 
 
 
 
 
362
  do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
363
  Whether to convert the image to RGB.
364
  return_tensors (`str` or `TensorType`, *optional*):
@@ -381,20 +380,34 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
381
  - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
382
 
383
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  do_resize = do_resize if do_resize is not None else self.do_resize
385
- size = size if size is not None else self.size
386
  resample = resample if resample is not None else self.resample
387
  do_rescale = do_rescale if do_rescale is not None else self.do_rescale
388
  rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
389
  do_normalize = do_normalize if do_normalize is not None else self.do_normalize
390
  image_mean = image_mean if image_mean is not None else self.image_mean
391
  image_std = image_std if image_std is not None else self.image_std
 
 
 
392
  do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
393
 
394
  if images is not None:
395
- images = make_batched_images(images)
396
- if videos is not None:
397
- videos = make_batched_videos(videos)
398
 
399
  if images is not None and not valid_images(images):
400
  raise ValueError(
@@ -412,18 +425,23 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
412
  resample=resample,
413
  )
414
 
 
415
  if images is not None:
416
  pixel_values, vision_grid_thws = [], []
417
  for image in images:
418
  patches, image_grid_thw = self._preprocess(
419
  image,
420
  do_resize=do_resize,
 
421
  resample=resample,
422
  do_rescale=do_rescale,
423
  rescale_factor=rescale_factor,
424
  do_normalize=do_normalize,
425
  image_mean=image_mean,
426
  image_std=image_std,
 
 
 
427
  data_format=data_format,
428
  do_convert_rgb=do_convert_rgb,
429
  input_data_format=input_data_format,
@@ -432,29 +450,43 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
432
  vision_grid_thws.append(image_grid_thw)
433
  pixel_values = np.array(pixel_values)
434
  vision_grid_thws = np.array(vision_grid_thws)
435
- data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
436
 
 
437
  if videos is not None:
438
- pixel_values, vision_grid_thws = [], []
 
 
 
 
 
 
439
  for images in videos:
440
  patches, video_grid_thw = self._preprocess(
441
  images,
442
  do_resize=do_resize,
 
443
  resample=resample,
444
  do_rescale=do_rescale,
445
  rescale_factor=rescale_factor,
446
  do_normalize=do_normalize,
447
  image_mean=image_mean,
448
  image_std=image_std,
 
 
 
449
  data_format=data_format,
450
  do_convert_rgb=do_convert_rgb,
451
  input_data_format=input_data_format,
452
  )
453
- pixel_values.extend(patches)
454
- vision_grid_thws.append(video_grid_thw)
455
- pixel_values = np.array(pixel_values)
456
- vision_grid_thws = np.array(vision_grid_thws)
457
- data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}
 
 
 
458
 
459
  return BatchFeature(data=data, tensor_type=return_tensors)
460
 
 
24
 
25
  import numpy as np
26
 
27
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
28
+ from transformers.image_transforms import (
29
  convert_to_rgb,
30
  resize,
31
  to_channel_dimension_format,
32
  )
33
+ from transformers.image_utils import (
34
  OPENAI_CLIP_MEAN,
35
  OPENAI_CLIP_STD,
36
  ChannelDimension,
37
  ImageInput,
38
  PILImageResampling,
 
39
  get_image_size,
40
  infer_channel_dimension_format,
41
  is_scaled_image,
42
+ make_flat_list_of_images,
43
  make_list_of_images,
44
  to_numpy_array,
45
  valid_images,
46
  validate_preprocess_arguments,
47
  )
48
+ from transformers.utils import TensorType, logging
49
+ from transformers.video_utils import VideoInput, make_batched_videos
50
 
51
 
52
  logger = logging.get_logger(__name__)
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def smart_resize(
56
  height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
57
  ):
 
65
 
66
  """
67
  if height < factor or width < factor:
68
+ raise ValueError(f"height:{height} and width:{width} must be larger than factor:{factor}")
69
  elif max(height, width) / min(height, width) > 200:
70
  raise ValueError(
71
  f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
 
90
  Args:
91
  do_resize (`bool`, *optional*, defaults to `True`):
92
  Whether to resize the image's (height, width) dimensions.
93
+ size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 1280}`):
94
+ Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
95
  resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
96
  Resampling filter to use when resizing the image.
97
  do_rescale (`bool`, *optional*, defaults to `True`):
 
111
  max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
112
  The max pixels of the image to resize the image.
113
  patch_size (`int`, *optional*, defaults to 14):
114
+ The spatial patch size of the vision encoder.
115
  temporal_patch_size (`int`, *optional*, defaults to 2):
116
  The temporal patch size of the vision encoder.
117
  merge_size (`int`, *optional*, defaults to 2):
 
123
  def __init__(
124
  self,
125
  do_resize: bool = True,
126
+ size: Optional[Dict[str, int]] = None,
127
  resample: PILImageResampling = PILImageResampling.BICUBIC,
128
  do_rescale: bool = True,
129
  rescale_factor: Union[int, float] = 1 / 255,
 
131
  image_mean: Optional[Union[float, List[float]]] = None,
132
  image_std: Optional[Union[float, List[float]]] = None,
133
  do_convert_rgb: bool = True,
134
+ min_pixels: Optional[int] = None,
135
+ max_pixels: Optional[int] = None,
136
  patch_size: int = 14,
137
  temporal_patch_size: int = 2,
138
  merge_size: int = 2,
139
  **kwargs,
140
  ) -> None:
141
  super().__init__(**kwargs)
142
+ if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
143
+ raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
144
+ else:
145
+ size = {"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 1280}
146
+ # backward compatibility: override size with min_pixels and max_pixels if they are provided
147
+ if min_pixels is not None:
148
+ size["shortest_edge"] = min_pixels
149
+ if max_pixels is not None:
150
+ size["longest_edge"] = max_pixels
151
+ self.min_pixels = size["shortest_edge"]
152
+ self.max_pixels = size["longest_edge"]
153
+ self.size = size
154
+
155
  self.do_resize = do_resize
156
  self.resample = resample
157
  self.do_rescale = do_rescale
 
159
  self.do_normalize = do_normalize
160
  self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
161
  self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
162
+
 
163
  self.patch_size = patch_size
164
  self.temporal_patch_size = temporal_patch_size
165
  self.merge_size = merge_size
 
166
  self.do_convert_rgb = do_convert_rgb
167
 
168
  def _preprocess(
169
  self,
170
  images: Union[ImageInput, VideoInput],
171
+ do_resize: Optional[bool] = None,
172
+ size: Optional[Dict[str, int]] = None,
173
  resample: PILImageResampling = None,
174
+ do_rescale: Optional[bool] = None,
175
+ rescale_factor: Optional[float] = None,
176
+ do_normalize: Optional[bool] = None,
177
  image_mean: Optional[Union[float, List[float]]] = None,
178
  image_std: Optional[Union[float, List[float]]] = None,
179
+ patch_size: Optional[int] = None,
180
+ temporal_patch_size: Optional[int] = None,
181
+ merge_size: Optional[int] = None,
182
+ do_convert_rgb: Optional[bool] = None,
183
  data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
184
  input_data_format: Optional[Union[str, ChannelDimension]] = None,
185
  ):
 
193
  Optional list of dictionaries containing additional information about vision inputs.
194
  do_resize (`bool`, *optional*, defaults to `self.do_resize`):
195
  Whether to resize the image.
196
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
197
+ Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
198
  resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
199
  Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
200
  do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
 
207
  Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
208
  image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
209
  Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
210
+ patch_size (`int`, *optional*, defaults to `self.patch_size`):
211
+ The spatial patch size of the vision encoder.
212
+ temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
213
+ The temporal patch size of the vision encoder.
214
+ merge_size (`int`, *optional*, defaults to `self.merge_size`):
215
+ The merge size of the vision encoder to llm encoder.
216
  do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
217
  Whether to convert the image to RGB.
218
  data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
 
251
  resized_height, resized_width = smart_resize(
252
  height,
253
  width,
254
+ factor=patch_size * merge_size,
255
+ min_pixels=size["shortest_edge"],
256
+ max_pixels=size["longest_edge"],
257
  )
258
  image = resize(
259
  image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
 
273
  patches = np.array(processed_images)
274
  if data_format == ChannelDimension.LAST:
275
  patches = patches.transpose(0, 3, 1, 2)
276
+ if patches.shape[0] % temporal_patch_size != 0:
277
+ repeats = np.repeat(
278
+ patches[-1][np.newaxis], temporal_patch_size - (patches.shape[0] % temporal_patch_size), axis=0
279
+ )
280
  patches = np.concatenate([patches, repeats], axis=0)
281
  channel = patches.shape[1]
282
+ grid_t = patches.shape[0] // temporal_patch_size
283
+ grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
284
  patches = patches.reshape(
285
  grid_t,
286
+ temporal_patch_size,
287
  channel,
288
+ grid_h // merge_size,
289
+ merge_size,
290
+ patch_size,
291
+ grid_w // merge_size,
292
+ merge_size,
293
+ patch_size,
294
  )
295
  patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
296
  flatten_patches = patches.reshape(
297
+ grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size
298
  )
299
 
300
  return flatten_patches, (grid_t, grid_h, grid_w)
 
303
  self,
304
  images: ImageInput,
305
  videos: VideoInput = None,
306
+ do_resize: Optional[bool] = None,
307
+ size: Optional[Dict[str, int]] = None,
308
+ min_pixels: Optional[int] = None,
309
+ max_pixels: Optional[int] = None,
310
  resample: PILImageResampling = None,
311
+ do_rescale: Optional[bool] = None,
312
+ rescale_factor: Optional[float] = None,
313
+ do_normalize: Optional[bool] = None,
314
  image_mean: Optional[Union[float, List[float]]] = None,
315
  image_std: Optional[Union[float, List[float]]] = None,
316
+ patch_size: Optional[int] = None,
317
+ temporal_patch_size: Optional[int] = None,
318
+ merge_size: Optional[int] = None,
319
+ do_convert_rgb: Optional[bool] = None,
320
  return_tensors: Optional[Union[str, TensorType]] = None,
321
  data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
322
  input_data_format: Optional[Union[str, ChannelDimension]] = None,
 
348
  image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
349
  Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
350
  `True`.
351
+ min_pixels (`int`, *optional*, defaults to `self.min_pixels`):
352
+ The minimum number of pixels to use when resizing the image.
353
+ max_pixels (`int`, *optional*, defaults to `self.max_pixels`):
354
+ The maximum number of pixels to use when resizing the image.
355
+ patch_size (`int`, *optional*, defaults to `self.patch_size`):
356
+ The spatial patch size of the vision encoder.
357
+ temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
358
+ The temporal patch size of the vision encoder.
359
+ merge_size (`int`, *optional*, defaults to `self.merge_size`):
360
+ The merge size of the vision encoder to llm encoder.
361
  do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
362
  Whether to convert the image to RGB.
363
  return_tensors (`str` or `TensorType`, *optional*):
 
380
  - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
381
 
382
  """
383
+ min_pixels = min_pixels if min_pixels is not None else self.min_pixels
384
+ max_pixels = max_pixels if max_pixels is not None else self.max_pixels
385
+
386
+ if size is not None:
387
+ if "shortest_edge" not in size or "longest_edge" not in size:
388
+ raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
389
+ min_pixels = size["shortest_edge"]
390
+ elif min_pixels is not None and max_pixels is not None:
391
+ # backward compatibility: override size with min_pixels and max_pixels if they are provided
392
+ size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
393
+ else:
394
+ size = {**self.size}
395
+
396
  do_resize = do_resize if do_resize is not None else self.do_resize
397
+
398
  resample = resample if resample is not None else self.resample
399
  do_rescale = do_rescale if do_rescale is not None else self.do_rescale
400
  rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
401
  do_normalize = do_normalize if do_normalize is not None else self.do_normalize
402
  image_mean = image_mean if image_mean is not None else self.image_mean
403
  image_std = image_std if image_std is not None else self.image_std
404
+ patch_size = patch_size if patch_size is not None else self.patch_size
405
+ temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
406
+ merge_size = merge_size if merge_size is not None else self.merge_size
407
  do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
408
 
409
  if images is not None:
410
+ images = make_flat_list_of_images(images)
 
 
411
 
412
  if images is not None and not valid_images(images):
413
  raise ValueError(
 
425
  resample=resample,
426
  )
427
 
428
+ data = {}
429
  if images is not None:
430
  pixel_values, vision_grid_thws = [], []
431
  for image in images:
432
  patches, image_grid_thw = self._preprocess(
433
  image,
434
  do_resize=do_resize,
435
+ size=size,
436
  resample=resample,
437
  do_rescale=do_rescale,
438
  rescale_factor=rescale_factor,
439
  do_normalize=do_normalize,
440
  image_mean=image_mean,
441
  image_std=image_std,
442
+ patch_size=patch_size,
443
+ temporal_patch_size=temporal_patch_size,
444
+ merge_size=merge_size,
445
  data_format=data_format,
446
  do_convert_rgb=do_convert_rgb,
447
  input_data_format=input_data_format,
 
450
  vision_grid_thws.append(image_grid_thw)
451
  pixel_values = np.array(pixel_values)
452
  vision_grid_thws = np.array(vision_grid_thws)
453
+ data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})
454
 
455
+ # kept for BC only and should be removed after v5.0
456
  if videos is not None:
457
+ logger.warning(
458
+ "`Qwen2VLImageProcessor` works only with image inputs and doesn't process videos anymore. "
459
+ "This is a deprecated behavior and will be removed in v5.0. "
460
+ "Your videos should be forwarded to `Qwen2VLVideoProcessor`. "
461
+ )
462
+ videos = make_batched_videos(videos)
463
+ pixel_values_videos, vision_grid_thws_videos = [], []
464
  for images in videos:
465
  patches, video_grid_thw = self._preprocess(
466
  images,
467
  do_resize=do_resize,
468
+ size=size,
469
  resample=resample,
470
  do_rescale=do_rescale,
471
  rescale_factor=rescale_factor,
472
  do_normalize=do_normalize,
473
  image_mean=image_mean,
474
  image_std=image_std,
475
+ patch_size=patch_size,
476
+ temporal_patch_size=temporal_patch_size,
477
+ merge_size=merge_size,
478
  data_format=data_format,
479
  do_convert_rgb=do_convert_rgb,
480
  input_data_format=input_data_format,
481
  )
482
+ pixel_values_videos.extend(patches)
483
+ vision_grid_thws_videos.append(video_grid_thw)
484
+ data.update(
485
+ {
486
+ "pixel_values_videos": np.array(pixel_values_videos),
487
+ "video_grid_thw": np.array(vision_grid_thws_videos),
488
+ }
489
+ )
490
 
491
  return BatchFeature(data=data, tensor_type=return_tensors)
492
 
modeling_qwen2_vl.py CHANGED
@@ -27,139 +27,30 @@ import torch
27
  import torch.nn as nn
28
  import torch.nn.functional as F
29
  import torch.utils.checkpoint
30
- from torch.nn import CrossEntropyLoss, LayerNorm
31
 
32
  from transformers.activations import ACT2FN
33
  from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
34
  from transformers.generation import GenerationMixin
35
  from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 
36
  from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
37
- from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
38
  from transformers.modeling_utils import PreTrainedModel
39
- from transformers.utils import (
40
- add_start_docstrings,
41
- add_start_docstrings_to_model_forward,
42
- is_flash_attn_2_available,
43
- is_flash_attn_greater_or_equal_2_10,
44
- is_torchdynamo_compiling,
45
- logging,
46
- replace_return_docstrings,
47
- )
48
- from .configuration_qwen2_vl import Qwen2VLConfig, Qwen2VLVisionConfig
49
 
50
 
51
- if is_flash_attn_2_available():
52
- from flash_attn import flash_attn_varlen_func
53
 
54
- from transformers.modeling_flash_attention_utils import _flash_attention_forward
55
- else:
56
- flash_attn_varlen_func = None
57
 
 
58
 
59
- logger = logging.get_logger(__name__)
60
-
61
- _CONFIG_FOR_DOC = "Qwen2VLConfig"
62
-
63
-
64
- @dataclass
65
- class Qwen2VLCausalLMOutputWithPast(ModelOutput):
66
- """
67
- Base class for Qwen2VL causal language model (or autoregressive) outputs.
68
-
69
- Args:
70
- loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
71
- Language modeling loss (for next-token prediction).
72
- logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
73
- Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
74
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
75
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
76
- `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
77
-
78
- Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
79
- `past_key_values` input) to speed up sequential decoding.
80
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
81
- Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
82
- one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
83
-
84
- Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
85
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
86
- Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
87
- sequence_length)`.
88
-
89
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
90
- heads.
91
- rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
92
- The rope index difference between sequence length and multimodal rope.
93
- """
94
-
95
- loss: Optional[torch.FloatTensor] = None
96
- logits: torch.FloatTensor = None
97
- past_key_values: Optional[List[torch.FloatTensor]] = None
98
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
99
- attentions: Optional[Tuple[torch.FloatTensor]] = None
100
- rope_deltas: Optional[torch.LongTensor] = None
101
-
102
-
103
- class Qwen2VLRotaryEmbedding(nn.Module):
104
- def __init__(self, config: Qwen2VLConfig, device=None):
105
- super().__init__()
106
- # BC: "rope_type" was originally "type"
107
- if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
108
- self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
109
- else:
110
- self.rope_type = "default"
111
- self.max_seq_len_cached = config.max_position_embeddings
112
- self.original_max_seq_len = config.max_position_embeddings
113
-
114
- self.config = config
115
- self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
116
-
117
- inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
118
- self.register_buffer("inv_freq", inv_freq, persistent=False)
119
- self.original_inv_freq = self.inv_freq
120
-
121
- def _dynamic_frequency_update(self, position_ids, device):
122
- """
123
- dynamic RoPE layers should recompute `inv_freq` in the following situations:
124
- 1 - growing beyond the cached sequence length (allow scaling)
125
- 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
126
- """
127
- seq_len = torch.max(position_ids) + 1
128
- if seq_len > self.max_seq_len_cached: # growth
129
- inv_freq, self.attention_scaling = self.rope_init_fn(
130
- self.config, device, seq_len=seq_len, **self.rope_kwargs
131
- )
132
- self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
133
- self.max_seq_len_cached = seq_len
134
-
135
- if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
136
- self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
137
- self.max_seq_len_cached = self.original_max_seq_len
138
-
139
- @torch.no_grad()
140
- def forward(self, x, position_ids):
141
- if "dynamic" in self.rope_type:
142
- self._dynamic_frequency_update(position_ids, device=x.device)
143
-
144
- # Core RoPE block. In contrast to other models, Qwen2_VL has different position ids for thw grids
145
- # So we expand the inv_freq to shape (3, ...)
146
- inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
147
- position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
148
- # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
149
- device_type = x.device.type
150
- device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
151
- with torch.autocast(device_type=device_type, enabled=False):
152
- freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
153
- emb = torch.cat((freqs, freqs), dim=-1)
154
- cos = emb.cos()
155
- sin = emb.sin()
156
-
157
- # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
158
- cos = cos * self.attention_scaling
159
- sin = sin * self.attention_scaling
160
-
161
- return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
162
 
 
163
 
164
  # Copied from transformers.models.llama.modeling_llama.rotate_half
165
  def rotate_half(x):
@@ -169,58 +60,13 @@ def rotate_half(x):
169
  return torch.cat((-x2, x1), dim=-1)
170
 
171
 
172
- def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
173
- """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
174
-
175
- Explanation:
176
- Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
177
- sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
178
- vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
179
- Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
180
- For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
181
- height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
182
- difference with modern LLMs.
183
-
184
- Args:
185
- q (`torch.Tensor`): The query tensor.
186
- k (`torch.Tensor`): The key tensor.
187
- cos (`torch.Tensor`): The cosine part of the rotary embedding.
188
- sin (`torch.Tensor`): The sine part of the rotary embedding.
189
- position_ids (`torch.Tensor`):
190
- The position indices of the tokens corresponding to the query and key tensors. For example, this can be
191
- used to pass offsetted position ids when working with a KV-cache.
192
- mrope_section(`List(int)`):
193
- Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
194
- unsqueeze_dim (`int`, *optional*, defaults to 1):
195
- The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
196
- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
197
- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
198
- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
199
- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
200
- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
201
- Returns:
202
- `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
203
- """
204
- mrope_section = mrope_section * 2
205
- cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
206
- unsqueeze_dim
207
- )
208
- sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(
209
- unsqueeze_dim
210
- )
211
-
212
- q_embed = (q * cos) + (rotate_half(q) * sin)
213
- k_embed = (k * cos) + (rotate_half(k) * sin)
214
- return q_embed, k_embed
215
-
216
-
217
  def apply_rotary_pos_emb_vision(
218
  q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
219
  ) -> Tuple[torch.Tensor, torch.Tensor]:
220
  orig_q_dtype = q.dtype
221
  orig_k_dtype = k.dtype
222
  q, k = q.float(), k.float()
223
- cos, sin = cos.unsqueeze(-2), sin.unsqueeze(-2)
224
  q_embed = (q * cos) + (rotate_half(q) * sin)
225
  k_embed = (k * cos) + (rotate_half(k) * sin)
226
  q_embed = q_embed.to(orig_q_dtype)
@@ -318,8 +164,8 @@ class VisionAttention(nn.Module):
318
  "removed and `position_embeddings` will be mandatory."
319
  )
320
  emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
321
- cos = emb.cos().float()
322
- sin = emb.sin().float()
323
  else:
324
  cos, sin = position_embeddings
325
  q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
@@ -367,8 +213,8 @@ class VisionFlashAttention2(nn.Module):
367
  "removed and `position_embeddings` will be mandatory."
368
  )
369
  emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
370
- cos = emb.cos().float()
371
- sin = emb.sin().float()
372
  else:
373
  cos, sin = position_embeddings
374
  q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
@@ -405,8 +251,8 @@ class VisionSdpaAttention(nn.Module):
405
  "removed and `position_embeddings` will be mandatory."
406
  )
407
  emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
408
- cos = emb.cos().float()
409
- sin = emb.sin().float()
410
  else:
411
  cos, sin = position_embeddings
412
  q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
@@ -417,8 +263,10 @@ class VisionSdpaAttention(nn.Module):
417
  q = q.transpose(0, 1)
418
  k = k.transpose(0, 1)
419
  v = v.transpose(0, 1)
420
- attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
421
- attn_output = attn_output.transpose(0, 1)
 
 
422
  attn_output = attn_output.reshape(seq_length, -1)
423
  attn_output = self.proj(attn_output)
424
  return attn_output
@@ -460,481 +308,10 @@ class Qwen2VLVisionBlock(nn.Module):
460
  return hidden_states
461
 
462
 
463
- # Copied from transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm
464
- class Qwen2RMSNorm(nn.Module):
465
- def __init__(self, hidden_size, eps=1e-6):
466
- """
467
- Qwen2RMSNorm is equivalent to T5LayerNorm
468
- """
469
- super().__init__()
470
- self.weight = nn.Parameter(torch.ones(hidden_size))
471
- self.variance_epsilon = eps
472
-
473
- def forward(self, hidden_states):
474
- input_dtype = hidden_states.dtype
475
- hidden_states = hidden_states.to(torch.float32)
476
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
477
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
478
- return self.weight * hidden_states.to(input_dtype)
479
-
480
- def extra_repr(self):
481
- return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
482
-
483
-
484
- # Copied from transformers.models.qwen2.modeling_qwen2.Qwen2MLP
485
- class Qwen2MLP(nn.Module):
486
- def __init__(self, config):
487
- super().__init__()
488
- self.config = config
489
- self.hidden_size = config.hidden_size
490
- self.intermediate_size = config.intermediate_size
491
- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
492
- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
493
- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
494
- self.act_fn = ACT2FN[config.hidden_act]
495
-
496
- def forward(self, x):
497
- down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
498
- return down_proj
499
-
500
-
501
- # Copied from transformers.models.llama.modeling_llama.repeat_kv
502
- def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
503
- """
504
- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
505
- num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
506
- """
507
- batch, num_key_value_heads, slen, head_dim = hidden_states.shape
508
- if n_rep == 1:
509
- return hidden_states
510
- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
511
- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
512
-
513
-
514
- class Qwen2VLAttention(nn.Module):
515
- """
516
- Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
517
- and "Generating Long Sequences with Sparse Transformers".
518
- """
519
-
520
- def __init__(self, config: Qwen2VLConfig, layer_idx: Optional[int] = None):
521
- super().__init__()
522
- self.config = config
523
- self.layer_idx = layer_idx
524
- if layer_idx is None:
525
- logger.warning_once(
526
- f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
527
- "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
528
- "when creating this class."
529
- )
530
-
531
- self.hidden_size = config.hidden_size
532
- self.num_heads = config.num_attention_heads
533
- self.head_dim = self.hidden_size // self.num_heads
534
- self.num_key_value_heads = config.num_key_value_heads
535
- self.num_key_value_groups = self.num_heads // self.num_key_value_heads
536
- self.is_causal = True
537
- self.attention_dropout = config.attention_dropout
538
- self.rope_scaling = config.rope_scaling
539
-
540
- if (self.head_dim * self.num_heads) != self.hidden_size:
541
- raise ValueError(
542
- f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
543
- f" and `num_heads`: {self.num_heads})."
544
- )
545
- self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
546
- self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
547
- self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
548
- self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
549
-
550
- self.rotary_emb = Qwen2VLRotaryEmbedding(config=config)
551
-
552
- def forward(
553
- self,
554
- hidden_states: torch.Tensor,
555
- attention_mask: Optional[torch.Tensor] = None,
556
- position_ids: Optional[torch.LongTensor] = None,
557
- past_key_value: Optional[Cache] = None,
558
- output_attentions: bool = False,
559
- use_cache: bool = False,
560
- cache_position: Optional[torch.LongTensor] = None,
561
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
562
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
563
- bsz, q_len, _ = hidden_states.size()
564
-
565
- query_states = self.q_proj(hidden_states)
566
- key_states = self.k_proj(hidden_states)
567
- value_states = self.v_proj(hidden_states)
568
-
569
- query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
570
- key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
571
- value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
572
-
573
- cos, sin = position_embeddings
574
- query_states, key_states = apply_multimodal_rotary_pos_emb(
575
- query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
576
- )
577
-
578
- if past_key_value is not None:
579
- cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
580
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
581
-
582
- # repeat k/v heads if n_kv_heads < n_heads
583
- key_states = repeat_kv(key_states, self.num_key_value_groups)
584
- value_states = repeat_kv(value_states, self.num_key_value_groups)
585
-
586
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
587
-
588
- if attention_mask is not None: # no matter the length, we just slice it
589
- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
590
- attn_weights = attn_weights + causal_mask
591
-
592
- # Fix precision issues in Qwen2-VL float16 inference
593
- # Replace inf values with zeros in attention weights to prevent NaN propagation
594
- if query_states.dtype == torch.float16:
595
- attn_weights = torch.where(torch.isinf(attn_weights), torch.zeros_like(attn_weights), attn_weights)
596
-
597
- # upcast attention to fp32
598
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
599
- attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
600
- attn_output = torch.matmul(attn_weights, value_states)
601
-
602
- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
603
- raise ValueError(
604
- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
605
- f" {attn_output.size()}"
606
- )
607
-
608
- attn_output = attn_output.transpose(1, 2).contiguous()
609
- attn_output = attn_output.reshape(bsz, q_len, -1)
610
-
611
- attn_output = self.o_proj(attn_output)
612
-
613
- if not output_attentions:
614
- attn_weights = None
615
-
616
- return attn_output, attn_weights, past_key_value
617
-
618
-
619
- class Qwen2VLFlashAttention2(Qwen2VLAttention):
620
- """
621
- Qwen2VL flash attention module, following Qwen2VL attention module. This module inherits from `Qwen2VLAttention`
622
- as the weights of the module stays untouched. The only required change would be on the forward pass
623
- where it needs to correctly call the public API of flash attention and deal with padding tokens
624
- in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
625
- config.max_window_layers layers.
626
- """
627
-
628
- def __init__(self, *args, **kwargs):
629
- super().__init__(*args, **kwargs)
630
-
631
- # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
632
- # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
633
- # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
634
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
635
-
636
- def forward(
637
- self,
638
- hidden_states: torch.Tensor,
639
- attention_mask: Optional[torch.Tensor] = None,
640
- position_ids: Optional[torch.LongTensor] = None,
641
- past_key_value: Optional[Cache] = None,
642
- output_attentions: bool = False,
643
- use_cache: bool = False,
644
- cache_position: Optional[torch.LongTensor] = None,
645
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
646
- ):
647
- bsz, q_len, _ = hidden_states.size()
648
-
649
- query_states = self.q_proj(hidden_states)
650
- key_states = self.k_proj(hidden_states)
651
- value_states = self.v_proj(hidden_states)
652
-
653
- query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
654
- key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
655
- value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
656
-
657
- # Because the input can be padded, the absolute sequence length depends on the max position id.
658
- cos, sin = position_embeddings
659
- query_states, key_states = apply_multimodal_rotary_pos_emb(
660
- query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
661
- )
662
-
663
- if past_key_value is not None:
664
- cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
665
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
666
-
667
- # repeat k/v heads if n_kv_heads < n_heads
668
- key_states = repeat_kv(key_states, self.num_key_value_groups)
669
- value_states = repeat_kv(value_states, self.num_key_value_groups)
670
- dropout_rate = 0.0 if not self.training else self.attention_dropout
671
-
672
- # In PEFT, usually we cast the layer norms in float32 for training stability reasons
673
- # therefore the input hidden states get silently cast to float32. Hence, we need to
674
- # cast them back to float16 just to be sure everything works as expected.
675
- input_dtype = query_states.dtype
676
- if input_dtype == torch.float32:
677
- if torch.is_autocast_enabled():
678
- target_dtype = torch.get_autocast_gpu_dtype()
679
- # Handle the case where the model is quantized
680
- elif hasattr(self.config, "_pre_quantization_dtype"):
681
- target_dtype = self.config._pre_quantization_dtype
682
- else:
683
- target_dtype = self.q_proj.weight.dtype
684
-
685
- logger.warning_once(
686
- f"The input hidden states seems to be silently casted in float32, this might be related to"
687
- f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
688
- f" {target_dtype}."
689
- )
690
-
691
- query_states = query_states.to(target_dtype)
692
- key_states = key_states.to(target_dtype)
693
- value_states = value_states.to(target_dtype)
694
-
695
- # Reshape to the expected shape for Flash Attention
696
- query_states = query_states.transpose(1, 2)
697
- key_states = key_states.transpose(1, 2)
698
- value_states = value_states.transpose(1, 2)
699
-
700
- if (
701
- self.config.use_sliding_window
702
- and getattr(self.config, "sliding_window", None) is not None
703
- and self.layer_idx >= self.config.max_window_layers
704
- ):
705
- sliding_window = self.config.sliding_window
706
- else:
707
- sliding_window = None
708
-
709
- attn_output = _flash_attention_forward(
710
- query_states,
711
- key_states,
712
- value_states,
713
- attention_mask,
714
- q_len,
715
- dropout=dropout_rate,
716
- sliding_window=sliding_window,
717
- is_causal=self.is_causal,
718
- use_top_left_mask=self._flash_attn_uses_top_left_mask,
719
- )
720
-
721
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
722
- attn_output = self.o_proj(attn_output)
723
-
724
- if not output_attentions:
725
- attn_weights = None
726
-
727
- return attn_output, attn_weights, past_key_value
728
-
729
-
730
- class Qwen2VLSdpaAttention(Qwen2VLAttention):
731
- """
732
- Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
733
- `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
734
- SDPA API.
735
- """
736
-
737
- # Adapted from Qwen2Attention.forward
738
- def forward(
739
- self,
740
- hidden_states: torch.Tensor,
741
- attention_mask: Optional[torch.Tensor] = None,
742
- position_ids: Optional[torch.LongTensor] = None,
743
- past_key_value: Optional[Cache] = None,
744
- output_attentions: bool = False,
745
- use_cache: bool = False,
746
- cache_position: Optional[torch.LongTensor] = None,
747
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
748
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
749
- if output_attentions:
750
- # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
751
- logger.warning_once(
752
- "Qwen2VLModel is using Qwen2VLSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
753
- 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
754
- )
755
- return super().forward(
756
- hidden_states=hidden_states,
757
- attention_mask=attention_mask,
758
- position_ids=position_ids,
759
- past_key_value=past_key_value,
760
- output_attentions=output_attentions,
761
- use_cache=use_cache,
762
- cache_position=cache_position,
763
- position_embeddings=position_embeddings,
764
- )
765
-
766
- bsz, q_len, _ = hidden_states.size()
767
-
768
- query_states = self.q_proj(hidden_states)
769
- key_states = self.k_proj(hidden_states)
770
- value_states = self.v_proj(hidden_states)
771
-
772
- query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
773
- key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
774
- value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
775
-
776
- cos, sin = position_embeddings
777
- query_states, key_states = apply_multimodal_rotary_pos_emb(
778
- query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
779
- )
780
-
781
- if past_key_value is not None:
782
- cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
783
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
784
-
785
- key_states = repeat_kv(key_states, self.num_key_value_groups)
786
- value_states = repeat_kv(value_states, self.num_key_value_groups)
787
-
788
- causal_mask = attention_mask
789
- if attention_mask is not None: # no matter the length, we just slice it
790
- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
791
-
792
- # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
793
- # Reference: https://github.com/pytorch/pytorch/issues/112577.
794
- if query_states.device.type == "cuda" and attention_mask is not None:
795
- query_states = query_states.contiguous()
796
- key_states = key_states.contiguous()
797
- value_states = value_states.contiguous()
798
-
799
- # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
800
- # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
801
- # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
802
- is_causal = True if causal_mask is None and q_len > 1 else False
803
-
804
- attn_output = torch.nn.functional.scaled_dot_product_attention(
805
- query_states,
806
- key_states,
807
- value_states,
808
- attn_mask=causal_mask,
809
- dropout_p=self.attention_dropout if self.training else 0.0,
810
- is_causal=is_causal,
811
- )
812
-
813
- attn_output = attn_output.transpose(1, 2).contiguous()
814
- attn_output = attn_output.view(bsz, q_len, self.hidden_size)
815
-
816
- attn_output = self.o_proj(attn_output)
817
-
818
- return attn_output, None, past_key_value
819
-
820
-
821
- QWEN2_VL_ATTENTION_CLASSES = {
822
- "eager": Qwen2VLAttention,
823
- "flash_attention_2": Qwen2VLFlashAttention2,
824
- "sdpa": Qwen2VLSdpaAttention,
825
- }
826
-
827
-
828
- class Qwen2VLDecoderLayer(nn.Module):
829
- def __init__(self, config: Qwen2VLConfig, layer_idx: int):
830
- super().__init__()
831
- self.hidden_size = config.hidden_size
832
-
833
- if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
834
- logger.warning_once(
835
- f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
836
- "unexpected results may be encountered."
837
- )
838
- self.self_attn = QWEN2_VL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
839
-
840
- self.mlp = Qwen2MLP(config)
841
- self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
842
- self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
843
-
844
- def forward(
845
- self,
846
- hidden_states: torch.Tensor,
847
- attention_mask: Optional[torch.Tensor] = None,
848
- position_ids: Optional[torch.LongTensor] = None,
849
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
850
- output_attentions: Optional[bool] = False,
851
- use_cache: Optional[bool] = False,
852
- cache_position: Optional[torch.LongTensor] = None,
853
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
854
- **kwargs,
855
- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
856
- """
857
- Args:
858
- hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
859
- attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
860
- `(batch, sequence_length)` where padding elements are indicated by 0.
861
- output_attentions (`bool`, *optional*):
862
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
863
- returned tensors for more detail.
864
- use_cache (`bool`, *optional*):
865
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
866
- (see `past_key_values`).
867
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
868
- cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
869
- Indices depicting the position of the input sequence tokens in the sequence.
870
- position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
871
- Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
872
- with `head_dim` being the embedding dimension of each attention head.
873
- kwargs (`dict`, *optional*):
874
- Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
875
- into the model
876
- """
877
-
878
- residual = hidden_states
879
-
880
- hidden_states = self.input_layernorm(hidden_states)
881
-
882
- # Self Attention
883
- hidden_states, self_attn_weights, present_key_value = self.self_attn(
884
- hidden_states=hidden_states,
885
- attention_mask=attention_mask,
886
- position_ids=position_ids,
887
- past_key_value=past_key_value,
888
- output_attentions=output_attentions,
889
- use_cache=use_cache,
890
- cache_position=cache_position,
891
- position_embeddings=position_embeddings,
892
- )
893
- hidden_states = residual + hidden_states
894
-
895
- # Fully Connected
896
- residual = hidden_states
897
- hidden_states = self.post_attention_layernorm(hidden_states)
898
- hidden_states = self.mlp(hidden_states)
899
- hidden_states = residual + hidden_states
900
-
901
- outputs = (hidden_states,)
902
-
903
- if output_attentions:
904
- outputs += (self_attn_weights,)
905
-
906
- if use_cache:
907
- outputs += (present_key_value,)
908
-
909
- return outputs
910
-
911
-
912
- QWEN2VL_START_DOCSTRING = r"""
913
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
914
- library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
915
- etc.)
916
-
917
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
918
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
919
- and behavior.
920
-
921
- Parameters:
922
- config ([`Qwen2VLConfig`]):
923
- Model configuration class with all the parameters of the model. Initializing with a config file does not
924
- load the weights associated with the model, only the configuration. Check out the
925
- [`~PreTrainedModel.from_pretrained`] method to load the model weights.
926
- """
927
-
928
-
929
- @add_start_docstrings(
930
- "The bare Qwen2VL Model outputting raw hidden-states without any specific head on top.",
931
- QWEN2VL_START_DOCSTRING,
932
- )
933
  class Qwen2VLPreTrainedModel(PreTrainedModel):
934
- config_class = Qwen2VLConfig
935
  base_model_prefix = "model"
936
  supports_gradient_checkpointing = True
937
- _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"]
938
  _skip_keys_device_placement = "past_key_values"
939
  _supports_flash_attn_2 = True
940
  _supports_sdpa = True
@@ -942,7 +319,7 @@ class Qwen2VLPreTrainedModel(PreTrainedModel):
942
  _supports_static_cache = False # TODO (joao): fix. torch.compile failing probably due to `cache_positions`
943
 
944
  def _init_weights(self, module):
945
- std = self.config.initializer_range
946
  if isinstance(module, (nn.Linear, nn.Conv3d)):
947
  module.weight.data.normal_(mean=0.0, std=std)
948
  if module.bias is not None:
@@ -951,8 +328,12 @@ class Qwen2VLPreTrainedModel(PreTrainedModel):
951
  module.weight.data.normal_(mean=0.0, std=std)
952
  if module.padding_idx is not None:
953
  module.weight.data[module.padding_idx].zero_()
 
 
 
954
 
955
 
 
956
  class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
957
  config_class = Qwen2VLVisionConfig
958
  _no_split_modules = ["Qwen2VLVisionBlock"]
@@ -1014,7 +395,12 @@ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
1014
  rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
1015
  return rotary_pos_emb
1016
 
 
1017
  def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
 
 
 
 
1018
  hidden_states = self.patch_embed(hidden_states)
1019
  rotary_pos_emb = self.rot_pos_emb(grid_thw)
1020
  emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
@@ -1039,942 +425,3 @@ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
1039
  hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings)
1040
 
1041
  return self.merger(hidden_states)
1042
-
1043
-
1044
- @add_start_docstrings(
1045
- "The bare Qwen2VL Model outputting raw hidden-states without any specific head on top.",
1046
- QWEN2VL_START_DOCSTRING,
1047
- )
1048
- class Qwen2VLModel(Qwen2VLPreTrainedModel):
1049
- def __init__(self, config: Qwen2VLConfig):
1050
- super().__init__(config)
1051
- self.padding_idx = config.pad_token_id
1052
- self.vocab_size = config.vocab_size
1053
-
1054
- self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1055
- self.layers = nn.ModuleList(
1056
- [Qwen2VLDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1057
- )
1058
- self._attn_implementation = config._attn_implementation
1059
- self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1060
- self.rotary_emb = Qwen2VLRotaryEmbedding(config=config)
1061
-
1062
- self.gradient_checkpointing = False
1063
- # Initialize weights and apply final processing
1064
- self.post_init()
1065
-
1066
- def get_input_embeddings(self):
1067
- return self.embed_tokens
1068
-
1069
- def set_input_embeddings(self, value):
1070
- self.embed_tokens = value
1071
-
1072
- def forward(
1073
- self,
1074
- input_ids: torch.LongTensor = None,
1075
- attention_mask: Optional[torch.Tensor] = None,
1076
- position_ids: Optional[torch.LongTensor] = None,
1077
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1078
- inputs_embeds: Optional[torch.FloatTensor] = None,
1079
- use_cache: Optional[bool] = None,
1080
- output_attentions: Optional[bool] = None,
1081
- output_hidden_states: Optional[bool] = None,
1082
- return_dict: Optional[bool] = None,
1083
- cache_position: Optional[torch.LongTensor] = None,
1084
- ) -> Union[Tuple, BaseModelOutputWithPast]:
1085
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1086
- output_hidden_states = (
1087
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1088
- )
1089
- use_cache = use_cache if use_cache is not None else self.config.use_cache
1090
-
1091
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1092
-
1093
- if (input_ids is None) ^ (inputs_embeds is not None):
1094
- raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1095
-
1096
- if self.gradient_checkpointing and self.training:
1097
- if use_cache:
1098
- logger.warning_once(
1099
- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1100
- )
1101
- use_cache = False
1102
-
1103
- # torch.jit.trace() doesn't support cache objects in the output
1104
- if use_cache and past_key_values is None and not torch.jit.is_tracing():
1105
- past_key_values = DynamicCache()
1106
-
1107
- if inputs_embeds is None:
1108
- inputs_embeds = self.embed_tokens(input_ids)
1109
-
1110
- if cache_position is None:
1111
- past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1112
- cache_position = torch.arange(
1113
- past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
1114
- )
1115
-
1116
- # the hard coded `3` is for temporal, height and width.
1117
- if position_ids is None:
1118
- position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
1119
- elif position_ids.dim() == 2:
1120
- position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
1121
-
1122
- causal_mask = self._update_causal_mask(
1123
- attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
1124
- )
1125
-
1126
- hidden_states = inputs_embeds
1127
-
1128
- # create position embeddings to be shared across the decoder layers
1129
- position_embeddings = self.rotary_emb(hidden_states, position_ids)
1130
-
1131
- # decoder layers
1132
- all_hidden_states = () if output_hidden_states else None
1133
- all_self_attns = () if output_attentions else None
1134
- next_decoder_cache = None
1135
-
1136
- for decoder_layer in self.layers:
1137
- if output_hidden_states:
1138
- all_hidden_states += (hidden_states,)
1139
-
1140
- if self.gradient_checkpointing and self.training:
1141
- layer_outputs = self._gradient_checkpointing_func(
1142
- decoder_layer.__call__,
1143
- hidden_states,
1144
- causal_mask,
1145
- position_ids,
1146
- past_key_values,
1147
- output_attentions,
1148
- use_cache,
1149
- cache_position,
1150
- position_embeddings,
1151
- )
1152
- else:
1153
- layer_outputs = decoder_layer(
1154
- hidden_states,
1155
- attention_mask=causal_mask,
1156
- position_ids=position_ids,
1157
- past_key_value=past_key_values,
1158
- output_attentions=output_attentions,
1159
- use_cache=use_cache,
1160
- cache_position=cache_position,
1161
- position_embeddings=position_embeddings,
1162
- )
1163
-
1164
- hidden_states = layer_outputs[0]
1165
-
1166
- if use_cache:
1167
- next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1168
-
1169
- if output_attentions:
1170
- all_self_attns += (layer_outputs[1],)
1171
-
1172
- hidden_states = self.norm(hidden_states)
1173
-
1174
- # add hidden states from the last decoder layer
1175
- if output_hidden_states:
1176
- all_hidden_states += (hidden_states,)
1177
-
1178
- next_cache = next_decoder_cache if use_cache else None
1179
-
1180
- if not return_dict:
1181
- return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1182
- return BaseModelOutputWithPast(
1183
- last_hidden_state=hidden_states,
1184
- past_key_values=next_cache,
1185
- hidden_states=all_hidden_states,
1186
- attentions=all_self_attns,
1187
- )
1188
-
1189
- # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask with Phi3->Qwen2VL
1190
- def _update_causal_mask(
1191
- self,
1192
- attention_mask: torch.Tensor,
1193
- input_tensor: torch.Tensor,
1194
- cache_position: torch.Tensor,
1195
- past_key_values: Cache,
1196
- output_attentions: bool,
1197
- ):
1198
- if self.config._attn_implementation == "flash_attention_2":
1199
- if attention_mask is not None and past_key_values is not None:
1200
- is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
1201
- if is_padding_right:
1202
- raise ValueError(
1203
- "You are attempting to perform batched generation with padding_side='right'"
1204
- " this may lead to unexpected behaviour for Flash Attention version of Qwen2VL. Make sure to "
1205
- " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
1206
- )
1207
- if attention_mask is not None and 0.0 in attention_mask:
1208
- return attention_mask
1209
- return None
1210
-
1211
- # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
1212
- # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
1213
- # to infer the attention mask.
1214
- past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1215
- using_static_cache = isinstance(past_key_values, StaticCache)
1216
- using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
1217
-
1218
- # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
1219
- if (
1220
- self.config._attn_implementation == "sdpa"
1221
- and not (using_static_cache or using_sliding_window_cache)
1222
- and not output_attentions
1223
- ):
1224
- if AttentionMaskConverter._ignore_causal_mask_sdpa(
1225
- attention_mask,
1226
- inputs_embeds=input_tensor,
1227
- past_key_values_length=past_seen_tokens,
1228
- sliding_window=self.config.sliding_window,
1229
- is_training=self.training,
1230
- ):
1231
- return None
1232
-
1233
- dtype, device = input_tensor.dtype, input_tensor.device
1234
- min_dtype = torch.finfo(dtype).min
1235
- sequence_length = input_tensor.shape[1]
1236
- # SlidingWindowCache or StaticCache
1237
- if using_sliding_window_cache or using_static_cache:
1238
- target_length = past_key_values.get_max_cache_shape()
1239
- # DynamicCache or no cache
1240
- else:
1241
- target_length = (
1242
- attention_mask.shape[-1]
1243
- if isinstance(attention_mask, torch.Tensor)
1244
- else past_seen_tokens + sequence_length + 1
1245
- )
1246
-
1247
- # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
1248
- causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
1249
- attention_mask,
1250
- sequence_length=sequence_length,
1251
- target_length=target_length,
1252
- dtype=dtype,
1253
- device=device,
1254
- cache_position=cache_position,
1255
- batch_size=input_tensor.shape[0],
1256
- config=self.config,
1257
- past_key_values=past_key_values,
1258
- )
1259
-
1260
- if (
1261
- self.config._attn_implementation == "sdpa"
1262
- and attention_mask is not None
1263
- and attention_mask.device.type in ["cuda", "xpu"]
1264
- and not output_attentions
1265
- ):
1266
- # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
1267
- # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
1268
- # Details: https://github.com/pytorch/pytorch/issues/110213
1269
- causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
1270
-
1271
- return causal_mask
1272
-
1273
- @staticmethod
1274
- # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Qwen2VL
1275
- def _prepare_4d_causal_attention_mask_with_cache_position(
1276
- attention_mask: torch.Tensor,
1277
- sequence_length: int,
1278
- target_length: int,
1279
- dtype: torch.dtype,
1280
- device: torch.device,
1281
- cache_position: torch.Tensor,
1282
- batch_size: int,
1283
- config: Qwen2VLConfig,
1284
- past_key_values: Cache,
1285
- ):
1286
- """
1287
- Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
1288
- `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
1289
-
1290
- Args:
1291
- attention_mask (`torch.Tensor`):
1292
- A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
1293
- sequence_length (`int`):
1294
- The sequence length being processed.
1295
- target_length (`int`):
1296
- The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
1297
- dtype (`torch.dtype`):
1298
- The dtype to use for the 4D attention mask.
1299
- device (`torch.device`):
1300
- The device to place the 4D attention mask on.
1301
- cache_position (`torch.Tensor`):
1302
- Indices depicting the position of the input sequence tokens in the sequence.
1303
- batch_size (`int`):
1304
- Batch size.
1305
- config (`Qwen2VLConfig`):
1306
- The model's configuration class
1307
- past_key_values (`Cache`):
1308
- The cache class that is being used currently to generate
1309
- """
1310
- if attention_mask is not None and attention_mask.dim() == 4:
1311
- # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
1312
- causal_mask = attention_mask
1313
- else:
1314
- min_dtype = torch.finfo(dtype).min
1315
- causal_mask = torch.full(
1316
- (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
1317
- )
1318
- diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
1319
- if config.sliding_window is not None:
1320
- # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
1321
- # the check is needed to verify whether the current checkpoint was trained with sliding window or not
1322
- if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
1323
- sliding_attend_mask = torch.arange(target_length, device=device) <= (
1324
- cache_position.reshape(-1, 1) - config.sliding_window
1325
- )
1326
- diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
1327
- causal_mask *= diagonal_attend_mask
1328
- causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
1329
- if attention_mask is not None:
1330
- causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
1331
- if attention_mask.shape[-1] > target_length:
1332
- attention_mask = attention_mask[:, :target_length]
1333
- mask_length = attention_mask.shape[-1]
1334
- padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
1335
- causal_mask.device
1336
- )
1337
- padding_mask = padding_mask == 0
1338
- causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1339
- padding_mask, min_dtype
1340
- )
1341
- return causal_mask
1342
-
1343
-
1344
- QWEN2_VL_INPUTS_DOCSTRING = r"""
1345
- Args:
1346
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1347
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1348
- it.
1349
-
1350
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1351
- [`PreTrainedTokenizer.__call__`] for details.
1352
-
1353
- [What are input IDs?](../glossary#input-ids)
1354
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1355
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1356
-
1357
- - 1 for tokens that are **not masked**,
1358
- - 0 for tokens that are **masked**.
1359
-
1360
- [What are attention masks?](../glossary#attention-mask)
1361
-
1362
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1363
- [`PreTrainedTokenizer.__call__`] for details.
1364
-
1365
- If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
1366
- `past_key_values`).
1367
-
1368
- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1369
- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1370
- information on the default strategy.
1371
-
1372
- - 1 indicates the head is **not masked**,
1373
- - 0 indicates the head is **masked**.
1374
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1375
- Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
1376
- config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
1377
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
1378
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
1379
- `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
1380
- `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
1381
-
1382
- Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1383
- blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
1384
-
1385
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
1386
- don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
1387
- `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1388
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1389
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1390
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1391
- model's internal embedding lookup matrix.
1392
- use_cache (`bool`, *optional*):
1393
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1394
- `past_key_values`).
1395
- output_attentions (`bool`, *optional*):
1396
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1397
- tensors for more detail.
1398
- output_hidden_states (`bool`, *optional*):
1399
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1400
- more detail.
1401
- return_dict (`bool`, *optional*):
1402
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1403
- pixel_values (`torch.FloatTensor` of shape `(seq_length, num_channels * image_size * image_size)`):
1404
- The tensors corresponding to the input images. Pixel values can be obtained using
1405
- [`AutoImageProcessor`]. See [`Qwen2VLImageProcessor.__call__`] for details. [`Qwen2VLProcessor`] uses
1406
- [`Qwen2VLImageProcessor`] for processing images.
1407
- pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
1408
- The tensors corresponding to the input videos. Pixel values can be obtained using
1409
- [`AutoImageProcessor`]. See [`Qwen2VLImageProcessor.__call__`] for details. [`Qwen2VLProcessor`] uses
1410
- [`Qwen2VLImageProcessor`] for processing videos.
1411
- image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1412
- The temporal, height and width of feature shape of each image in LLM.
1413
- video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1414
- The temporal, height and width of feature shape of each video in LLM.
1415
- rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
1416
- The rope index difference between sequence length and multimodal rope.
1417
- """
1418
-
1419
-
1420
- class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
1421
- _tied_weights_keys = ["lm_head.weight"]
1422
-
1423
- def __init__(self, config):
1424
- super().__init__(config)
1425
- self.visual = Qwen2VisionTransformerPretrainedModel._from_config(config.vision_config)
1426
- self.model = Qwen2VLModel(config)
1427
- self.vocab_size = config.vocab_size
1428
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1429
- self.rope_deltas = None # cache rope_deltas here
1430
-
1431
- # Initialize weights and apply final processing
1432
- self.post_init()
1433
-
1434
- def get_input_embeddings(self):
1435
- return self.model.embed_tokens
1436
-
1437
- def set_input_embeddings(self, value):
1438
- self.model.embed_tokens = value
1439
-
1440
- def get_output_embeddings(self):
1441
- return self.lm_head
1442
-
1443
- def set_output_embeddings(self, new_embeddings):
1444
- self.lm_head = new_embeddings
1445
-
1446
- def set_decoder(self, decoder):
1447
- self.model = decoder
1448
-
1449
- def get_decoder(self):
1450
- return self.model
1451
-
1452
- def get_rope_index(
1453
- self,
1454
- input_ids: Optional[torch.LongTensor] = None,
1455
- image_grid_thw: Optional[torch.LongTensor] = None,
1456
- video_grid_thw: Optional[torch.LongTensor] = None,
1457
- attention_mask: Optional[torch.Tensor] = None,
1458
- ) -> Tuple[torch.Tensor, torch.Tensor]:
1459
- """
1460
- Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
1461
-
1462
- Explanation:
1463
- Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
1464
-
1465
- For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
1466
- Examples:
1467
- input_ids: [T T T T T], here T is for text.
1468
- temporal position_ids: [0, 1, 2, 3, 4]
1469
- height position_ids: [0, 1, 2, 3, 4]
1470
- width position_ids: [0, 1, 2, 3, 4]
1471
-
1472
- For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
1473
- and 1D rotary position embedding for text part.
1474
- Examples:
1475
- Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
1476
- input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
1477
- vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
1478
- vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
1479
- vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
1480
- text temporal position_ids: [3, 4, 5, 6, 7]
1481
- text height position_ids: [3, 4, 5, 6, 7]
1482
- text width position_ids: [3, 4, 5, 6, 7]
1483
- Here we calculate the text start position_ids as the max vision position_ids plus 1.
1484
-
1485
- Args:
1486
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1487
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1488
- it.
1489
- image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1490
- The temporal, height and width of feature shape of each image in LLM.
1491
- video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1492
- The temporal, height and width of feature shape of each video in LLM.
1493
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1494
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1495
-
1496
- - 1 for tokens that are **not masked**,
1497
- - 0 for tokens that are **masked**.
1498
-
1499
- Returns:
1500
- position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
1501
- mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
1502
- """
1503
- spatial_merge_size = self.config.vision_config.spatial_merge_size
1504
- image_token_id = self.config.image_token_id
1505
- video_token_id = self.config.video_token_id
1506
- vision_start_token_id = self.config.vision_start_token_id
1507
- mrope_position_deltas = []
1508
- if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
1509
- total_input_ids = input_ids
1510
- if attention_mask is None:
1511
- attention_mask = torch.ones_like(total_input_ids)
1512
- position_ids = torch.ones(
1513
- 3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device
1514
- )
1515
- image_index, video_index = 0, 0
1516
- for i, input_ids in enumerate(total_input_ids):
1517
- input_ids = input_ids[attention_mask[i].to(input_ids.device) == 1]
1518
- image_nums, video_nums = 0, 0
1519
- vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
1520
- vision_tokens = input_ids[vision_start_indices + 1]
1521
- image_nums = (vision_tokens == image_token_id).sum()
1522
- video_nums = (vision_tokens == video_token_id).sum()
1523
- input_tokens = input_ids.tolist()
1524
- llm_pos_ids_list: list = []
1525
- st = 0
1526
- remain_images, remain_videos = image_nums, video_nums
1527
- for _ in range(image_nums + video_nums):
1528
- if image_token_id in input_tokens and remain_images > 0:
1529
- ed_image = input_tokens.index(image_token_id, st)
1530
- else:
1531
- ed_image = len(input_tokens) + 1
1532
- if video_token_id in input_tokens and remain_videos > 0:
1533
- ed_video = input_tokens.index(video_token_id, st)
1534
- else:
1535
- ed_video = len(input_tokens) + 1
1536
- if ed_image < ed_video:
1537
- t, h, w = (
1538
- image_grid_thw[image_index][0],
1539
- image_grid_thw[image_index][1],
1540
- image_grid_thw[image_index][2],
1541
- )
1542
- image_index += 1
1543
- remain_images -= 1
1544
- ed = ed_image
1545
- else:
1546
- t, h, w = (
1547
- video_grid_thw[video_index][0],
1548
- video_grid_thw[video_index][1],
1549
- video_grid_thw[video_index][2],
1550
- )
1551
- video_index += 1
1552
- remain_videos -= 1
1553
- ed = ed_video
1554
- llm_grid_t, llm_grid_h, llm_grid_w = (
1555
- t.item(),
1556
- h.item() // spatial_merge_size,
1557
- w.item() // spatial_merge_size,
1558
- )
1559
- text_len = ed - st
1560
-
1561
- st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
1562
- llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
1563
-
1564
- t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
1565
- h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
1566
- w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
1567
- llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
1568
- st = ed + llm_grid_t * llm_grid_h * llm_grid_w
1569
-
1570
- if st < len(input_tokens):
1571
- st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
1572
- text_len = len(input_tokens) - st
1573
- llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
1574
-
1575
- llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
1576
- position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
1577
- mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
1578
- mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
1579
- return position_ids, mrope_position_deltas
1580
- else:
1581
- if attention_mask is not None:
1582
- position_ids = attention_mask.long().cumsum(-1) - 1
1583
- position_ids.masked_fill_(attention_mask == 0, 1)
1584
- position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
1585
- max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
1586
- mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
1587
- else:
1588
- position_ids = (
1589
- torch.arange(input_ids.shape[1], device=input_ids.device)
1590
- .view(1, 1, -1)
1591
- .expand(3, input_ids.shape[0], -1)
1592
- )
1593
- mrope_position_deltas = torch.zeros(
1594
- [input_ids.shape[0], 1],
1595
- device=input_ids.device,
1596
- dtype=input_ids.dtype,
1597
- )
1598
-
1599
- return position_ids, mrope_position_deltas
1600
-
1601
- @add_start_docstrings_to_model_forward(QWEN2_VL_INPUTS_DOCSTRING)
1602
- @replace_return_docstrings(output_type=Qwen2VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1603
- def forward(
1604
- self,
1605
- input_ids: torch.LongTensor = None,
1606
- attention_mask: Optional[torch.Tensor] = None,
1607
- position_ids: Optional[torch.LongTensor] = None,
1608
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1609
- inputs_embeds: Optional[torch.FloatTensor] = None,
1610
- labels: Optional[torch.LongTensor] = None,
1611
- use_cache: Optional[bool] = None,
1612
- output_attentions: Optional[bool] = None,
1613
- output_hidden_states: Optional[bool] = None,
1614
- return_dict: Optional[bool] = None,
1615
- pixel_values: Optional[torch.Tensor] = None,
1616
- pixel_values_videos: Optional[torch.FloatTensor] = None,
1617
- image_grid_thw: Optional[torch.LongTensor] = None,
1618
- video_grid_thw: Optional[torch.LongTensor] = None,
1619
- rope_deltas: Optional[torch.LongTensor] = None,
1620
- cache_position: Optional[torch.LongTensor] = None,
1621
- ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
1622
- r"""
1623
- Args:
1624
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1625
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1626
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1627
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1628
-
1629
- Returns:
1630
-
1631
- Example:
1632
-
1633
- ```python
1634
- >>> from PIL import Image
1635
- >>> import requests
1636
- >>> from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
1637
-
1638
- >>> model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
1639
- >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
1640
-
1641
- >>> messages = [
1642
- {
1643
- "role": "user",
1644
- "content": [
1645
- {"type": "image"},
1646
- {"type": "text", "text": "What is shown in this image?"},
1647
- ],
1648
- },
1649
- ]
1650
- >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
1651
- >>> image = Image.open(requests.get(url, stream=True).raw)
1652
-
1653
- >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
1654
- >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
1655
-
1656
- >>> # Generate
1657
- >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1658
- >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1659
- "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
1660
- ```"""
1661
-
1662
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1663
- output_hidden_states = (
1664
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1665
- )
1666
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1667
-
1668
- if inputs_embeds is None:
1669
- inputs_embeds = self.model.embed_tokens(input_ids)
1670
- if pixel_values is not None:
1671
- pixel_values = pixel_values.type(self.visual.get_dtype())
1672
- image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
1673
- n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
1674
- n_image_features = image_embeds.shape[0]
1675
- if n_image_tokens != n_image_features:
1676
- raise ValueError(
1677
- f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
1678
- )
1679
- image_mask = (
1680
- (input_ids == self.config.image_token_id)
1681
- .unsqueeze(-1)
1682
- .expand_as(inputs_embeds)
1683
- .to(inputs_embeds.device)
1684
- )
1685
- image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
1686
- inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
1687
-
1688
- if pixel_values_videos is not None:
1689
- pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
1690
- video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
1691
- n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
1692
- n_video_features = video_embeds.shape[0]
1693
- if n_video_tokens != n_video_features:
1694
- raise ValueError(
1695
- f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
1696
- )
1697
- video_mask = (
1698
- (input_ids == self.config.video_token_id)
1699
- .unsqueeze(-1)
1700
- .expand_as(inputs_embeds)
1701
- .to(inputs_embeds.device)
1702
- )
1703
- video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
1704
- inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
1705
-
1706
- if attention_mask is not None:
1707
- attention_mask = attention_mask.to(inputs_embeds.device)
1708
-
1709
- # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
1710
- if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
1711
- # calculate RoPE index once per generation in the pre-fill stage only
1712
- if (
1713
- (cache_position is not None and cache_position[0] == 0)
1714
- or self.rope_deltas is None
1715
- or (past_key_values is None or past_key_values.get_seq_length() == 0)
1716
- ):
1717
- position_ids, rope_deltas = self.get_rope_index(
1718
- input_ids, image_grid_thw, video_grid_thw, attention_mask
1719
- )
1720
- self.rope_deltas = rope_deltas
1721
- # then use the prev pre-calculated rope-deltas to get the correct position ids
1722
- else:
1723
- batch_size, seq_length, _ = inputs_embeds.shape
1724
- delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
1725
- position_ids = torch.arange(seq_length, device=inputs_embeds.device)
1726
- position_ids = position_ids.view(1, -1).expand(batch_size, -1)
1727
- if cache_position is not None: # otherwise `deltas` is an int `0`
1728
- delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
1729
- delta = delta.to(position_ids.device)
1730
- position_ids = position_ids.add(delta)
1731
- position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
1732
-
1733
- outputs = self.model(
1734
- input_ids=None,
1735
- position_ids=position_ids,
1736
- attention_mask=attention_mask,
1737
- past_key_values=past_key_values,
1738
- inputs_embeds=inputs_embeds,
1739
- use_cache=use_cache,
1740
- output_attentions=output_attentions,
1741
- output_hidden_states=output_hidden_states,
1742
- return_dict=return_dict,
1743
- cache_position=cache_position,
1744
- )
1745
-
1746
- hidden_states = outputs[0]
1747
- logits = self.lm_head(hidden_states)
1748
-
1749
- loss = None
1750
- if labels is not None:
1751
- # Upcast to float if we need to compute the loss to avoid potential precision issues
1752
- logits = logits.float()
1753
- # Shift so that tokens < n predict n
1754
- shift_logits = logits[..., :-1, :].contiguous()
1755
- shift_labels = labels[..., 1:].contiguous()
1756
- # Flatten the tokens
1757
- loss_fct = CrossEntropyLoss()
1758
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
1759
- shift_labels = shift_labels.view(-1)
1760
- # Enable model parallelism
1761
- shift_labels = shift_labels.to(shift_logits.device)
1762
- loss = loss_fct(shift_logits, shift_labels)
1763
-
1764
- if not return_dict:
1765
- output = (logits,) + outputs[1:]
1766
- return (loss,) + output if loss is not None else output
1767
-
1768
- return Qwen2VLCausalLMOutputWithPast(
1769
- loss=loss,
1770
- logits=logits,
1771
- past_key_values=outputs.past_key_values,
1772
- hidden_states=outputs.hidden_states,
1773
- attentions=outputs.attentions,
1774
- rope_deltas=self.rope_deltas,
1775
- )
1776
-
1777
- def prepare_inputs_for_generation(
1778
- self,
1779
- input_ids,
1780
- past_key_values=None,
1781
- attention_mask=None,
1782
- inputs_embeds=None,
1783
- cache_position=None,
1784
- position_ids=None,
1785
- use_cache=True,
1786
- pixel_values=None,
1787
- pixel_values_videos=None,
1788
- image_grid_thw=None,
1789
- video_grid_thw=None,
1790
- **kwargs,
1791
- ):
1792
- # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
1793
-
1794
- # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
1795
- # Exception 1: when passing input_embeds, input_ids may be missing entries
1796
- # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
1797
- # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
1798
- # (we can't check exception 3 while compiling)
1799
- # Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
1800
- # generate the first token for each sequence. Later use the generated Input ids for continuation.
1801
- if past_key_values is not None:
1802
- if inputs_embeds is not None and input_ids.shape[1] == 0: # Exception 4
1803
- inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :]
1804
- elif (
1805
- inputs_embeds is not None # Exception 1
1806
- or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3
1807
- ):
1808
- input_ids = input_ids[:, -cache_position.shape[0] :]
1809
- elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
1810
- input_ids = input_ids[:, cache_position]
1811
-
1812
- if cache_position[0] != 0:
1813
- pixel_values = None
1814
- pixel_values_videos = None
1815
-
1816
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1817
- if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]:
1818
- model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
1819
- else:
1820
- model_inputs = {"input_ids": input_ids, "inputs_embeds": None}
1821
-
1822
- if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
1823
- if model_inputs["inputs_embeds"] is not None:
1824
- batch_size, sequence_length, _ = inputs_embeds.shape
1825
- device = inputs_embeds.device
1826
- else:
1827
- batch_size, sequence_length = input_ids.shape
1828
- device = input_ids.device
1829
-
1830
- attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
1831
- attention_mask,
1832
- sequence_length=sequence_length,
1833
- target_length=past_key_values.get_max_cache_shape(),
1834
- dtype=self.lm_head.weight.dtype,
1835
- device=device,
1836
- cache_position=cache_position,
1837
- batch_size=batch_size,
1838
- config=self.config,
1839
- past_key_values=past_key_values,
1840
- )
1841
-
1842
- model_inputs.update(
1843
- {
1844
- "position_ids": position_ids,
1845
- "past_key_values": past_key_values,
1846
- "use_cache": use_cache,
1847
- "attention_mask": attention_mask,
1848
- "pixel_values": pixel_values,
1849
- "pixel_values_videos": pixel_values_videos,
1850
- "image_grid_thw": image_grid_thw,
1851
- "video_grid_thw": video_grid_thw,
1852
- "cache_position": cache_position,
1853
- }
1854
- )
1855
- return model_inputs
1856
-
1857
- def _get_image_nums_and_video_nums(
1858
- self,
1859
- input_ids: Optional[torch.LongTensor],
1860
- ) -> Tuple[torch.Tensor, torch.Tensor]:
1861
- """
1862
- Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
1863
- These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
1864
-
1865
- Args:
1866
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1867
- Indices of input sequence tokens in the vocabulary.
1868
-
1869
- Returns:
1870
- image_nums (`torch.LongTensor` of shape `(batch_size,)`): number of images in each sample
1871
- video_nums (`torch.LongTensor` of shape `(batch_size,)`): number of videos in each sample
1872
- """
1873
- image_token_id = self.config.image_token_id
1874
- video_token_id = self.config.video_token_id
1875
- vision_start_token_id = self.config.vision_start_token_id
1876
-
1877
- vision_start_mask = input_ids == vision_start_token_id
1878
- vision_first_mask = torch.roll(vision_start_mask, shifts=1, dims=1)
1879
- image_mask = input_ids == image_token_id
1880
- video_mask = input_ids == video_token_id
1881
- image_nums = torch.sum(vision_first_mask & image_mask, dim=1)
1882
- video_nums = torch.sum(vision_first_mask & video_mask, dim=1)
1883
-
1884
- return image_nums, video_nums
1885
-
1886
- def _expand_inputs_for_generation(
1887
- self,
1888
- expand_size: int = 1,
1889
- is_encoder_decoder: bool = False,
1890
- input_ids: Optional[torch.LongTensor] = None,
1891
- **model_kwargs,
1892
- ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
1893
- # Overwritten -- Support for expanding tensors without a batch size dimension
1894
- # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_t
1895
- # pixel_values.shape[0] is sum(seqlen_images for samples)
1896
- # image_grid_thw.shape[0] is sum(num_images for samples)
1897
-
1898
- if expand_size == 1:
1899
- return input_ids, model_kwargs
1900
-
1901
- visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"]
1902
-
1903
- def _expand_dict_for_generation_visual(dict_to_expand):
1904
- image_grid_thw = model_kwargs.get("image_grid_thw", None)
1905
- video_grid_thw = model_kwargs.get("video_grid_thw", None)
1906
- image_nums, video_nums = self._get_image_nums_and_video_nums(input_ids)
1907
-
1908
- def _repeat_interleave_samples(x, lengths, repeat_times):
1909
- samples = torch.split(x, lengths)
1910
- repeat_args = [repeat_times] + [1] * (x.dim() - 1)
1911
- result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
1912
- return result
1913
-
1914
- for key in dict_to_expand:
1915
- if key == "pixel_values":
1916
- # split images into samples
1917
- samples = torch.split(image_grid_thw, list(image_nums))
1918
- # compute the sequence length of images for each sample
1919
- lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
1920
- dict_to_expand[key] = _repeat_interleave_samples(
1921
- dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1922
- )
1923
- elif key == "image_grid_thw":
1924
- # get the num of images for each sample
1925
- lengths = list(image_nums)
1926
- dict_to_expand[key] = _repeat_interleave_samples(
1927
- dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1928
- )
1929
- elif key == "pixel_values_videos":
1930
- samples = torch.split(video_grid_thw, list(video_nums))
1931
- lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
1932
- dict_to_expand[key] = _repeat_interleave_samples(
1933
- dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1934
- )
1935
- elif key == "video_grid_thw":
1936
- lengths = list(video_nums)
1937
- dict_to_expand[key] = _repeat_interleave_samples(
1938
- dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1939
- )
1940
- elif key == "second_per_grid_ts":
1941
- if not isinstance(dict_to_expand[key], list):
1942
- raise TypeError(
1943
- f"Expected value for key '{key}' to be a list, but got {type(dict_to_expand[key])} instead."
1944
- )
1945
- tensor = torch.tensor(dict_to_expand[key])
1946
- lengths = list(video_nums)
1947
- tensor = _repeat_interleave_samples(tensor, lengths=lengths, repeat_times=expand_size)
1948
- dict_to_expand[key] = tensor.tolist()
1949
- return dict_to_expand
1950
-
1951
- def _expand_dict_for_generation(dict_to_expand):
1952
- for key in dict_to_expand:
1953
- if (
1954
- key != "cache_position"
1955
- and dict_to_expand[key] is not None
1956
- and isinstance(dict_to_expand[key], torch.Tensor)
1957
- and key not in visual_keys
1958
- ):
1959
- dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
1960
- return dict_to_expand
1961
-
1962
- # input_ids is required for expanding visual inputs
1963
- # If input_ids is unavailable, visual inputs will not be used; therefore, there is no need to expand visual inputs.
1964
- if input_ids is not None and input_ids.numel() != 0:
1965
- model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
1966
-
1967
- if input_ids is not None:
1968
- input_ids = input_ids.repeat_interleave(expand_size, dim=0)
1969
-
1970
- model_kwargs = _expand_dict_for_generation(model_kwargs)
1971
-
1972
- if is_encoder_decoder:
1973
- if model_kwargs.get("encoder_outputs") is None:
1974
- raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
1975
- model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
1976
-
1977
- return input_ids, model_kwargs
1978
-
1979
-
1980
- __all__ = ["Qwen2VLForConditionalGeneration", "Qwen2VLModel", "Qwen2VLPreTrainedModel"]
 
27
  import torch.nn as nn
28
  import torch.nn.functional as F
29
  import torch.utils.checkpoint
30
+ from torch.nn import LayerNorm
31
 
32
  from transformers.activations import ACT2FN
33
  from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
34
  from transformers.generation import GenerationMixin
35
  from transformers.modeling_attn_mask_utils import AttentionMaskConverter
36
+ from transformers.modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
37
  from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
38
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
39
  from transformers.modeling_utils import PreTrainedModel
40
+ from transformers.utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
41
+ from .configuration_qwen2_vl import Qwen2VLVisionConfig
 
 
 
 
 
 
 
 
42
 
43
 
44
+ if is_flash_attn_available():
45
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward, flash_attn_varlen_func
46
 
47
+ if is_torch_flex_attn_available():
48
+ from torch.nn.attention.flex_attention import BlockMask
 
49
 
50
+ from transformers.integrations.flex_attention import make_flex_block_causal_mask
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
+ logger = logging.get_logger(__name__)
54
 
55
  # Copied from transformers.models.llama.modeling_llama.rotate_half
56
  def rotate_half(x):
 
60
  return torch.cat((-x2, x1), dim=-1)
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def apply_rotary_pos_emb_vision(
64
  q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
65
  ) -> Tuple[torch.Tensor, torch.Tensor]:
66
  orig_q_dtype = q.dtype
67
  orig_k_dtype = k.dtype
68
  q, k = q.float(), k.float()
69
+ cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
70
  q_embed = (q * cos) + (rotate_half(q) * sin)
71
  k_embed = (k * cos) + (rotate_half(k) * sin)
72
  q_embed = q_embed.to(orig_q_dtype)
 
164
  "removed and `position_embeddings` will be mandatory."
165
  )
166
  emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
167
+ cos = emb.cos()
168
+ sin = emb.sin()
169
  else:
170
  cos, sin = position_embeddings
171
  q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
 
213
  "removed and `position_embeddings` will be mandatory."
214
  )
215
  emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
216
+ cos = emb.cos()
217
+ sin = emb.sin()
218
  else:
219
  cos, sin = position_embeddings
220
  q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
 
251
  "removed and `position_embeddings` will be mandatory."
252
  )
253
  emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
254
+ cos = emb.cos()
255
+ sin = emb.sin()
256
  else:
257
  cos, sin = position_embeddings
258
  q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
 
263
  q = q.transpose(0, 1)
264
  k = k.transpose(0, 1)
265
  v = v.transpose(0, 1)
266
+ attn_output = F.scaled_dot_product_attention(
267
+ q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0), attention_mask, dropout_p=0.0
268
+ )
269
+ attn_output = attn_output.squeeze(0).transpose(0, 1)
270
  attn_output = attn_output.reshape(seq_length, -1)
271
  attn_output = self.proj(attn_output)
272
  return attn_output
 
308
  return hidden_states
309
 
310
 
311
+ @auto_docstring
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  class Qwen2VLPreTrainedModel(PreTrainedModel):
 
313
  base_model_prefix = "model"
314
  supports_gradient_checkpointing = True
 
315
  _skip_keys_device_placement = "past_key_values"
316
  _supports_flash_attn_2 = True
317
  _supports_sdpa = True
 
319
  _supports_static_cache = False # TODO (joao): fix. torch.compile failing probably due to `cache_positions`
320
 
321
  def _init_weights(self, module):
322
+ std = self.config.get_text_config().initializer_range
323
  if isinstance(module, (nn.Linear, nn.Conv3d)):
324
  module.weight.data.normal_(mean=0.0, std=std)
325
  if module.bias is not None:
 
328
  module.weight.data.normal_(mean=0.0, std=std)
329
  if module.padding_idx is not None:
330
  module.weight.data[module.padding_idx].zero_()
331
+ elif isinstance(module, nn.LayerNorm):
332
+ module.weight.data.fill_(1.0)
333
+ module.bias.data.zero_()
334
 
335
 
336
+ @auto_docstring
337
  class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
338
  config_class = Qwen2VLVisionConfig
339
  _no_split_modules = ["Qwen2VLVisionBlock"]
 
395
  rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
396
  return rotary_pos_emb
397
 
398
+ @auto_docstring
399
  def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
400
+ r"""
401
+ grid_thw (`torch.LongTensor` of shape `(num_images, 3)`):
402
+ The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values.
403
+ """
404
  hidden_states = self.patch_embed(hidden_states)
405
  rotary_pos_emb = self.rot_pos_emb(grid_thw)
406
  emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
 
425
  hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings)
426
 
427
  return self.merger(hidden_states)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocessor_config.json CHANGED
@@ -11,6 +11,7 @@
11
  0.4578275,
12
  0.40821073
13
  ],
 
14
  "image_std": [
15
  0.26862954,
16
  0.26130258,
@@ -20,6 +21,7 @@
20
  "merge_size": 2,
21
  "min_pixels": 3136,
22
  "patch_size": 14,
 
23
  "resample": 3,
24
  "rescale_factor": 0.00392156862745098,
25
  "size": {
 
11
  0.4578275,
12
  0.40821073
13
  ],
14
+ "image_processor_type": "Qwen2VLImageProcessor",
15
  "image_std": [
16
  0.26862954,
17
  0.26130258,
 
21
  "merge_size": 2,
22
  "min_pixels": 3136,
23
  "patch_size": 14,
24
+ "processor_class": "Qwen2VLProcessor",
25
  "resample": 3,
26
  "rescale_factor": 0.00392156862745098,
27
  "size": {