qwen2vit600m / modeling_qwen2_vl.py

KaiChen1998

Upload 6 files

0cdbea6 verified about 1 month ago

55.2 kB

	# coding=utf-8
	# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
	#
	# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
	# and OPT implementations in this library. It has been modified from its
	# original forms to accommodate minor architectural differences compared
	# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""PyTorch Qwen2-VL model."""

	import math
	from dataclasses import dataclass
	from typing import Any, Dict, List, Optional, Tuple, Union

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.utils.checkpoint
	from torch.nn import CrossEntropyLoss, LayerNorm

	from transformers.activations import ACT2FN
	from transformers.cache_utils import Cache, StaticCache
	from transformers.modeling_attn_mask_utils import (
	AttentionMaskConverter,
	)
	from transformers.modeling_outputs import (
	BaseModelOutputWithPast,
	ModelOutput,
	)
	from .modeling_rope_utils import ROPE_INIT_FUNCTIONS
	from transformers.modeling_utils import PreTrainedModel
	from transformers.utils import (
	add_start_docstrings,
	add_start_docstrings_to_model_forward,
	is_torch_npu_available,
	is_flash_attn_2_available,
	is_flash_attn_greater_or_equal_2_10,
	logging,
	replace_return_docstrings,
	)
	from .configuration_qwen2_vl import Qwen2VLConfig, Qwen2VLVisionConfig

	# flash-attn is an optional dependency: its kernels are only imported when the package is available.
	if is_flash_attn_2_available():
	    from flash_attn import flash_attn_varlen_func

	    from transformers.modeling_flash_attention_utils import _flash_attention_forward
	else:
	    # NOTE(review): only `flash_attn_varlen_func` gets a placeholder here; `_flash_attention_forward`
	    # stays undefined on this branch.
	    flash_attn_varlen_func = None

	# Module-level logger obtained from the transformers logging utilities.
	logger = logging.get_logger(__name__)

	_CONFIG_FOR_DOC = "Qwen2VLConfig"

	# Evaluated once at import time; `PatchEmbed.forward` branches on this flag.
	USE_NPU = is_torch_npu_available()


	@dataclass
	class Qwen2VLCausalLMOutputWithPast(ModelOutput):
	    """
	    Base class for Qwen2VL causal language model (or autoregressive) outputs.

	    Every field defaults to `None`.

	    Args:
	        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
	            Language modeling loss (for next-token prediction).
	        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
	        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
	            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

	            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
	            `past_key_values` input) to speed up sequential decoding.
	        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
	            The rope index difference between sequence length and multimodal rope.
	    """

	    loss: Optional[torch.FloatTensor] = None
	    logits: torch.FloatTensor = None
	    past_key_values: Optional[List[torch.FloatTensor]] = None
	    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
	    attentions: Optional[Tuple[torch.FloatTensor]] = None
	    rope_deltas: Optional[torch.LongTensor] = None


	class Qwen2VLRotaryEmbedding(nn.Module):
	    """
	    Rotary position embedding that produces cos/sin tables for three position-id grids at once.

	    The inverse frequencies come from the `ROPE_INIT_FUNCTIONS` entry selected by `rope_type`. They are stored in
	    the non-persistent buffer `inv_freq`; a reference to the initial tensor is kept in `original_inv_freq` so that
	    "dynamic" rope types can restore it after a temporary rescale.

	    Args:
	        dim, max_position_embeddings, base, scaling_factor, rope_type:
	            Legacy parameterization, only used (and forwarded to the init function as keyword arguments) when
	            `config` is `None`.
	        device:
	            Device forwarded to the rope init function.
	        config (`Qwen2VLConfig`, *optional*):
	            When given, `rope_type` is read from `config.rope_scaling` (key `"rope_type"`, falling back to the
	            older `"type"` key) and the maximum length from `config.max_position_embeddings`.
	    """

	    def __init__(
	        self,
	        dim=None,
	        max_position_embeddings=2048,
	        base=10000,
	        device=None,
	        scaling_factor=1.0,
	        rope_type="default",
	        config: Optional[Qwen2VLConfig] = None,
	    ):
	        super().__init__()
	        # TODO (joao): remove the `if` below, only used for BC
	        self.rope_kwargs = {}
	        if config is None:
	            logger.warning_once(
	                "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the "
	                "`config` argument. All other arguments will be removed in v4.46"
	            )
	            self.rope_kwargs = {
	                "rope_type": rope_type,
	                "factor": scaling_factor,
	                "dim": dim,
	                "base": base,
	                "max_position_embeddings": max_position_embeddings,
	            }
	            self.rope_type = rope_type
	            self.max_seq_len_cached = max_position_embeddings
	            self.original_max_seq_len = max_position_embeddings
	        else:
	            # BC: "rope_type" was originally "type"
	            if config.rope_scaling is not None:
	                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
	            else:
	                self.rope_type = "default"
	            self.max_seq_len_cached = config.max_position_embeddings
	            self.original_max_seq_len = config.max_position_embeddings

	        self.config = config
	        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

	        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
	        self.register_buffer("inv_freq", inv_freq, persistent=False)
	        # Plain attribute (not a registered buffer): it keeps pointing at the initial frequencies.
	        self.original_inv_freq = self.inv_freq

	    def _dynamic_frequency_update(self, position_ids, device):
	        """
	        dynamic RoPE layers should recompute `inv_freq` in the following situations:
	        1 - growing beyond the cached sequence length (allow scaling)
	        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
	        """
	        seq_len = torch.max(position_ids) + 1
	        if seq_len > self.max_seq_len_cached:  # growth
	            inv_freq, self.attention_scaling = self.rope_init_fn(
	                self.config, device, seq_len=seq_len, **self.rope_kwargs
	            )
	            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
	            self.max_seq_len_cached = seq_len

	        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
	            # `original_inv_freq` is not a buffer, so `module.to(...)` never moves it. Bring it onto the
	            # current device before re-registering it, otherwise `inv_freq` ends up on the wrong device.
	            self.original_inv_freq = self.original_inv_freq.to(device)
	            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
	            self.max_seq_len_cached = self.original_max_seq_len

	    @torch.no_grad()
	    def forward(self, x, position_ids):
	        """
	        Compute the cos/sin rotary tables for `position_ids`.

	        Args:
	            x (`torch.Tensor`): Only its device and dtype are used.
	            position_ids (`torch.Tensor`): Position ids of shape `(3, batch_size, positions)`.

	        Returns:
	            `(cos, sin)`, both cast to `x.dtype` and multiplied by `attention_scaling`.
	        """
	        if "dynamic" in self.rope_type:
	            self._dynamic_frequency_update(position_ids, device=x.device)

	        # Core RoPE block. In contrast to other models, Qwen2_VL has different position ids for thw grids
	        # So we expand the inv_freq to shape (3, ...)
	        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
	        position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
	        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
	        device_type = x.device.type
	        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
	        with torch.autocast(device_type=device_type, enabled=False):
	            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
	            emb = torch.cat((freqs, freqs), dim=-1)
	            cos = emb.cos()
	            sin = emb.sin()

	        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
	        cos = cos * self.attention_scaling
	        sin = sin * self.attention_scaling

	        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


	# Copied from transformers.models.llama.modeling_llama.rotate_half
	def rotate_half(x):
	    """Swap the two halves of the last dimension, negating the half that moves to the front."""
	    midpoint = x.shape[-1] // 2
	    front = x[..., :midpoint]
	    back = x[..., midpoint:]
	    return torch.cat((-back, front), dim=-1)


	def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
	    """Rotate query and key tensors with multimodal (temporal / height / width) rotary embeddings
	    (https://qwenlm.github.io/blog/qwen2-vl/).

	    Explanation:
	        Multimodal 3D rotary position embedding extends the usual 1D rotary embedding. The channel dimension of
	        `cos` / `sin` is cut into chunks following `mrope_section` (repeated twice), and chunk number `i` is read
	        from position grid `i % 3` along the leading axis of `cos` / `sin`. When the three grids carry identical
	        position ids (the text-only case described by the Qwen2-VL authors) this reduces to the ordinary 1D rotary
	        embedding.

	    Args:
	        q (`torch.Tensor`): The query tensor.
	        k (`torch.Tensor`): The key tensor.
	        cos (`torch.Tensor`): The cosine part of the rotary embedding; its first axis indexes the position grids.
	        sin (`torch.Tensor`): The sine part of the rotary embedding; its first axis indexes the position grids.
	        mrope_section (`List(int)`):
	            Channel widths assigned to the temporal, height and width grids. The list is repeated twice before
	            splitting the last dimension of `cos` and `sin`.
	        unsqueeze_dim (`int`, *optional*, defaults to 1):
	            Axis inserted into the recombined `cos` / `sin` so they broadcast against `q` and `k`. Use 1 when `q`
	            and `k` are `[batch_size, heads, seq_len, head_dim]`, and 2 when they are
	            `[batch_size, seq_len, heads, head_dim]`.

	    Returns:
	        `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
	    """
	    doubled_sections = mrope_section * 2

	    def _select_grids(table):
	        # Chunk i of the channel axis is taken from position grid i % 3.
	        chunks = table.split(doubled_sections, dim=-1)
	        picked = [chunk[index % 3] for index, chunk in enumerate(chunks)]
	        return torch.cat(picked, dim=-1).unsqueeze(unsqueeze_dim)

	    cos = _select_grids(cos)
	    sin = _select_grids(sin)

	    rotated_q = (q * cos) + (rotate_half(q) * sin)
	    rotated_k = (k * cos) + (rotate_half(k) * sin)
	    return rotated_q, rotated_k


	def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
	    """Apply rotary embedding angles `freqs` to `tensor`, computing in float32 and returning the input dtype."""

	    def _as_broadcastable(table: torch.Tensor) -> torch.Tensor:
	        # Add a singleton axis at position 1, tile the last axis twice, then add a leading singleton axis.
	        return table.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()

	    input_dtype = tensor.dtype
	    tensor = tensor.float()
	    cos_table = _as_broadcastable(freqs.cos())
	    sin_table = _as_broadcastable(freqs.sin())
	    rotated = (tensor * cos_table) + (rotate_half(tensor) * sin_table)
	    return rotated.to(input_dtype)


	class VisionRotaryEmbedding(nn.Module):
	    """Builds the table of rotary angles (position times inverse frequency) used by the vision attention layers."""

	    def __init__(self, dim: int, theta: float = 10000.0) -> None:
	        super().__init__()
	        # One inverse frequency per even channel index in [0, dim).
	        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
	        self.register_buffer("inv_freq", 1.0 / (theta ** exponents), persistent=False)

	    def forward(self, seqlen: int) -> torch.Tensor:
	        """Return the `(seqlen, len(inv_freq))` outer product of positions `0..seqlen-1` with `inv_freq`."""
	        positions = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
	        return torch.outer(positions, self.inv_freq)


	class PatchEmbed(nn.Module):
	    """
	    Projects flattened image/video patches to `embed_dim` features.

	    The projection weights live in a bias-free `nn.Conv3d` whose kernel and stride both equal
	    `[temporal_patch_size, patch_size, patch_size]`, so each patch is reduced to a single output position.

	    Args:
	        patch_size (`int`): Spatial side length of a patch.
	        temporal_patch_size (`int`): Number of frames grouped into one patch.
	        in_channels (`int`): Number of input channels.
	        embed_dim (`int`): Output feature size per patch.
	    """

	    def __init__(
	        self,
	        patch_size: int = 14,
	        temporal_patch_size: int = 2,
	        in_channels: int = 3,
	        embed_dim: int = 1152,
	    ) -> None:
	        super().__init__()
	        self.patch_size = patch_size
	        self.temporal_patch_size = temporal_patch_size
	        self.in_channels = in_channels
	        self.embed_dim = embed_dim

	        kernel_size = [temporal_patch_size, patch_size, patch_size]
	        self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)

	    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
	        """
	        Embed patches.

	        Args:
	            hidden_states (`torch.Tensor`): Patch values; presumably one flattened patch per row
	                (`in_channels * temporal_patch_size * patch_size * patch_size` values) - confirm against caller.

	        Returns:
	            `torch.Tensor` with `embed_dim` features in the last dimension.
	        """
	        # Cast once, before either path: the linear path needs matching dtypes just like the conv path.
	        hidden_states = hidden_states.to(dtype=self.proj.weight.dtype)
	        if USE_NPU:
	            # Same projection expressed as a matmul against the flattened conv kernel.
	            return F.linear(hidden_states, self.proj.weight.view(self.embed_dim, -1))

	        hidden_states = hidden_states.view(
	            -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
	        )
	        return self.proj(hidden_states).view(-1, self.embed_dim)


	class PatchMerger(nn.Module):
	    """Layer-normalizes patch features, regroups them into rows of `hidden_size`, and maps them to `dim` via an MLP.

	    `hidden_size` equals `context_dim * spatial_merge_size ** 2`.
	    """

	    def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
	        super().__init__()
	        merged_width = context_dim * (spatial_merge_size ** 2)
	        self.hidden_size = merged_width
	        self.ln_q = LayerNorm(context_dim, eps=1e-6)
	        layers = [
	            nn.Linear(merged_width, merged_width),
	            nn.GELU(),
	            nn.Linear(merged_width, dim),
	        ]
	        self.mlp = nn.Sequential(*layers)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Normalize `x`, view it as `(-1, hidden_size)` and run the MLP."""
	        normalized = self.ln_q(x)
	        grouped = normalized.view(-1, self.hidden_size)
	        return self.mlp(grouped)


	class VisionMlp(nn.Module):
	    """Two-layer feed-forward block: linear up-projection, activation looked up in `ACT2FN`, linear down-projection."""

	    def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
	        super().__init__()
	        self.fc1 = nn.Linear(dim, hidden_dim)
	        self.act = ACT2FN[hidden_act]
	        self.fc2 = nn.Linear(hidden_dim, dim)

	    def forward(self, x) -> torch.Tensor:
	        """Return `fc2(act(fc1(x)))`."""
	        widened = self.fc1(x)
	        activated = self.act(widened)
	        return self.fc2(activated)


	class VisionAttention(nn.Module):
	    """Eager (matmul + softmax) self-attention over a packed sequence of vision tokens.

	    Tokens only attend within their own segment; segment boundaries are given by `cu_seqlens`.
	    """

	    def __init__(self, dim: int, num_heads: int = 16) -> None:
	        super().__init__()
	        self.num_heads = num_heads
	        self.head_dim = dim // num_heads
	        self.qkv = nn.Linear(dim, dim * 3, bias=True)
	        self.proj = nn.Linear(dim, dim)

	    def forward(
	        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
	    ) -> torch.Tensor:
	        """
	        Args:
	            hidden_states: Packed token features; the first dimension is the total token count.
	            cu_seqlens: Segment boundaries; consecutive entries delimit one attention block.
	            rotary_pos_emb: Rotary angles passed to `apply_rotary_pos_emb_vision` for queries and keys.

	        Returns:
	            Tensor with `seq_length` rows after the output projection.
	        """
	        seq_length = hidden_states.shape[0]
	        fused = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
	        q, k, v = fused.permute(1, 0, 2, 3).unbind(0)
	        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
	        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

	        # Start fully masked, then open up one square block per segment.
	        attention_mask = torch.full(
	            [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
	        )
	        for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
	            attention_mask[..., start:end, start:end] = 0

	        # (seq, heads, head_dim) -> (heads, seq, head_dim)
	        q, k, v = (states.transpose(0, 1) for states in (q, k, v))
	        scores = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
	        scores = scores + attention_mask
	        probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(q.dtype)
	        context = torch.matmul(probs, v).transpose(0, 1).reshape(seq_length, -1)
	        return self.proj(context)


	class VisionFlashAttention2(nn.Module):
	    """Self-attention over packed vision tokens computed with `flash_attn_varlen_func`."""

	    def __init__(self, dim: int, num_heads: int = 16) -> None:
	        super().__init__()
	        self.num_heads = num_heads
	        self.qkv = nn.Linear(dim, dim * 3, bias=True)
	        self.proj = nn.Linear(dim, dim)

	    def forward(
	        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
	    ) -> torch.Tensor:
	        """
	        Args:
	            hidden_states: Packed token features; the first dimension is the total token count.
	            cu_seqlens: Segment boundaries, passed to the kernel for both queries and keys.
	            rotary_pos_emb: Rotary angles passed to `apply_rotary_pos_emb_vision` for queries and keys.

	        Returns:
	            Tensor with `seq_length` rows after the output projection.
	        """
	        seq_length = hidden_states.shape[0]
	        fused = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
	        q, k, v = fused.permute(1, 0, 2, 3).unbind(0)
	        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
	        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

	        # Longest segment, i.e. the largest gap between consecutive boundaries.
	        segment_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
	        max_seqlen = segment_lengths.max().item()
	        context = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen)
	        context = context.reshape(seq_length, -1)
	        return self.proj(context)


	class VisionSdpaAttention(nn.Module):
	    """Self-attention over packed vision tokens computed with `F.scaled_dot_product_attention`.

	    A boolean mask restricts every token to its own segment, as delimited by `cu_seqlens`.
	    """

	    def __init__(self, dim: int, num_heads: int = 16) -> None:
	        super().__init__()
	        self.num_heads = num_heads
	        self.qkv = nn.Linear(dim, dim * 3, bias=True)
	        self.proj = nn.Linear(dim, dim)

	    def forward(
	        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
	    ) -> torch.Tensor:
	        """
	        Args:
	            hidden_states: Packed token features; the first dimension is the total token count.
	            cu_seqlens: Segment boundaries; consecutive entries delimit one attention block.
	            rotary_pos_emb: Rotary angles passed to `apply_rotary_pos_emb_vision` for queries and keys.

	        Returns:
	            Tensor with `seq_length` rows after the output projection.
	        """
	        seq_length = hidden_states.shape[0]
	        fused = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
	        q, k, v = fused.permute(1, 0, 2, 3).unbind(0)
	        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
	        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

	        # True marks pairs that may attend to each other: one square block per segment.
	        attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool)
	        for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
	            attention_mask[..., start:end, start:end] = True

	        # (seq, heads, head_dim) -> (heads, seq, head_dim)
	        q, k, v = (states.transpose(0, 1) for states in (q, k, v))
	        context = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
	        context = context.transpose(0, 1).reshape(seq_length, -1)
	        return self.proj(context)


	# Lookup table from attention-implementation name to the vision attention class;
	# `Qwen2VLVisionBlock` indexes it with its `attn_implementation` argument.
	QWEN2_VL_VISION_ATTENTION_CLASSES = {
	    "eager": VisionAttention,
	    "flash_attention_2": VisionFlashAttention2,
	    "sdpa": VisionSdpaAttention,
	}


	class Qwen2VLVisionBlock(nn.Module):
	    """Pre-norm vision transformer block: attention and MLP sub-layers, each wrapped in a residual connection.

	    Args:
	        config: Object providing `embed_dim`, `mlp_ratio`, `num_heads` and `hidden_act`.
	        attn_implementation (`str`): Key into `QWEN2_VL_VISION_ATTENTION_CLASSES`.
	    """

	    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
	        super().__init__()
	        width = config.embed_dim
	        self.norm1 = LayerNorm(width, eps=1e-6)
	        self.norm2 = LayerNorm(width, eps=1e-6)

	        attention_cls = QWEN2_VL_VISION_ATTENTION_CLASSES[attn_implementation]
	        self.attn = attention_cls(width, num_heads=config.num_heads)
	        self.mlp = VisionMlp(
	            dim=width,
	            hidden_dim=int(width * config.mlp_ratio),
	            hidden_act=config.hidden_act,
	        )

	    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
	        """Apply the attention sub-layer, then the MLP sub-layer, adding each result to its input."""
	        attended = self.attn(self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
	        hidden_states = hidden_states + attended
	        transformed = self.mlp(self.norm2(hidden_states))
	        return hidden_states + transformed


	# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
	def _prepare_4d_causal_attention_mask_with_cache_position(
	attention_mask: torch.Tensor,
	sequence_length: int,
	target_length: int,
	dtype: torch.dtype,
	device: torch.device,
	min_dtype: float,
	cache_position: torch.Tensor,
	batch_size: int,
	):
	"""
	Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
	`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

	Args:
	attention_mask (`torch.Tensor`):
	A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
	sequence_length (`int`):
	The sequence length being processed.
	target_length (`int`):
	The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
	dtype (`torch.dtype`):
	The dtype to use for the 4D attention mask.
	device (`torch.device`):
	The device to plcae the 4D attention mask on.
	min_dtype (`float`):
	The minimum value representable with the dtype `dtype`.
	cache_position (`torch.Tensor`):
	Indices depicting the position of the input sequence tokens in the sequence.
	batch_size (`torch.Tensor`):
	Batch size.
	"""
	if attention_mask is not None and attention_mask.dim() == 4:
	# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
	causal_mask = attention_mask
	else:
	causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
	if sequence_length != 1:
	causal_mask = torch.triu(causal_mask, diagonal=1)
	causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
	causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
	if attention_mask is not None:
	causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
	mask_length = attention_mask.shape[-1]
	padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
	padding_mask = padding_mask == 0
	causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
	padding_mask, min_dtype
	)

	return causal_mask


	# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm
	class Qwen2RMSNorm(nn.Module):
	    """Root-mean-square layer normalization with a learned per-channel scale."""

	    def __init__(self, hidden_size, eps=1e-6):
	        """
	        Qwen2RMSNorm is equivalent to T5LayerNorm
	        """
	        super().__init__()
	        self.weight = nn.Parameter(torch.ones(hidden_size))
	        self.variance_epsilon = eps

	    def forward(self, hidden_states):
	        """Normalize over the last dimension in float32, cast back to the input dtype, then apply `weight`."""
	        original_dtype = hidden_states.dtype
	        upcast = hidden_states.to(torch.float32)
	        mean_square = upcast.pow(2).mean(-1, keepdim=True)
	        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
	        return self.weight * normalized.to(original_dtype)

	    def extra_repr(self):
	        """Show the weight shape and epsilon in the module repr."""
	        weight_shape = tuple(self.weight.shape)
	        return f"{weight_shape}, eps={self.variance_epsilon}"


	# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2MLP
	class Qwen2MLP(nn.Module):
	    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""

	    def __init__(self, config):
	        super().__init__()
	        self.hidden_size = config.hidden_size
	        self.intermediate_size = config.intermediate_size
	        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
	        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
	        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
	        self.act_fn = ACT2FN[config.hidden_act]

	    def forward(self, hidden_state):
	        """Multiply the activated gate branch by the up-projected branch and project back to `hidden_size`."""
	        gate = self.act_fn(self.gate_proj(hidden_state))
	        gated = gate * self.up_proj(hidden_state)
	        return self.down_proj(gated)


	# Copied from transformers.models.llama.modeling_llama.repeat_kv
	def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
	    """
	    Repeat every key/value head `n_rep` times along the head axis, matching
	    `torch.repeat_interleave(x, dim=1, repeats=n_rep)`. The input of shape
	    (batch, num_key_value_heads, seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
	    With `n_rep == 1` the input tensor itself is returned.
	    """
	    batch, kv_heads, seq_len, head_dim = hidden_states.shape
	    if n_rep == 1:
	        return hidden_states
	    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the head axis.
	    tiled = hidden_states[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq_len, head_dim)
	    return tiled.reshape(batch, kv_heads * n_rep, seq_len, head_dim)


	class Qwen2VLAttention(nn.Module):
	    """
	    Eager multi-headed attention with grouped key/value heads and multimodal rotary position embeddings.

	    Queries use `config.num_attention_heads` heads and keys/values use `config.num_key_value_heads` heads, which
	    are repeated to match the query heads before the attention product. The q/k/v projections carry a bias, the
	    output projection does not.

	    Args:
	        config (`Qwen2VLConfig`): Model configuration.
	        layer_idx (`int`, *optional*): Index of this layer, used as the slot when updating the key/value cache.
	    """

	    def __init__(self, config: Qwen2VLConfig, layer_idx: Optional[int] = None):
	        super().__init__()
	        self.config = config
	        self.layer_idx = layer_idx
	        if layer_idx is None:
	            logger.warning_once(
	                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
	                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
	                "when creating this class."
	            )

	        self.hidden_size = config.hidden_size
	        self.num_heads = config.num_attention_heads
	        self.head_dim = self.hidden_size // self.num_heads
	        self.num_key_value_heads = config.num_key_value_heads
	        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
	        self.max_position_embeddings = config.max_position_embeddings
	        self.rope_theta = config.rope_theta
	        self.is_causal = True
	        self.attention_dropout = config.attention_dropout
	        self.rope_scaling = config.rope_scaling

	        if (self.head_dim * self.num_heads) != self.hidden_size:
	            raise ValueError(
	                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
	                f" and `num_heads`: {self.num_heads})."
	            )
	        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
	        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
	        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
	        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

	        # Fallback rotary module, only consulted when `forward` is not given `position_embeddings`.
	        self.rotary_emb = Qwen2VLRotaryEmbedding(
	            self.head_dim,
	            max_position_embeddings=self.max_position_embeddings,
	            base=self.rope_theta,
	        )

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_value: Optional[Cache] = None,
	        output_attentions: bool = False,
	        use_cache: bool = False,
	        cache_position: Optional[torch.LongTensor] = None,
	        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
	        """
	        Args:
	            hidden_states: Input of shape `(batch_size, q_len, hidden_size)`.
	            attention_mask: Additive 4D mask; it is sliced to the key length before being added to the scores.
	            position_ids: Position ids, only used to compute cos/sin when `position_embeddings` is `None`.
	            past_key_value: Cache object; when given it is updated with this layer's keys and values.
	            output_attentions: Whether to return the attention probabilities.
	            use_cache: Not read by this method.
	            cache_position: Forwarded to the cache update.
	            position_embeddings: Pre-computed `(cos, sin)` pair.

	        Returns:
	            `(attn_output, attn_weights or None, past_key_value)`.

	        Raises:
	            ValueError: If the attention output does not have shape `(batch_size, num_heads, q_len, head_dim)`.
	        """
	        bsz, q_len, _ = hidden_states.size()

	        query_states = self.q_proj(hidden_states)
	        key_states = self.k_proj(hidden_states)
	        value_states = self.v_proj(hidden_states)

	        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
	        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
	        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

	        # NOTE: the key/value length is not pre-computed here. Nothing in this method consumes it, and deriving
	        # it from `cache_position[0]` failed whenever a cache was passed without `cache_position`.

	        if position_embeddings is None:
	            logger.warning_once(
	                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
	                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
	                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
	                "removed and `position_embeddings` will be mandatory."
	            )
	            cos, sin = self.rotary_emb(value_states, position_ids)
	        else:
	            cos, sin = position_embeddings
	        query_states, key_states = apply_multimodal_rotary_pos_emb(
	            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
	        )

	        if past_key_value is not None:
	            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
	            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

	        # repeat k/v heads if n_kv_heads < n_heads
	        key_states = repeat_kv(key_states, self.num_key_value_groups)
	        value_states = repeat_kv(value_states, self.num_key_value_groups)

	        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

	        if attention_mask is not None:  # no matter the length, we just slice it
	            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
	            attn_weights = attn_weights + causal_mask

	        # upcast attention to fp32
	        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
	        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
	        attn_output = torch.matmul(attn_weights, value_states)

	        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
	            raise ValueError(
	                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
	                f" {attn_output.size()}"
	            )

	        attn_output = attn_output.transpose(1, 2).contiguous()
	        attn_output = attn_output.reshape(bsz, q_len, -1)

	        attn_output = self.o_proj(attn_output)

	        if not output_attentions:
	            attn_weights = None

	        return attn_output, attn_weights, past_key_value


	class Qwen2VLFlashAttention2(Qwen2VLAttention):
	"""
	Qwen2VL flash attention module, following Qwen2VL attention module. This module inherits from `Qwen2VLAttention`
	as the weights of the module stays untouched. The only required change would be on the forward pass
	where it needs to correctly call the public API of flash attention and deal with padding tokens
	in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
	config.max_window_layers layers.
	"""

	def __init__(self, args, *kwargs):
	super().__init__(args, *kwargs)

	# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
	# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
	# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
	self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

	def forward(
	self,
	hidden_states: torch.Tensor,
	attention_mask: Optional[torch.Tensor] = None,
	position_ids: Optional[torch.LongTensor] = None,
	past_key_value: Optional[Cache] = None,
	output_attentions: bool = False,
	use_cache: bool = False,
	cache_position: Optional[torch.LongTensor] = None,
	position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
	):
	bsz, q_len, _ = hidden_states.size()

	query_states = self.q_proj(hidden_states)
	key_states = self.k_proj(hidden_states)
	value_states = self.v_proj(hidden_states)

	query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
	key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
	value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

	kv_seq_len = key_states.shape[-2]
	if past_key_value is not None:
	if self.layer_idx is None:
	raise ValueError(
	f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
	"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
	"with a layer index."
	)
	kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

	# Because the input can be padded, the absolute sequence length depends on the max position id.
	if position_embeddings is None:
	logger.warning_once(
	"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
	"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
	"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
	"removed and `position_embeddings` will be mandatory."
	)
	cos, sin = self.rotary_emb(value_states, position_ids)
	else:
	cos, sin = position_embeddings

	query_states, key_states = apply_multimodal_rotary_pos_emb(
	query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
	)

	if past_key_value is not None:
	# Activate slicing cache only if the config has a value `sliding_windows` attribute
	cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
	if (
	getattr(self.config, "sliding_window", None) is not None
	and kv_seq_len > self.config.sliding_window
	and cache_has_contents
	):
	slicing_tokens = 1 - self.config.sliding_window

	past_key = past_key_value[self.layer_idx][0]
	past_value = past_key_value[self.layer_idx][1]

	past_key = past_key[:, :, slicing_tokens:, :].contiguous()
	past_value = past_value[:, :, slicing_tokens:, :].contiguous()

	if past_key.shape[-2] != self.config.sliding_window - 1:
	raise ValueError(
	f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
	f" {past_key.shape}"
	)

	if attention_mask is not None:
	attention_mask = attention_mask[:, slicing_tokens:]
	attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)

	cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
	key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

	# repeat k/v heads if n_kv_heads < n_heads
	key_states = repeat_kv(key_states, self.num_key_value_groups)
	value_states = repeat_kv(value_states, self.num_key_value_groups)
	dropout_rate = 0.0 if not self.training else self.attention_dropout

	# In PEFT, we usually cast the layer norms to float32 for training stability reasons,
	# so the input hidden states get silently cast to float32. Hence, we need to
	# cast them back to the lower-precision dtype just to be sure everything works as expected.
	input_dtype = query_states.dtype
	if input_dtype == torch.float32:
	if torch.is_autocast_enabled():
	target_dtype = torch.get_autocast_gpu_dtype()
	# Handle the case where the model is quantized
	elif hasattr(self.config, "_pre_quantization_dtype"):
	target_dtype = self.config._pre_quantization_dtype
	else:
	target_dtype = self.q_proj.weight.dtype

	logger.warning_once(
	f"The input hidden states seems to be silently casted in float32, this might be related to"
	f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
	f" {target_dtype}."
	)

	query_states = query_states.to(target_dtype)
	key_states = key_states.to(target_dtype)
	value_states = value_states.to(target_dtype)

	# Reshape to the expected shape for Flash Attention
	query_states = query_states.transpose(1, 2)
	key_states = key_states.transpose(1, 2)
	value_states = value_states.transpose(1, 2)

	if (
	self.config.use_sliding_window
	and getattr(self.config, "sliding_window", None) is not None
	and self.layer_idx >= self.config.max_window_layers
	):
	sliding_window = self.config.sliding_window
	else:
	sliding_window = None

	attn_output = _flash_attention_forward(
	query_states,
	key_states,
	value_states,
	attention_mask,
	q_len,
	dropout=dropout_rate,
	sliding_window=sliding_window,
	is_causal=self.is_causal,
	use_top_left_mask=self._flash_attn_uses_top_left_mask,
	)

	attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
	attn_output = self.o_proj(attn_output)

	if not output_attentions:
	attn_weights = None

	return attn_output, attn_weights, past_key_value


	class Qwen2VLSdpaAttention(Qwen2VLAttention):
	    """
	    Qwen2-VL attention module using `torch.nn.functional.scaled_dot_product_attention`. This module inherits from
	    `Qwen2VLAttention` as the weights of the module stay untouched. The only changes are on the forward pass, to
	    adapt it to the SDPA API.
	    """

	    # Adapted from Qwen2Attention.forward
	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_value: Optional[Cache] = None,
	        output_attentions: bool = False,
	        use_cache: bool = False,
	        cache_position: Optional[torch.LongTensor] = None,
	        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
	        """
	        Run self-attention through `scaled_dot_product_attention`.

	        Args:
	            hidden_states: 3-D input, unpacked as `(batch, q_len, hidden)`.
	            attention_mask: optional 4-D mask; its last dimension is sliced to the key length and it is handed
	                to SDPA as `attn_mask`. When it is `None` and `q_len > 1`, SDPA runs with `is_causal=True`.
	            position_ids: only used to compute `(cos, sin)` through `self.rotary_emb` when
	                `position_embeddings` is not supplied.
	            past_key_value: optional cache; when given, the new key/value states are pushed into it with
	                `update(...)` and the returned (cached + new) states are attended over.
	            output_attentions: SDPA cannot return attention weights, so `True` delegates the whole call to the
	                parent class implementation.
	            use_cache: not used on the SDPA path; forwarded to the parent implementation on fallback.
	            cache_position: forwarded to the cache update (and to the parent implementation on fallback).
	            position_embeddings: optional `(cos, sin)` pair computed by the caller.

	        Returns:
	            `(attn_output, None, past_key_value)` on the SDPA path, where `attn_output` is the `o_proj` output
	            for a `(batch, q_len, self.hidden_size)` tensor. On fallback, whatever the parent `forward` returns.
	        """
	        if output_attentions:
	            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
	            logger.warning_once(
	                "Qwen2VLModel is using Qwen2VLSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
	                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
	            )
	            # Forward `position_embeddings` as well: dropping it would make the parent recompute RoPE from
	            # `position_ids` (with a deprecation warning) and break callers that only provide the embeddings.
	            return super().forward(
	                hidden_states=hidden_states,
	                attention_mask=attention_mask,
	                position_ids=position_ids,
	                past_key_value=past_key_value,
	                output_attentions=output_attentions,
	                use_cache=use_cache,
	                cache_position=cache_position,
	                position_embeddings=position_embeddings,
	            )

	        bsz, q_len, _ = hidden_states.size()

	        query_states = self.q_proj(hidden_states)
	        key_states = self.k_proj(hidden_states)
	        value_states = self.v_proj(hidden_states)

	        # Split the projections into heads: (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim)
	        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
	        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
	        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

	        if position_embeddings is None:
	            logger.warning_once(
	                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
	                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
	                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
	                "removed and `position_embeddings` will be mandatory."
	            )
	            cos, sin = self.rotary_emb(value_states, position_ids)
	        else:
	            cos, sin = position_embeddings
	        query_states, key_states = apply_multimodal_rotary_pos_emb(
	            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
	        )

	        if past_key_value is not None:
	            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
	            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

	        # repeat k/v heads if n_kv_heads < n_heads
	        key_states = repeat_kv(key_states, self.num_key_value_groups)
	        value_states = repeat_kv(value_states, self.num_key_value_groups)

	        causal_mask = attention_mask
	        if attention_mask is not None:  # no matter the length, we just slice it
	            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

	        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
	        # Reference: https://github.com/pytorch/pytorch/issues/112577.
	        if query_states.device.type == "cuda" and attention_mask is not None:
	            query_states = query_states.contiguous()
	            key_states = key_states.contiguous()
	            value_states = value_states.contiguous()

	        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
	        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
	        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
	        is_causal = True if causal_mask is None and q_len > 1 else False

	        attn_output = torch.nn.functional.scaled_dot_product_attention(
	            query_states,
	            key_states,
	            value_states,
	            attn_mask=causal_mask,
	            dropout_p=self.attention_dropout if self.training else 0.0,
	            is_causal=is_causal,
	        )

	        # Merge the heads back: (bsz, heads, q_len, head_dim) -> (bsz, q_len, hidden_size)
	        attn_output = attn_output.transpose(1, 2).contiguous()
	        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

	        attn_output = self.o_proj(attn_output)

	        return attn_output, None, past_key_value


	# Maps the `config._attn_implementation` string to the attention class that
	# `Qwen2VLDecoderLayer` instantiates as its `self_attn` sub-module.
	QWEN2_VL_ATTENTION_CLASSES = {
	"eager": Qwen2VLAttention,
	"flash_attention_2": Qwen2VLFlashAttention2,
	"sdpa": Qwen2VLSdpaAttention,
	}


	class Qwen2VLDecoderLayer(nn.Module):
	    """Single decoder layer: normed self-attention and a normed MLP, each wrapped in a residual connection."""

	    def __init__(self, config: Qwen2VLConfig, layer_idx: int):
	        super().__init__()
	        self.hidden_size = config.hidden_size

	        # Only the flash-attention backend is treated as supporting sliding-window attention here.
	        lacks_sliding_window = config._attn_implementation != "flash_attention_2"
	        if config.use_sliding_window and lacks_sliding_window:
	            logger.warning_once(
	                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
	                "unexpected results may be encountered."
	            )

	        # Pick the attention class matching the configured backend.
	        attention_cls = QWEN2_VL_ATTENTION_CLASSES[config._attn_implementation]
	        self.self_attn = attention_cls(config, layer_idx)

	        self.mlp = Qwen2MLP(config)
	        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_value: Optional[Tuple[torch.Tensor]] = None,
	        output_attentions: Optional[bool] = False,
	        use_cache: Optional[bool] = False,
	        cache_position: Optional[torch.LongTensor] = None,
	        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
	        **kwargs,
	    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
	        """
	        Apply the attention sub-layer followed by the feed-forward sub-layer.

	        Args:
	            hidden_states (`torch.FloatTensor`): layer input of shape `(batch, seq_len, embed_dim)`.
	            attention_mask (`torch.FloatTensor`, optional): mask handed unchanged to the attention module.
	            position_ids (`torch.LongTensor`, optional): handed unchanged to the attention module.
	            past_key_value (optional): cached key/value states, handed unchanged to the attention module.
	            output_attentions (`bool`, optional):
	                When truthy, the attention weights returned by the attention module are added to the output.
	            use_cache (`bool`, optional):
	                When truthy, the key/value state returned by the attention module is added to the output.
	            cache_position (`torch.LongTensor`, optional): handed unchanged to the attention module.
	            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, optional):
	                Cosine and sine positional embeddings, handed unchanged to the attention module.
	            kwargs (`dict`, optional):
	                Accepted and ignored, so that wrappers which inject extra arguments (e.g. FSDP) keep working.

	        Returns:
	            A tuple starting with the new hidden states, followed by the attention weights (only if
	            `output_attentions`) and then the key/value state (only if `use_cache`).
	        """

	        # --- Self-attention sub-layer (pre-norm, then residual add) ---
	        attn_input = self.input_layernorm(hidden_states)
	        attn_output, self_attn_weights, present_key_value = self.self_attn(
	            hidden_states=attn_input,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_value=past_key_value,
	            output_attentions=output_attentions,
	            use_cache=use_cache,
	            cache_position=cache_position,
	            position_embeddings=position_embeddings,
	        )
	        hidden_states = hidden_states + attn_output

	        # --- Feed-forward sub-layer (pre-norm, then residual add) ---
	        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
	        hidden_states = hidden_states + mlp_output

	        # Optional entries are appended in a fixed order: attention weights first, cache second.
	        collected = [hidden_states]
	        if output_attentions:
	            collected.append(self_attn_weights)
	        if use_cache:
	            collected.append(present_key_value)

	        return tuple(collected)


	# Shared introductory docstring text; it is passed to `add_start_docstrings` when decorating
	# `Qwen2VLPreTrainedModel`. The text is consumed at runtime, so edit it with care.
	QWEN2VL_START_DOCSTRING = r"""
	This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
	library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
	etc.)

	This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
	Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
	and behavior.

	Parameters:
	config ([`Qwen2VLConfig`]):
	Model configuration class with all the parameters of the model. Initializing with a config file does not
	load the weights associated with the model, only the configuration. Check out the
	[`~PreTrainedModel.from_pretrained`] method to load the model weights.
	"""


	@add_start_docstrings(
	    "The bare Qwen2VL Model outputting raw hidden-states without any specific head on top.",
	    QWEN2VL_START_DOCSTRING,
	)
	class Qwen2VLPreTrainedModel(PreTrainedModel):
	    # NOTE: no class docstring on purpose; the decorator above supplies the documentation text.

	    # Class-level settings read by the `PreTrainedModel` base class.
	    config_class = Qwen2VLConfig
	    base_model_prefix = "model"
	    supports_gradient_checkpointing = True
	    _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"]
	    _skip_keys_device_placement = "past_key_values"
	    _supports_flash_attn_2 = True
	    _supports_sdpa = True
	    _supports_cache_class = True
	    _supports_static_cache = True

	    def _init_weights(self, module):
	        """
	        Initialise `module` in place.

	        Weights of `nn.Linear`, `nn.Conv3d` and `nn.Embedding` modules are drawn from a normal distribution
	        with mean 0 and standard deviation `config.initializer_range`. Linear/conv biases are zeroed, and so
	        is the embedding row at `padding_idx` when one is set. Any other module type is left untouched.
	        """
	        init_std = self.config.initializer_range

	        if isinstance(module, nn.Embedding):
	            module.weight.data.normal_(mean=0.0, std=init_std)
	            pad_row = module.padding_idx
	            if pad_row is not None:
	                # Keep the padding token's embedding at zero.
	                module.weight.data[pad_row].zero_()
	        elif isinstance(module, (nn.Linear, nn.Conv3d)):
	            module.weight.data.normal_(mean=0.0, std=init_std)
	            if module.bias is not None:
	                module.bias.data.zero_()


	class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
	    """Vision encoder: patch embedding, a stack of `Qwen2VLVisionBlock` layers, then a `PatchMerger`."""

	    config_class = Qwen2VLVisionConfig
	    _no_split_modules = ["Qwen2VLVisionBlock"]
	    gradient_checkpointing = False

	    def __init__(self, config) -> None:
	        super().__init__(config)
	        self.spatial_merge_size = config.spatial_merge_size

	        self.patch_embed = PatchEmbed(
	            patch_size=config.patch_size,
	            temporal_patch_size=config.temporal_patch_size,
	            in_channels=config.in_channels,
	            embed_dim=config.embed_dim,
	        )

	        # The rotary embedding is built with half of the per-head width.
	        per_head_width = config.embed_dim // config.num_heads
	        self.rotary_pos_emb = VisionRotaryEmbedding(per_head_width // 2)

	        ###################
	        # Modify for NPU
	        ###################
	        # Backend preference: SDPA on NPU or on torch newer than 2.1.2, then flash-attention 2, then eager.
	        if is_torch_npu_available() or torch.__version__ > "2.1.2":
	            _attn_implementation = "sdpa"
	        elif is_flash_attn_2_available():
	            _attn_implementation = "flash_attention_2"
	        else:
	            _attn_implementation = "eager"

	        vision_blocks = [Qwen2VLVisionBlock(config, _attn_implementation) for _ in range(config.depth)]
	        self.blocks = nn.ModuleList(vision_blocks)
	        self.merger = PatchMerger(
	            dim=config.hidden_size, context_dim=config.embed_dim, spatial_merge_size=config.spatial_merge_size
	        )

	    def get_dtype(self) -> torch.dtype:
	        """Return the dtype of the first block's `mlp.fc2` weight."""
	        reference_weight = self.blocks[0].mlp.fc2.weight
	        return reference_weight.dtype

	    def get_device(self) -> torch.device:
	        """Return the device of the first block's `mlp.fc2` weight."""
	        reference_weight = self.blocks[0].mlp.fc2.weight
	        return reference_weight.device

	    def rot_pos_emb(self, grid_thw):
	        """
	        Build the rotary position embedding for every patch described by `grid_thw`.

	        `grid_thw` is iterated row by row as `(t, h, w)`. For each row, the (row, column) index of every cell
	        of the `h x w` grid is emitted in an order where the cells of each
	        `spatial_merge_size x spatial_merge_size` window are adjacent, and that sequence is repeated `t` times.
	        The indices are then used to gather rows from the rotary table.
	        """
	        merge = self.spatial_merge_size

	        def _window_order(index_grid, h, w):
	            # Split both axes into (blocks, merge), bring the two block axes to the front, then flatten.
	            blocked = index_grid.reshape(h // merge, merge, w // merge, merge)
	            return blocked.permute(0, 2, 1, 3).flatten()

	        per_item_ids = []
	        for t, h, w in grid_thw:
	            row_ids = _window_order(torch.arange(h).unsqueeze(1).expand(-1, w), h, w)
	            col_ids = _window_order(torch.arange(w).unsqueeze(0).expand(h, -1), h, w)
	            # The same (row, col) pairs are reused for each of the `t` temporal steps.
	            per_item_ids.append(torch.stack([row_ids, col_ids], dim=-1).repeat(t, 1))
	        pos_ids = torch.cat(per_item_ids, dim=0)

	        # Build the table once, sized by the largest height/width present, then look up each position.
	        rotary_table = self.rotary_pos_emb(grid_thw[:, 1:].max())
	        return rotary_table[pos_ids].flatten(1)

	    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
	        """
	        Encode patches and return the output of `self.merger`.

	        Args:
	            hidden_states: input handed to `self.patch_embed`.
	            grid_thw: 2-D tensor whose columns are read as `(t, h, w)`.
	        """
	        hidden_states = self.patch_embed(hidden_states)
	        rotary_pos_emb = self.rot_pos_emb(grid_thw)

	        # Cumulative sequence boundaries: one segment of h * w entries per temporal step, with a leading 0.
	        segment_lengths = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0])
	        cu_seqlens = segment_lengths.cumsum(dim=0, dtype=torch.int32)
	        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0).to(hidden_states.device)

	        checkpoint_blocks = self.gradient_checkpointing and self.training
	        for blk in self.blocks:
	            if checkpoint_blocks:
	                # Recompute activations in the backward pass instead of storing them.
	                hidden_states = torch.utils.checkpoint.checkpoint(blk, hidden_states, cu_seqlens, rotary_pos_emb)
	            else:
	                hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)

	        return self.merger(hidden_states)