Flying-Lynx committed
Commit c315863 · 1 Parent(s): 717fa66

add model code
llava/__init__.py ADDED
@@ -0,0 +1 @@
+ from .model import LlavaQwenSlowFastForCausalLM
llava/constants.py ADDED
@@ -0,0 +1,15 @@
+ # This file is from https://github.com/haotian-liu/LLaVA/
+
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
+ WORKER_HEART_BEAT_INTERVAL = 15
+
+ LOGDIR = "."
+
+ # Model Constants
+ IGNORE_INDEX = -100
+ IMAGE_TOKEN_INDEX = -200
+ DEFAULT_IMAGE_TOKEN = "<image>"
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+ DEFAULT_IM_START_TOKEN = "<im_start>"
+ DEFAULT_IM_END_TOKEN = "<im_end>"
+ IMAGE_PLACEHOLDER = "<image-placeholder>"
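
For orientation, these constants drive the multimodal tokenization used throughout the repo: DEFAULT_IMAGE_TOKEN is the placeholder written into prompts, IMAGE_TOKEN_INDEX is the sentinel id spliced into the token stream where visual features are later inserted, and IGNORE_INDEX masks positions out of the training loss. A minimal sketch, assuming a Qwen2 tokenizer (the checkpoint name is only an assumption) and using tokenizer_image_token from llava/mm_utils.py added further down:

from transformers import AutoTokenizer
from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.mm_utils import tokenizer_image_token

# Hypothetical checkpoint name; any Qwen2-style tokenizer behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

prompt = DEFAULT_IMAGE_TOKEN + "\nWhat is happening in this video?"
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")

# The text is tokenized normally; a single -200 sentinel marks where the
# visual tokens will later be spliced in by the model.
assert (input_ids == IMAGE_TOKEN_INDEX).sum().item() == 1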
llava/conversation.py ADDED
@@ -0,0 +1,487 @@
1
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
2
+
3
+ import dataclasses
4
+ from enum import auto, Enum
5
+ from typing import List, Tuple
6
+ import base64
7
+ from io import BytesIO
8
+ from PIL import Image
9
+ import re
10
+
11
+ class SeparatorStyle(Enum):
12
+ """Different separator style."""
13
+ SINGLE = auto()
14
+ TWO = auto()
15
+ MPT = auto()
16
+ PLAIN = auto()
17
+ LLAMA_2 = auto()
18
+ QWEN = auto()
19
+ CHATML = auto()
20
+
21
+
22
+ @dataclasses.dataclass
23
+ class Conversation:
24
+ """A class that keeps all conversation history."""
25
+ system: str
26
+ roles: List[str]
27
+ messages: List[List[str]]
28
+ offset: int
29
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
30
+ sep: str = "###"
31
+ sep2: str = None
32
+ version: str = "Unknown"
33
+
34
+ skip_next: bool = False
35
+
36
+ def get_prompt(self):
37
+ messages = self.messages
38
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
39
+ messages = self.messages.copy()
40
+ init_role, init_msg = messages[0].copy()
41
+ init_msg = init_msg[0].replace("<image>", "").strip()
42
+ if 'mmtag' in self.version:
43
+ messages[0] = (init_role, init_msg)
44
+ messages.insert(0, (self.roles[0], "<Image><image></Image>"))
45
+ messages.insert(1, (self.roles[1], "Received."))
46
+ else:
47
+ messages[0] = (init_role, "<image>\n" + init_msg)
48
+
49
+ if self.sep_style == SeparatorStyle.SINGLE:
50
+ ret = self.system + self.sep
51
+ for role, message in messages:
52
+ if message:
53
+ if type(message) is tuple:
54
+ message, _, _ = message
55
+ ret += role + ": " + message + self.sep
56
+ else:
57
+ ret += role + ":"
58
+ elif self.sep_style == SeparatorStyle.TWO:
59
+ seps = [self.sep, self.sep2]
60
+ ret = self.system + seps[0]
61
+ for i, (role, message) in enumerate(messages):
62
+ if message:
63
+ if type(message) is tuple:
64
+ message, _, _ = message
65
+ ret += role + ": " + message + seps[i % 2]
66
+ else:
67
+ ret += role + ":"
68
+
69
+ elif self.sep_style == SeparatorStyle.CHATML:
70
+ ret = "" if self.system == "" else self.system + self.sep + "\n"
71
+ for role, message in messages:
72
+ if message:
73
+ if type(message) is tuple:
74
+ message, images, _ = message
75
+ message = "<image>" * len(images) + message
76
+ ret += role + "\n" + message + self.sep + "\n"
77
+ else:
78
+ ret += role + "\n"
79
+
80
+ elif self.sep_style == SeparatorStyle.MPT:
81
+ ret = self.system + self.sep
82
+ for role, message in messages:
83
+ if message:
84
+ if type(message) is tuple:
85
+ message, _, _ = message
86
+ ret += role + message + self.sep
87
+ else:
88
+ ret += role
89
+ elif self.sep_style == SeparatorStyle.LLAMA_2:
90
+ wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
91
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
92
+ ret = ""
93
+
94
+ for i, (role, message) in enumerate(messages):
95
+ if i == 0:
96
+ assert message, "first message should not be none"
97
+ assert role == self.roles[0], "first message should come from user"
98
+ if message:
99
+ if type(message) is tuple:
100
+ message, _, _ = message
101
+ if i == 0: message = wrap_sys(self.system) + message
102
+ if i % 2 == 0:
103
+ message = wrap_inst(message)
104
+ ret += self.sep + message
105
+ else:
106
+ ret += " " + message + " " + self.sep2
107
+ else:
108
+ ret += ""
109
+ ret = ret.lstrip(self.sep)
110
+ elif self.sep_style == SeparatorStyle.PLAIN:
111
+ seps = [self.sep, self.sep2]
112
+ ret = self.system
113
+ for i, (role, message) in enumerate(messages):
114
+ if message:
115
+ if type(message) is tuple:
116
+ message, _, _ = message
117
+ ret += message + seps[i % 2]
118
+ else:
119
+ ret += ""
120
+ else:
121
+ raise ValueError(f"Invalid style: {self.sep_style}")
122
+
123
+ return ret
124
+
125
+ def append_message(self, role, message):
126
+ self.messages.append([role, message])
127
+
128
+ def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
129
+ if image_process_mode == "Pad":
130
+ def expand2square(pil_img, background_color=(122, 116, 104)):
131
+ width, height = pil_img.size
132
+ if width == height:
133
+ return pil_img
134
+ elif width > height:
135
+ result = Image.new(pil_img.mode, (width, width), background_color)
136
+ result.paste(pil_img, (0, (width - height) // 2))
137
+ return result
138
+ else:
139
+ result = Image.new(pil_img.mode, (height, height), background_color)
140
+ result.paste(pil_img, ((height - width) // 2, 0))
141
+ return result
142
+ image = expand2square(image)
143
+ elif image_process_mode in ["Default", "Crop"]:
144
+ pass
145
+ elif image_process_mode == "Resize":
146
+ image = image.resize((336, 336))
147
+ else:
148
+ raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
149
+ if max(image.size) > max_len:
150
+ max_hw, min_hw = max(image.size), min(image.size)
151
+ aspect_ratio = max_hw / min_hw
152
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
153
+ longest_edge = int(shortest_edge * aspect_ratio)
154
+ W, H = image.size
155
+ if H > W:
156
+ H, W = longest_edge, shortest_edge
157
+ else:
158
+ H, W = shortest_edge, longest_edge
159
+ image = image.resize((W, H))
160
+ if return_pil:
161
+ return image
162
+ else:
163
+ buffered = BytesIO()
164
+ image.save(buffered, format=image_format)
165
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
166
+ return img_b64_str
167
+
168
+ def is_image_file(self, filename):
169
+ image_extensions = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"]
170
+ return any(filename.lower().endswith(ext) for ext in image_extensions)
171
+
172
+ def is_video_file(self, filename):
173
+ video_extensions = [".mp4", ".mov", ".avi", ".mkv", ".wmv", ".flv", ".mpeg", ".mpg"]
174
+ return any(filename.lower().endswith(ext) for ext in video_extensions)
175
+
176
+ def get_images(self, return_pil=False):
177
+ images = []
178
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
179
+ if i % 2 == 0:
180
+ if type(msg) is tuple:
181
+ msg, image, image_process_mode = msg
182
+ image = self.process_image(image, image_process_mode, return_pil=return_pil)
183
+ images.append(image)
184
+ return images
185
+
186
+ def to_gradio_chatbot(self):
187
+ ret = []
188
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
189
+ if i % 2 == 0:
190
+ if type(msg) is tuple:
191
+ msg, image, image_process_mode = msg
192
+ if type(image) != list:
193
+ image = [image]
194
+ if len(image) == 1:
195
+ msg = "<image>\n" + msg.replace("<image>", "").strip()
196
+ else:
197
+ msg = re.sub(r"(<image>)\n(?=<image>)", r"\1 ", msg)
198
+
199
+ img_str_list = []
200
+ for img in image:
201
+ if self.is_image_file(img):
202
+ img_b64_str = self.process_image(img, "Default", return_pil=False, image_format="JPEG")
203
+ img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" style="max-width: 256px; max-height: 256px; width: auto; height: auto; object-fit: contain;"/>'
204
+ img_str_list.append(img_str)
205
+ elif self.is_video_file(img):
206
+ ret.append(((img,), None))
207
+
208
+ msg = msg.strip()
209
+ img_place_holder = ""
210
+ for img_str in img_str_list:
211
+ img_place_holder += f"{img_str}\n\n"
212
+
213
+ if len(img_str_list) > 0:
214
+ msg = f"{img_place_holder}\n\n{msg}"
215
+
216
+ if len(msg) > 0:
217
+ ret.append([msg, None])
218
+ else:
219
+ ret.append([msg, None])
220
+ else:
221
+ ret[-1][-1] = msg
222
+ return ret
223
+
224
+ def copy(self):
225
+ return Conversation(
226
+ system=self.system,
227
+ roles=self.roles,
228
+ messages=[[x, y] for x, y in self.messages],
229
+ offset=self.offset,
230
+ sep_style=self.sep_style,
231
+ sep=self.sep,
232
+ sep2=self.sep2,
233
+ version=self.version)
234
+
235
+ def dict(self):
236
+ if len(self.get_images()) > 0:
237
+ return {
238
+ "system": self.system,
239
+ "roles": self.roles,
240
+ "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
241
+ "offset": self.offset,
242
+ "sep": self.sep,
243
+ "sep2": self.sep2,
244
+ }
245
+ return {
246
+ "system": self.system,
247
+ "roles": self.roles,
248
+ "messages": self.messages,
249
+ "offset": self.offset,
250
+ "sep": self.sep,
251
+ "sep2": self.sep2,
252
+ }
253
+
254
+
255
+ conv_vicuna_v0 = Conversation(
256
+ system="A chat between a curious human and an artificial intelligence assistant. "
257
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
258
+ roles=("Human", "Assistant"),
259
+ messages=(
260
+ ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
261
+ ("Assistant",
262
+ "Renewable energy sources are those that can be replenished naturally in a relatively "
263
+ "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
264
+ "Non-renewable energy sources, on the other hand, are finite and will eventually be "
265
+ "depleted, such as coal, oil, and natural gas. Here are some key differences between "
266
+ "renewable and non-renewable energy sources:\n"
267
+ "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
268
+ "energy sources are finite and will eventually run out.\n"
269
+ "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
270
+ "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
271
+ "and other negative effects.\n"
272
+ "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
273
+ "have lower operational costs than non-renewable sources.\n"
274
+ "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
275
+ "locations than non-renewable sources.\n"
276
+ "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
277
+ "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
278
+ "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
279
+ "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
280
+ ),
281
+ offset=2,
282
+ sep_style=SeparatorStyle.SINGLE,
283
+ sep="###",
284
+ )
285
+
286
+ conv_vicuna_v1 = Conversation(
287
+ system="A chat between a curious user and an artificial intelligence assistant. "
288
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
289
+ roles=("USER", "ASSISTANT"),
290
+ version="v1",
291
+ messages=(),
292
+ offset=0,
293
+ sep_style=SeparatorStyle.TWO,
294
+ sep=" ",
295
+ sep2="</s>",
296
+ )
297
+
298
+ conv_llama_2 = Conversation(
299
+ system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
300
+
301
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
302
+ roles=("USER", "ASSISTANT"),
303
+ version="llama_v2",
304
+ messages=(),
305
+ offset=0,
306
+ sep_style=SeparatorStyle.LLAMA_2,
307
+ sep="<s>",
308
+ sep2="</s>",
309
+ )
310
+
311
+ conv_llava_llama_2 = Conversation(
312
+ system="You are a helpful language and vision assistant. "
313
+ "You are able to understand the visual content that the user provides, "
314
+ "and assist the user with a variety of tasks using natural language.",
315
+ roles=("USER", "ASSISTANT"),
316
+ version="llama_v2",
317
+ messages=(),
318
+ offset=0,
319
+ sep_style=SeparatorStyle.LLAMA_2,
320
+ sep="<s>",
321
+ sep2="</s>",
322
+ )
323
+
324
+ conv_mpt = Conversation(
325
+ system="""<|im_start|>system
326
+ A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
327
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
328
+ version="mpt",
329
+ messages=(),
330
+ offset=0,
331
+ sep_style=SeparatorStyle.MPT,
332
+ sep="<|im_end|>",
333
+ )
334
+
335
+ conv_qwen = Conversation(
336
+ system="""<|im_start|>system
337
+ You are a helpful assistant.""",
338
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
339
+ version="qwen",
340
+ messages=[],
341
+ offset=0,
342
+ sep_style=SeparatorStyle.CHATML,
343
+ sep="<|im_end|>",
344
+ )
345
+
346
+ conv_llava_plain = Conversation(
347
+ system="",
348
+ roles=("", ""),
349
+ messages=(
350
+ ),
351
+ offset=0,
352
+ sep_style=SeparatorStyle.PLAIN,
353
+ sep="\n",
354
+ )
355
+
356
+ conv_llava_v0 = Conversation(
357
+ system="A chat between a curious human and an artificial intelligence assistant. "
358
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
359
+ roles=("Human", "Assistant"),
360
+ messages=(
361
+ ),
362
+ offset=0,
363
+ sep_style=SeparatorStyle.SINGLE,
364
+ sep="###",
365
+ )
366
+
367
+ conv_llava_v0_mmtag = Conversation(
368
+ system="A chat between a curious user and an artificial intelligence assistant. "
369
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
370
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
371
+ roles=("Human", "Assistant"),
372
+ messages=(
373
+ ),
374
+ offset=0,
375
+ sep_style=SeparatorStyle.SINGLE,
376
+ sep="###",
377
+ version="v0_mmtag",
378
+ )
379
+
380
+ conv_llava_v1 = Conversation(
381
+ system="A chat between a curious human and an artificial intelligence assistant. "
382
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
383
+ roles=("USER", "ASSISTANT"),
384
+ version="v1",
385
+ messages=(),
386
+ offset=0,
387
+ sep_style=SeparatorStyle.TWO,
388
+ sep=" ",
389
+ sep2="</s>",
390
+ )
391
+
392
+ conv_llava_v1_mmtag = Conversation(
393
+ system="A chat between a curious user and an artificial intelligence assistant. "
394
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
395
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
396
+ roles=("USER", "ASSISTANT"),
397
+ messages=(),
398
+ offset=0,
399
+ sep_style=SeparatorStyle.TWO,
400
+ sep=" ",
401
+ sep2="</s>",
402
+ version="v1_mmtag",
403
+ )
404
+
405
+ conv_mistral_instruct = Conversation(
406
+ system="",
407
+ roles=("USER", "ASSISTANT"),
408
+ version="llama_v2",
409
+ messages=(),
410
+ offset=0,
411
+ sep_style=SeparatorStyle.LLAMA_2,
412
+ sep="",
413
+ sep2="</s>",
414
+ )
415
+
416
+ conv_chatml_direct = Conversation(
417
+ system="""<|im_start|>system
418
+ Answer the questions.""",
419
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
420
+ version="mpt",
421
+ messages=(),
422
+ offset=0,
423
+ sep_style=SeparatorStyle.MPT,
424
+ sep="<|im_end|>",
425
+ )
426
+
427
+ conv_yi34b_chatml_direct = Conversation(
428
+ system="""<|im_start|>system
429
+ Answer the questions.""",
430
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
431
+ version="mpt-yi-34b",
432
+ messages=(),
433
+ offset=0,
434
+ sep_style=SeparatorStyle.MPT,
435
+ sep="<|im_end|>",
436
+ )
437
+
438
+ conv_llama3 = Conversation(
439
+ system="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""",
440
+ roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
441
+ version="llama3",
442
+ messages=(),
443
+ offset=0,
444
+ sep_style=SeparatorStyle.MPT,
445
+ sep="<|eot_id|>",
446
+ )
447
+
448
+ conv_chatml_direct = Conversation(
449
+ system="""<|im_start|>system
450
+ Answer the questions.""",
451
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
452
+ version="mpt",
453
+ messages=[],
454
+ offset=0,
455
+ sep_style=SeparatorStyle.MPT,
456
+ sep="<|im_end|>",
457
+ )
458
+
459
+ default_conversation = conv_vicuna_v1
460
+ conv_templates = {
461
+ "default": conv_vicuna_v0,
462
+ "v0": conv_vicuna_v0,
463
+ "v1": conv_vicuna_v1,
464
+ "vicuna_v1": conv_vicuna_v1,
465
+ "llama_2": conv_llama_2,
466
+ "mistral_instruct": conv_mistral_instruct,
467
+ "chatml_direct": conv_chatml_direct,
468
+ "yi_34b_chatml_direct": conv_yi34b_chatml_direct,
469
+ "mistral_direct": conv_chatml_direct,
470
+
471
+ "plain": conv_llava_plain,
472
+ "v0_plain": conv_llava_plain,
473
+ "llava_v0": conv_llava_v0,
474
+ "v0_mmtag": conv_llava_v0_mmtag,
475
+ "llava_v1": conv_llava_v1,
476
+ "v1_mmtag": conv_llava_v1_mmtag,
477
+ "llava_llama_2": conv_llava_llama_2,
478
+
479
+ "mpt": conv_mpt,
480
+ "llama3": conv_llama3,
481
+ "qwen_1_5": conv_qwen,
482
+ "qwen_2": conv_qwen,
483
+ }
484
+
485
+
486
+ if __name__ == "__main__":
487
+ print(default_conversation.get_prompt())
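
For reference, a minimal sketch of how these templates are consumed at inference time: pick a template from conv_templates, copy() it so the shared module-level object is not mutated, append the user turn and an empty assistant turn, then render the prompt. The question text is made up for illustration.

from llava.conversation import conv_templates

conv = conv_templates["qwen_2"].copy()      # CHATML-style template used with the Qwen2 backbone
conv.append_message(conv.roles[0], "<image>\nDescribe the video.")
conv.append_message(conv.roles[1], None)    # leave the assistant slot open for generation
prompt = conv.get_prompt()

# prompt is now:
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <image>
# Describe the video.<|im_end|>
# <|im_start|>assistant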
llava/mm_utils.py ADDED
@@ -0,0 +1,335 @@
1
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
2
+
3
+ from PIL import Image
4
+ from io import BytesIO
5
+ import base64
6
+ import torch
7
+ import math
8
+ import ast
9
+
10
+ from transformers import StoppingCriteria
11
+ from llava.constants import IMAGE_TOKEN_INDEX
12
+
13
+ from decord import VideoReader
14
+ from decord import cpu
15
+
16
+ import av
17
+ from av.codec.context import CodecContext
18
+ import numpy as np
19
+
20
+ def get_frame_indices(total_frames, original_fps, target_fps, num_frm):
21
+ sample_fps = round(original_fps / target_fps)
22
+ frame_idx = [i for i in range(0, total_frames, sample_fps)]
23
+ if len(frame_idx) < num_frm:
24
+ # If we have fewer frames than num_frm, just return all the frames
25
+ return frame_idx
26
+ scale = 1.0 * len(frame_idx) / num_frm
27
+ uniform_idx = [round((i + 1) * scale - 1) for i in range(num_frm)]
28
+ frame_idx = [frame_idx[i] for i in uniform_idx]
29
+ return frame_idx
30
+
31
+ def read_video_decord(video_path, num_frm=16, target_fps=2):
32
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
33
+ total_frames = len(vr)
34
+ original_fps = vr.get_avg_fps()
35
+
36
+ target_fps = min(target_fps, original_fps) # target fps should not exceed the video fps
37
+ indices = get_frame_indices(total_frames, original_fps, target_fps, num_frm)
38
+ frames = vr.get_batch(indices)
39
+ vr.seek(0)
40
+
41
+ # video info string
42
+ total_time = total_frames/original_fps
43
+ video_info_string = f"Time: {round(total_time, 2)}s; Time interval between frame {round(total_time/len(indices),3)}s; video tokens:"
44
+
45
+ return frames.asnumpy(), video_info_string
46
+
47
+ def read_video_pyav2(video_path, num_frm=16, target_fps=1, threads=4):
48
+ container = av.open(video_path)
49
+ stream = container.streams.video[0]
50
+
51
+ stream.thread_type = 'AUTO'
52
+ stream.codec_context.thread_count = threads
53
+
54
+ original_fps = stream.average_rate
55
+ total_frames = stream.frames
56
+
57
+ if "webm" not in video_path and "mkv" not in video_path:
58
+ try:
59
+ indices = get_frame_indices(total_frames, original_fps, target_fps, num_frm)
60
+ frames = record_video_length_stream(container, indices)
61
+ except Exception:
62
+ container = av.open(video_path)
63
+ frames = record_video_length_packet(container)
64
+ total_frames = len(frames)
65
+ indices = get_frame_indices(total_frames, original_fps, target_fps, num_frm)
66
+ frames = [frames[i] for i in indices]
67
+ else:
68
+ frames = record_video_length_packet(container)
69
+ total_frames = len(frames)
70
+ indices = get_frame_indices(total_frames, original_fps, target_fps, num_frm)
71
+ frames = [frames[i] for i in indices]
72
+
73
+ return np.stack([x.to_ndarray(format="rgb24") for x in frames])
74
+
75
+ # This one is faster
76
+ def record_video_length_stream(container, indices):
77
+ frames = []
78
+ start_index = indices[0]
79
+ end_index = indices[-1]
80
+ for i, frame in enumerate(container.decode(video=0)):
81
+ if i > end_index:
82
+ break
83
+ if i >= start_index and i in indices:
84
+ frames.append(frame)
85
+ return frames
86
+
87
+
88
+ # This one works for all types of video
89
+ def record_video_length_packet(container):
90
+ frames = []
91
+ # https://github.com/PyAV-Org/PyAV/issues/1269
92
+ # https://www.cnblogs.com/beyond-tester/p/17641872.html
93
+ # context = CodecContext.create("libvpx-vp9", "r")
94
+ for packet in container.demux(video=0):
95
+ for frame in packet.decode():
96
+ frames.append(frame)
97
+ return frames
98
+
99
+
100
+ def select_best_resolution(original_size, possible_resolutions):
101
+ """
102
+ Selects the best resolution from a list of possible resolutions based on the original size.
103
+
104
+ Args:
105
+ original_size (tuple): The original size of the image in the format (width, height).
106
+ possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
107
+
108
+ Returns:
109
+ tuple: The best fit resolution in the format (width, height).
110
+ """
111
+ original_width, original_height = original_size
112
+ best_fit = None
113
+ max_effective_resolution = 0
114
+ min_wasted_resolution = float('inf')
115
+
116
+ for width, height in possible_resolutions:
117
+ scale = min(width / original_width, height / original_height)
118
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
119
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
120
+ wasted_resolution = (width * height) - effective_resolution
121
+
122
+ if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
123
+ max_effective_resolution = effective_resolution
124
+ min_wasted_resolution = wasted_resolution
125
+ best_fit = (width, height)
126
+
127
+ return best_fit
128
+
129
+
130
+ def resize_and_pad_image(image, target_resolution):
131
+ """
132
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
133
+
134
+ Args:
135
+ image (PIL.Image.Image): The input image.
136
+ target_resolution (tuple): The target resolution (width, height) of the image.
137
+
138
+ Returns:
139
+ PIL.Image.Image: The resized and padded image.
140
+ """
141
+ original_width, original_height = image.size
142
+ target_width, target_height = target_resolution
143
+
144
+ scale_w = target_width / original_width
145
+ scale_h = target_height / original_height
146
+
147
+ if scale_w < scale_h:
148
+ new_width = target_width
149
+ new_height = min(math.ceil(original_height * scale_w), target_height)
150
+ else:
151
+ new_height = target_height
152
+ new_width = min(math.ceil(original_width * scale_h), target_width)
153
+
154
+ # Resize the image
155
+ resized_image = image.resize((new_width, new_height))
156
+
157
+ new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
158
+ paste_x = (target_width - new_width) // 2
159
+ paste_y = (target_height - new_height) // 2
160
+ new_image.paste(resized_image, (paste_x, paste_y))
161
+
162
+ return new_image
163
+
164
+
165
+ def divide_to_patches(image, patch_size):
166
+ """
167
+ Divides an image into patches of a specified size.
168
+
169
+ Args:
170
+ image (PIL.Image.Image): The input image.
171
+ patch_size (int): The size of each patch.
172
+
173
+ Returns:
174
+ list: A list of PIL.Image.Image objects representing the patches.
175
+ """
176
+ patches = []
177
+ width, height = image.size
178
+ for i in range(0, height, patch_size):
179
+ for j in range(0, width, patch_size):
180
+ box = (j, i, j + patch_size, i + patch_size)
181
+ patch = image.crop(box)
182
+ patches.append(patch)
183
+
184
+ return patches
185
+
186
+
187
+ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
188
+ """
189
+ Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
190
+
191
+ Args:
192
+ image_size (tuple): The size of the input image in the format (width, height).
193
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
194
+ patch_size (int): The size of each image patch.
195
+
196
+ Returns:
197
+ tuple: The shape of the image patch grid in the format (width, height).
198
+ """
199
+ if type(grid_pinpoints) is list:
200
+ possible_resolutions = grid_pinpoints
201
+ else:
202
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
203
+ width, height = select_best_resolution(image_size, possible_resolutions)
204
+ return width // patch_size, height // patch_size
205
+
206
+
207
+ def process_anyres_image(image, processor, grid_pinpoints):
208
+ """
209
+ Process an image with variable resolutions.
210
+
211
+ Args:
212
+ image (PIL.Image.Image): The input image to be processed.
213
+ processor: The image processor object.
214
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
215
+
216
+ Returns:
217
+ torch.Tensor: A tensor containing the processed image patches.
218
+ """
219
+ if type(grid_pinpoints) is list:
220
+ possible_resolutions = grid_pinpoints
221
+ else:
222
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
223
+ best_resolution = select_best_resolution(image.size, possible_resolutions)
224
+ image_padded = resize_and_pad_image(image, best_resolution)
225
+
226
+ patches = divide_to_patches(image_padded, processor.crop_size['height'])
227
+
228
+ image_original_resize = image.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))
229
+
230
+ image_patches = [image_original_resize] + patches
231
+ image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
232
+ for image_patch in image_patches]
233
+ return torch.stack(image_patches, dim=0)
234
+
235
+
236
+ def load_image_from_base64(image):
237
+ return Image.open(BytesIO(base64.b64decode(image)))
238
+
239
+
240
+ def expand2square(pil_img, background_color):
241
+ width, height = pil_img.size
242
+ if width == height:
243
+ return pil_img
244
+ elif width > height:
245
+ result = Image.new(pil_img.mode, (width, width), background_color)
246
+ result.paste(pil_img, (0, (width - height) // 2))
247
+ return result
248
+ else:
249
+ result = Image.new(pil_img.mode, (height, height), background_color)
250
+ result.paste(pil_img, ((height - width) // 2, 0))
251
+ return result
252
+
253
+
254
+ def process_images(images, image_processor, model_cfg):
255
+ image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
256
+ new_images = []
257
+ if image_aspect_ratio == 'pad':
258
+ for image in images:
259
+ image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
260
+ image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
261
+ new_images.append(image)
262
+ elif image_aspect_ratio == "anyres":
263
+ for image in images:
264
+ image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
265
+ new_images.append(image)
266
+ else:
267
+ return image_processor(images, return_tensors='pt')['pixel_values']
268
+ if all(x.shape == new_images[0].shape for x in new_images):
269
+ new_images = torch.stack(new_images, dim=0)
270
+ return new_images
271
+
272
+
273
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
274
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
275
+
276
+ def insert_separator(X, sep):
277
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
278
+
279
+ input_ids = []
280
+ offset = 0
281
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
282
+ offset = 1
283
+ input_ids.append(prompt_chunks[0][0])
284
+
285
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
286
+ input_ids.extend(x[offset:])
287
+
288
+ if return_tensors is not None:
289
+ if return_tensors == 'pt':
290
+ return torch.tensor(input_ids, dtype=torch.long)
291
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
292
+ return input_ids
293
+
294
+
295
+ def get_model_name_from_path(model_path):
296
+ model_path = model_path.strip("/")
297
+ model_paths = model_path.split("/")
298
+ if model_paths[-1].startswith('checkpoint-'):
299
+ return model_paths[-2] + "_" + model_paths[-1]
300
+ else:
301
+ return model_paths[-1]
302
+
303
+ class KeywordsStoppingCriteria(StoppingCriteria):
304
+ def __init__(self, keywords, tokenizer, input_ids):
305
+ self.keywords = keywords
306
+ self.keyword_ids = []
307
+ self.max_keyword_len = 0
308
+ for keyword in keywords:
309
+ cur_keyword_ids = tokenizer(keyword).input_ids
310
+ if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
311
+ cur_keyword_ids = cur_keyword_ids[1:]
312
+ if len(cur_keyword_ids) > self.max_keyword_len:
313
+ self.max_keyword_len = len(cur_keyword_ids)
314
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
315
+ self.tokenizer = tokenizer
316
+ self.start_len = input_ids.shape[1]
317
+
318
+ def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
319
+ offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
320
+ self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
321
+ for keyword_id in self.keyword_ids:
322
+ truncated_output_ids = output_ids[0, -keyword_id.shape[0]:]
323
+ if torch.equal(truncated_output_ids, keyword_id):
324
+ return True
325
+ outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
326
+ for keyword in self.keywords:
327
+ if keyword in outputs:
328
+ return True
329
+ return False
330
+
331
+ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
332
+ outputs = []
333
+ for i in range(output_ids.shape[0]):
334
+ outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
335
+ return all(outputs)
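
Putting the pieces of this file together, a hedged preprocessing sketch for a single video; the file path is a placeholder, and tokenizer, image_processor, and model are assumed to come from load_pretrained_model in llava/model/builder.py below.

from PIL import Image
from llava.constants import IMAGE_TOKEN_INDEX
from llava.mm_utils import read_video_decord, process_images, tokenizer_image_token

frames, video_info = read_video_decord("clip.mp4", num_frm=16, target_fps=2)   # (T, H, W, 3) uint8
pil_frames = [Image.fromarray(frame) for frame in frames]

# `image_processor`, `model`, and `tokenizer` are assumed to come from load_pretrained_model().
video_tensor = process_images(pil_frames, image_processor, model.config)        # (T, 3, h, w)
input_ids = tokenizer_image_token("<image>\nDescribe the video.", tokenizer,
                                  IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0)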
llava/model/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .language_model.llava_qwen2 import LlavaQwenForCausalLM, LlavaQwenConfig
+ from .language_model.llava_qwen2_slowfast import LlavaQwenSlowFastForCausalLM, LlavaQwenSlowFastConfig
llava/model/builder.py ADDED
@@ -0,0 +1,73 @@
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
+
+ # Copyright 2023 Haotian Liu
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ import os
+ import warnings
+ import shutil
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
+ import torch
+ from llava.model import *
+ from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+
+
+ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
+     kwargs = {"device_map": device_map, **kwargs}
+
+     if device != "cuda":
+         kwargs['device_map'] = {"": device}
+
+     if load_8bit:
+         kwargs['load_in_8bit'] = True
+     elif load_4bit:
+         kwargs['load_in_4bit'] = True
+         kwargs['quantization_config'] = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_quant_type='nf4'
+         )
+     else:
+         kwargs['torch_dtype'] = torch.float16
+
+     if use_flash_attn:
+         kwargs['attn_implementation'] = 'flash_attention_2'
+
+     tokenizer = AutoTokenizer.from_pretrained(model_path)
+     model = LlavaQwenSlowFastForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+
+     mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+     mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+     if mm_use_im_patch_token:
+         tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+     if mm_use_im_start_end:
+         tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+     model.resize_token_embeddings(len(tokenizer))
+
+     vision_tower = model.get_vision_tower()
+     if not vision_tower.is_loaded:
+         vision_tower.load_model(device_map=device_map)
+     if device_map != 'auto':
+         vision_tower.to(device=device_map, dtype=torch.float16)
+     image_processor = vision_tower.image_processor
+
+     if hasattr(model.config, "max_sequence_length"):
+         context_len = model.config.max_sequence_length
+     else:
+         context_len = 2048
+
+     return tokenizer, model, image_processor, context_len
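
A minimal loading sketch built on load_pretrained_model; the checkpoint path is a placeholder, and downstream generation follows the usual LLaVA pattern rather than anything specific to this commit.

from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path

model_path = "path/to/slowfast-llava-qwen2"   # placeholder checkpoint directory
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path, None, get_model_name_from_path(model_path), use_flash_attn=True
)
model.eval()
# Generation then follows the usual LLaVA pattern: tokenize the prompt with
# tokenizer_image_token, preprocess the video frames with image_processor, and
# call model.generate; see the model classes under llava/model/language_model/.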
llava/model/language_model/hybrid_decoder_layer.py ADDED
@@ -0,0 +1,1473 @@
1
+ """PyTorch Qwen2 model."""
2
+
3
+ import math
4
+ from typing import List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ import torch.utils.checkpoint
8
+ from torch import nn
9
+ from einops import rearrange
10
+
11
+ from transformers.cache_utils import Cache
12
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
13
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
14
+ from transformers.utils import (
15
+ is_flash_attn_2_available,
16
+ is_flash_attn_greater_or_equal_2_10
17
+ )
18
+ from transformers.activations import ACT2FN
19
+
20
+ if is_flash_attn_2_available():
21
+ from flash_attn.bert_padding import index_first_axis
22
+ from flash_attn import flash_attn_varlen_func
23
+
24
+
25
+ class ScaleDotProductCrossAttention(nn.Module):
26
+
27
+ def __init__(self, layer_number, softmax_scale=None, attention_dropout=0.0):
28
+ super().__init__()
29
+ self.layer_number = layer_number
30
+ self.softmax_scale = softmax_scale
31
+ self.dropout_p = attention_dropout
32
+
33
+ def forward(self, q, k, v, attn_mask=None):
34
+ """Implements the multihead softmax attention.
35
+ Arguments
36
+ ---------
37
+ q, k, v: The tensors containing the query, key, and value. (B, Head, S, D)
38
+ """
39
+ # (N,...,L,E)
40
+
41
+ if attn_mask is not None:
42
+ attn_mask = attn_mask[:,None,:,:].repeat(1, q.shape[1], 1, 1)
43
+
44
+ # attention mask, True means it will take part in attention B H s_q s_k
45
+ if self.training:
46
+ dropout_p = self.dropout_p
47
+ else:
48
+ dropout_p = 0.0
49
+
50
+ if q.device.type == "cuda" and attn_mask is not None:
51
+ q = q.contiguous()
52
+ k = k.contiguous()
53
+ v = v.contiguous()
54
+
55
+ # debug only, calculate the FLOPs for cross-attn
56
+ ##################
57
+ # attn_weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(128) # hardcode
58
+ # if attn_mask is not None: # no matter the length, we just slice it
59
+ # causal_mask = attn_mask[:, :, :, : k.shape[-2]]
60
+ # attn_weights = attn_weights + causal_mask
61
+
62
+ # # upcast attention to fp32
63
+ # attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
64
+ # # attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
65
+ # o = torch.matmul(attn_weights, v)
66
+ ###################
67
+
68
+ o = nn.functional.scaled_dot_product_attention(q, k, v,
69
+ attn_mask=attn_mask,
70
+ dropout_p=dropout_p,
71
+ is_causal=False,
72
+ scale=self.softmax_scale)
73
+
74
+ # B Head L D -> L B (Head D)
75
+ o = rearrange(o, 'B Head L D -> B L (Head D)').contiguous()
76
+
77
+ return o
78
+
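
A small shape sanity-check for the wrapper above, using random tensors (sizes are arbitrary). Queries, keys, and values are laid out as (batch, heads, seq, head_dim), and the boolean mask is (batch, q_len, kv_len) with True marking key/value positions that take part in attention:

import torch

xattn = ScaleDotProductCrossAttention(layer_number=0)
B, H, Lq, Lkv, D = 2, 4, 8, 32, 64
q = torch.randn(B, H, Lq, D)
k = torch.randn(B, H, Lkv, D)
v = torch.randn(B, H, Lkv, D)
mask = torch.ones(B, Lq, Lkv, dtype=torch.bool)    # every kv position visible to every query

out = xattn(q, k, v, attn_mask=mask)
assert out.shape == (B, Lq, H * D)                 # heads are merged back into the hidden dimension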
79
+ class FlashAttnCrossAttention(nn.Module):
80
+
81
+ def __init__(self, layer_number, softmax_scale=None, attention_dropout=0.0):
82
+ super().__init__()
83
+ self.layer_number = layer_number
84
+ self.softmax_scale = softmax_scale
85
+ self.dropout_p = attention_dropout
86
+
87
+ def _get_unpad_data(self, attention_mask: torch.Tensor):
88
+ """
89
+ Retrieves indexing data required to repad unpadded (ragged) tensors.
90
+
91
+ Arguments:
92
+ attention_mask (`torch.Tensor`):
93
+ Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
94
+
95
+ Return:
96
+ indices (`torch.Tensor`):
97
+ The indices of non-masked tokens from the flattened input sequence.
98
+ cu_seqlens (`torch.Tensor`):
99
+ The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
100
+ max_seqlen_in_batch (`int`):
101
+ Maximum sequence length in batch.
102
+ """
103
+ seqlens_in_batch = attention_mask[:, 0, :].sum(dim=-1, dtype=torch.int32) # attn mask are the same for the query dimension, pick the first query
104
+ indices = torch.nonzero(attention_mask[:, 0, :].flatten(), as_tuple=False).flatten()
105
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
106
+ cu_seqlens = nn.functional.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
107
+ return (
108
+ indices,
109
+ cu_seqlens,
110
+ max_seqlen_in_batch,
111
+ seqlens_in_batch
112
+ )
113
+ def unpad_q(self, q_layer):
114
+ # no need to unpad, just flatten
115
+
116
+ batch_size, q_seq_len, num_key_value_heads, head_dim = q_layer.shape
117
+ cu_seqlens_q = torch.tensor([q_seq_len] * batch_size, dtype=torch.int32, device=q_layer.device)
118
+ cu_seqlens_q = nn.functional.pad(torch.cumsum(cu_seqlens_q, dim=0, dtype=torch.int32), (1, 0))
119
+ q_layer = q_layer.reshape(batch_size * q_seq_len, num_key_value_heads, head_dim)
120
+
121
+ return (
122
+ q_layer,
123
+ cu_seqlens_q,
124
+ q_seq_len)
125
+ def unpad_kv(self, key_layer, value_layer, attn_mask):
126
+
127
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k, split_size = self._get_unpad_data(attn_mask)
128
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
129
+
130
+ key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k)
131
+ value_layer = index_first_axis(
132
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
133
+ )
134
+
135
+ return (
136
+ key_layer,
137
+ value_layer,
138
+ indices_k,
139
+ cu_seqlens_k,
140
+ max_seqlen_in_batch_k,
141
+ split_size)
142
+
143
+ def forward(self, q, k, v, attn_mask=None):
144
+ """
145
+ Implements the multihead softmax attention with flash attention varlen api.
146
+ Unpad the kv sequence
147
+ Arguments
148
+ ---------
149
+ q, k, v: The tensors containing the query, key, and value. (B, Head, S, D)
150
+ """
151
+ # (N,...,L,E)
152
+ q = q.transpose(1, 2)
153
+ k = k.transpose(1, 2)
154
+ v = v.transpose(1, 2)
155
+
156
+ # NOTE: don't know if it's necessary
157
+ if q.device.type == "cuda" and attn_mask is not None:
158
+ q = q.contiguous()
159
+ k = k.contiguous()
160
+ v = v.contiguous()
161
+
162
+ # batch_size = q.shape[0]
163
+ # first unpad the q and kv, get cu_seq_len and indices
164
+ batch_size, q_seq_len, head_num, head_dim = q.shape
165
+ q, cu_seq_lens_q, max_seqlen_in_batch_q = self.unpad_q(q)
166
+ k, v, indices_kv, cu_seq_lens_kv, max_seqlen_in_batch_kv, split_size = self.unpad_kv(k, v, attn_mask)
167
+
168
+ attn_output = flash_attn_varlen_func(
169
+ q,
170
+ k,
171
+ v,
172
+ cu_seqlens_q=cu_seq_lens_q,
173
+ cu_seqlens_k=cu_seq_lens_kv,
174
+ max_seqlen_q=max_seqlen_in_batch_q,
175
+ max_seqlen_k=max_seqlen_in_batch_kv,
176
+ dropout_p=self.dropout_p if self.training else 0.0,
177
+ softmax_scale=None,
178
+ causal=False,
179
+ # **flash_kwargs
180
+ )
181
+
182
+ return attn_output.reshape(batch_size, q_seq_len, head_num, head_dim).flatten(2, 3).contiguous()
183
+
184
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2
185
+ class Qwen2RMSNorm(nn.Module):
186
+ def __init__(self, hidden_size, eps=1e-6):
187
+ """
188
+ Qwen2RMSNorm is equivalent to T5LayerNorm
189
+ """
190
+ super().__init__()
191
+ self.weight = nn.Parameter(torch.ones(hidden_size))
192
+ self.variance_epsilon = eps
193
+
194
+ def forward(self, hidden_states):
195
+ input_dtype = hidden_states.dtype
196
+ hidden_states = hidden_states.to(torch.float32)
197
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
198
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
199
+ return self.weight * hidden_states.to(input_dtype)
200
+
201
+ def extra_repr(self):
202
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
203
+
204
+
205
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2
206
+ class Qwen2RotaryEmbedding(nn.Module):
207
+ def __init__(
208
+ self,
209
+ dim=None,
210
+ max_position_embeddings=2048,
211
+ base=10000,
212
+ device=None,
213
+ scaling_factor=1.0,
214
+ rope_type="default",
215
+ config=None,
216
+ ):
217
+ super().__init__()
218
+ # TODO (joao): remove the `if` below, only used for BC
219
+ self.rope_kwargs = {}
220
+ if config is None:
221
+ self.rope_kwargs = {
222
+ "rope_type": rope_type,
223
+ "factor": scaling_factor,
224
+ "dim": dim,
225
+ "base": base,
226
+ "max_position_embeddings": max_position_embeddings,
227
+ }
228
+ self.rope_type = rope_type
229
+ self.max_seq_len_cached = max_position_embeddings
230
+ self.original_max_seq_len = max_position_embeddings
231
+ else:
232
+ # BC: "rope_type" was originally "type"
233
+ if config.rope_scaling is not None:
234
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
235
+ else:
236
+ self.rope_type = "default"
237
+ self.max_seq_len_cached = config.max_position_embeddings
238
+ self.original_max_seq_len = config.max_position_embeddings
239
+
240
+ self.config = config
241
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
242
+
243
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
244
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
245
+ self.original_inv_freq = self.inv_freq
246
+
247
+ def _dynamic_frequency_update(self, position_ids, device):
248
+ """
249
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
250
+ 1 - growing beyond the cached sequence length (allow scaling)
251
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
252
+ """
253
+ seq_len = torch.max(position_ids) + 1
254
+ if seq_len > self.max_seq_len_cached: # growth
255
+ inv_freq, self.attention_scaling = self.rope_init_fn(
256
+ self.config, device, seq_len=seq_len, **self.rope_kwargs
257
+ )
258
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
259
+ self.max_seq_len_cached = seq_len
260
+
261
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
262
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
263
+ self.max_seq_len_cached = self.original_max_seq_len
264
+
265
+ @torch.no_grad()
266
+ def forward(self, x, position_ids):
267
+ if "dynamic" in self.rope_type:
268
+ self._dynamic_frequency_update(position_ids, device=x.device)
269
+
270
+ # Core RoPE block
271
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
272
+ position_ids_expanded = position_ids[:, None, :].float()
273
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
274
+ device_type = x.device.type
275
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
276
+ with torch.autocast(device_type=device_type, enabled=False):
277
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
278
+ emb = torch.cat((freqs, freqs), dim=-1)
279
+ cos = emb.cos()
280
+ sin = emb.sin()
281
+
282
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
283
+ cos = cos * self.attention_scaling
284
+ sin = sin * self.attention_scaling
285
+
286
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
287
+
288
+
289
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
290
+ def rotate_half(x):
291
+ """Rotates half the hidden dims of the input."""
292
+ x1 = x[..., : x.shape[-1] // 2]
293
+ x2 = x[..., x.shape[-1] // 2 :]
294
+ return torch.cat((-x2, x1), dim=-1)
295
+
296
+
297
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
298
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
299
+ """Applies Rotary Position Embedding to the query and key tensors.
300
+
301
+ Args:
302
+ q (`torch.Tensor`): The query tensor.
303
+ k (`torch.Tensor`): The key tensor.
304
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
305
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
306
+ position_ids (`torch.Tensor`, *optional*):
307
+ Deprecated and unused.
308
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
309
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
310
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
311
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
312
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
313
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
314
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
315
+ Returns:
316
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
317
+ """
318
+ cos = cos.unsqueeze(unsqueeze_dim)
319
+ sin = sin.unsqueeze(unsqueeze_dim)
320
+ q_embed = (q * cos) + (rotate_half(q) * sin)
321
+ k_embed = (k * cos) + (rotate_half(k) * sin)
322
+ return q_embed, k_embed
323
+
324
+
325
+ # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
326
+ class Qwen2MLP(nn.Module):
327
+ def __init__(self, config):
328
+ super().__init__()
329
+ self.hidden_size = config.hidden_size
330
+ self.intermediate_size = config.intermediate_size
331
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
332
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
333
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
334
+ self.act_fn = ACT2FN[config.hidden_act]
335
+
336
+ def forward(self, hidden_state):
337
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
338
+
339
+
340
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
341
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
342
+ """
343
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
344
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
345
+ """
346
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
347
+ if n_rep == 1:
348
+ return hidden_states
349
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
350
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
351
+
352
+
353
+ class Qwen2Attention(nn.Module):
354
+ """
355
+ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
356
+ and "Generating Long Sequences with Sparse Transformers".
357
+ """
358
+
359
+ def __init__(self, config, layer_idx: Optional[int] = None):
360
+ super().__init__()
361
+ self.config = config
362
+ self.layer_idx = layer_idx
363
+ # if layer_idx is None:
364
+ # logger.warning_once(
365
+ # f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
366
+ # "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
367
+ # "when creating this class."
368
+ # )
369
+
370
+ self.hidden_size = config.hidden_size
371
+ self.num_heads = config.num_attention_heads
372
+ self.head_dim = self.hidden_size // self.num_heads
373
+ self.num_key_value_heads = config.num_key_value_heads
374
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
375
+ self.max_position_embeddings = config.max_position_embeddings
376
+ self.rope_theta = config.rope_theta
377
+ self.is_causal = True
378
+ self.attention_dropout = config.attention_dropout
379
+
380
+ if (self.head_dim * self.num_heads) != self.hidden_size:
381
+ raise ValueError(
382
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
383
+ f" and `num_heads`: {self.num_heads})."
384
+ )
385
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
386
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
387
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
388
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
389
+
390
+ self.rotary_emb = Qwen2RotaryEmbedding(config=self.config)
391
+
392
+ def forward(
393
+ self,
394
+ hidden_states: torch.Tensor,
395
+ attention_mask: Optional[torch.Tensor] = None,
396
+ position_ids: Optional[torch.LongTensor] = None,
397
+ past_key_value: Optional[Cache] = None,
398
+ output_attentions: bool = False,
399
+ use_cache: bool = False,
400
+ cache_position: Optional[torch.LongTensor] = None,
401
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
402
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
403
+ bsz, q_len, _ = hidden_states.size()
404
+
405
+ query_states = self.q_proj(hidden_states)
406
+ key_states = self.k_proj(hidden_states)
407
+ value_states = self.v_proj(hidden_states)
408
+
409
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
410
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
411
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
412
+
413
+ if position_embeddings is None:
414
+ # logger.warning_once(
415
+ # "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
416
+ # "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
417
+ # "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
418
+ # "removed and `position_embeddings` will be mandatory."
419
+ # )
420
+ cos, sin = self.rotary_emb(value_states, position_ids)
421
+ else:
422
+ cos, sin = position_embeddings
423
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
424
+
425
+ if past_key_value is not None:
426
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
427
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
428
+
429
+ # repeat k/v heads if n_kv_heads < n_heads
430
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
431
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
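+ # With grouped-query attention each key/value head is shared by `num_key_value_groups` query heads;
+ # `repeat_kv` expands K/V so the plain matmul below can treat them as regular multi-head attention.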
432
+
433
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
434
+ if attention_mask is not None: # no matter the length, we just slice it
435
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
436
+ attn_weights = attn_weights + causal_mask
437
+
438
+ # upcast attention to fp32
439
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
440
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
441
+ attn_output = torch.matmul(attn_weights, value_states)
442
+
443
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
444
+ raise ValueError(
445
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
446
+ f" {attn_output.size()}"
447
+ )
448
+
449
+ attn_output = attn_output.transpose(1, 2).contiguous()
450
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
451
+
452
+ attn_output = self.o_proj(attn_output)
453
+
454
+ if not output_attentions:
455
+ attn_weights = None
456
+
457
+ return attn_output, attn_weights, past_key_value
458
+
459
+
460
+ class Qwen2FlashAttention2(Qwen2Attention):
461
+ """
462
+ Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
463
+ as the weights of the module stay untouched. The only required change would be on the forward pass
464
+ where it needs to correctly call the public API of flash attention and deal with padding tokens
465
+ in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
466
+ config.max_window_layers layers.
467
+ """
468
+
469
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
470
+ def __init__(self, *args, **kwargs):
471
+ super().__init__(*args, **kwargs)
472
+
473
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
474
+ # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
475
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
476
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
477
+
478
+ def forward(
479
+ self,
480
+ hidden_states: torch.Tensor,
481
+ attention_mask: Optional[torch.Tensor] = None,
482
+ position_ids: Optional[torch.LongTensor] = None,
483
+ past_key_value: Optional[Cache] = None,
484
+ output_attentions: bool = False,
485
+ use_cache: bool = False,
486
+ cache_position: Optional[torch.LongTensor] = None,
487
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
488
+ ):
489
+ bsz, q_len, _ = hidden_states.size()
490
+
491
+ query_states = self.q_proj(hidden_states)
492
+ key_states = self.k_proj(hidden_states)
493
+ value_states = self.v_proj(hidden_states)
494
+
495
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
496
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
497
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
498
+
499
+ if position_embeddings is None:
500
+ # logger.warning_once(
501
+ # "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
502
+ # "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
503
+ # "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
504
+ # "removed and `position_embeddings` will be mandatory."
505
+ # )
506
+ cos, sin = self.rotary_emb(value_states, position_ids)
507
+ else:
508
+ cos, sin = position_embeddings
509
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
510
+
511
+ if past_key_value is not None:
512
+ # Only activate the slicing cache if the config has a `sliding_window` attribute
513
+ cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
514
+ kv_seq_len = key_states.shape[-2] + cache_position[0]
515
+ if (
516
+ getattr(self.config, "sliding_window", None) is not None
517
+ and kv_seq_len > self.config.sliding_window
518
+ and cache_has_contents
519
+ ):
520
+ slicing_tokens = 1 - self.config.sliding_window
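+ # negative slice start: keep only the most recent (sliding_window - 1) cached entries so that,
+ # together with the newly appended key/value, the cache never exceeds the sliding window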
521
+
522
+ past_key = past_key_value[self.layer_idx][0]
523
+ past_value = past_key_value[self.layer_idx][1]
524
+
525
+ past_key = past_key[:, :, slicing_tokens:, :].contiguous()
526
+ past_value = past_value[:, :, slicing_tokens:, :].contiguous()
527
+
528
+ if past_key.shape[-2] != self.config.sliding_window - 1:
529
+ raise ValueError(
530
+ f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
531
+ f" {past_key.shape}"
532
+ )
533
+
534
+ if attention_mask is not None:
535
+ attention_mask = attention_mask[:, slicing_tokens:]
536
+ attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
537
+
538
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
539
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
540
+
541
+ # repeat k/v heads if n_kv_heads < n_heads
542
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
543
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
544
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
545
+
546
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
547
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
548
+ # cast them back to float16 just to be sure everything works as expected.
549
+ input_dtype = query_states.dtype
550
+ if input_dtype == torch.float32:
551
+ if torch.is_autocast_enabled():
552
+ target_dtype = torch.get_autocast_gpu_dtype()
553
+ # Handle the case where the model is quantized
554
+ elif hasattr(self.config, "_pre_quantization_dtype"):
555
+ target_dtype = self.config._pre_quantization_dtype
556
+ else:
557
+ target_dtype = self.q_proj.weight.dtype
558
+
559
+ # logger.warning_once(
560
+ # f"The input hidden states seems to be silently casted in float32, this might be related to"
561
+ # f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
562
+ # f" {target_dtype}."
563
+ # )
564
+
565
+ query_states = query_states.to(target_dtype)
566
+ key_states = key_states.to(target_dtype)
567
+ value_states = value_states.to(target_dtype)
568
+
569
+ # Reshape to the expected shape for Flash Attention
570
+ query_states = query_states.transpose(1, 2)
571
+ key_states = key_states.transpose(1, 2)
572
+ value_states = value_states.transpose(1, 2)
573
+
574
+ if (
575
+ self.config.use_sliding_window
576
+ and getattr(self.config, "sliding_window", None) is not None
577
+ and self.layer_idx >= self.config.max_window_layers
578
+ ):
579
+ sliding_window = self.config.sliding_window
580
+ else:
581
+ sliding_window = None
582
+
583
+ attn_output = _flash_attention_forward(
584
+ query_states,
585
+ key_states,
586
+ value_states,
587
+ attention_mask,
588
+ q_len,
589
+ position_ids=position_ids,
590
+ dropout=dropout_rate,
591
+ sliding_window=sliding_window,
592
+ is_causal=self.is_causal,
593
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
594
+ )
595
+
596
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
597
+ attn_output = self.o_proj(attn_output)
598
+
599
+ if not output_attentions:
600
+ attn_weights = None
601
+
602
+ return attn_output, attn_weights, past_key_value
603
+
604
+
605
+ class Qwen2HybridFlashAttention2(Qwen2FlashAttention2):
606
+ """
607
+ Hybrid variant of `Qwen2FlashAttention2` used by the slow-fast decoder layers. On top of the
+ standard causal self-attention, it adds a gated cross-attention branch in which, depending on
+ `cross_attn_implementation`, either all tokens or only the text tokens attend to the slow visual tokens.
612
+ """
613
+
614
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
615
+ def __init__(self,
616
+ is_hyper_enabled,
617
+ gating_type,
618
+ cross_attn_implementation,
619
+ *args, **kwargs):
620
+ super().__init__(*args, **kwargs)
621
+
622
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
623
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
624
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
625
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
626
+
627
+ self.is_hyper_enabled = is_hyper_enabled
628
+ if self.is_hyper_enabled:
629
+ self.gating_type = gating_type
630
+ self.cross_attention_implementation = cross_attn_implementation
631
+ self.cross_attn_kv_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim * 2, bias=True)
632
+
633
+ if gating_type.startswith("whole-dynamic"):
634
+ if "tanh" in gating_type:
635
+ self.cross_attn_gate_proj = nn.Sequential(
636
+ nn.Linear(self.hidden_size, 1),
637
+ nn.Tanh()
638
+ )
639
+ else:
640
+ self.cross_attn_gate_proj = nn.Sequential(
641
+ nn.Linear(self.hidden_size, 1),
642
+ )
643
+
644
+ if gating_type.endswith("warmup"):
645
+ self.cross_attn_warm_up_gate = torch.nn.Parameter(torch.zeros(1))
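+ # Gating design: the "whole-dynamic" gate predicts a single scalar per token from its hidden state
+ # (optionally squashed by tanh); the "warmup" variant additionally multiplies by a learnable scalar
+ # initialized at zero, so the cross-attention contribution starts as a no-op and is learned gradually.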
646
+
647
+ if "flashattn" in self.cross_attention_implementation:
648
+ self.cross_attn_core_attention = FlashAttnCrossAttention(layer_number=-1, attention_dropout=self.attention_dropout)
649
+ else:
650
+ self.cross_attn_core_attention = ScaleDotProductCrossAttention(layer_number=-1, attention_dropout=self.attention_dropout)
651
+
652
+
653
+ def all2media_cross_attn(self,
654
+ text_state,
655
+ text_query,
656
+ vision_features,
657
+ text2vision_cross_attn_mask=None,
658
+ all_text_mask=None):
659
+ '''
660
+ text_query: [s b h d]
661
+ text_state: [s b d]
662
+ vision_features: [num_vis, b, d]
663
+ '''
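+ # Every token cross-attends to the (slow) visual features: project the visual features to K/V with
+ # `cross_attn_kv_proj`, repeat them for grouped-query attention, attend with the text queries, zero the
+ # result for pure-text samples, scale by the dynamic gate, and add it back residually.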
664
+
665
+ if vision_features is None or (self.is_hyper_enabled == False):
666
+ return text_state
667
+
668
+ L_c, B_c = text_state.shape[:2]
669
+ D_head = self.head_dim
670
+
671
+ if "whole-dynamic" in self.gating_type:
672
+ gate_value = self.cross_attn_gate_proj(text_state) # n, bs, head_D
673
+ if "warmup" in self.gating_type:
674
+ gate_value = gate_value * self.cross_attn_warm_up_gate
675
+
676
+ vision_features = vision_features.contiguous()
677
+ vision_features = self.cross_attn_kv_proj(vision_features)
678
+ text_query = rearrange(text_query, 'L B H D -> B H L D') # e.g. [25, 2, 32, 128]
679
+
680
+ vision_kv = rearrange(vision_features, 'BL Lv (H KV D) -> KV BL H Lv D', KV=2, H=self.num_key_value_heads)
681
+ vision_key = vision_kv[0].contiguous() # [b h s d]
682
+ vision_value = vision_kv[1].contiguous()
683
+
684
+ vision_key = repeat_kv(vision_key, self.num_key_value_groups)
685
+ vision_value = repeat_kv(vision_value, self.num_key_value_groups)
686
+
687
+ # expand the cross-attention mask to every query position
688
+ attention_mask = text2vision_cross_attn_mask[:, None, :].repeat(1, text_state.shape[0], 1)
689
+ vision_context = self.cross_attn_core_attention(text_query, vision_key, vision_value, attn_mask=attention_mask).transpose(0, 1)
690
+
691
+ # mask out the output if a sample is pure text
692
+ vision_context = all_text_mask[None, :, None] * vision_context
693
+
694
+ # Apply dynamic gate
695
+ text_state = text_state + vision_context * gate_value
696
+
697
+ return text_state
698
+
699
+ def onlytext2media_cross_attn(self,
700
+ text_state,
701
+ text_query,
702
+ vision_features,
703
+ token_type,
704
+ text2vision_cross_attn_mask=None,
705
+ all_text_mask=None):
706
+ '''
707
+ text_query: [bs n h d]
708
+ text_state: [bs n d]
709
+ vision_features: [bs, vis_n, d]
710
+ token_type: [bs, n]
711
+ '''
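+ # Only text tokens cross-attend to the slow visual tokens. Based on the masks built below, positions
+ # with token_type <= 2 are treated as text and token_type == 3 marks visual tokens; the text tokens are
+ # gathered, padded into a dense batch, attended against the visual K/V, and the gated result is
+ # scattered back into the original sequence.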
712
+ # if vision_features is None or (self.is_hyper_enabled == False) or (all_text_mask.sum() == 0):
713
+ if vision_features is None or (self.is_hyper_enabled == False):
714
+ return text_state
715
+
716
+ # select all the pure text token
717
+ pure_text_query = []
718
+ text_mask = ((token_type - 2) <= 0).bool()
719
+
720
+ if "masksystem" in self.cross_attention_implementation:
721
+ new_text_masks = []
722
+ for idx, text_query_ in enumerate(text_query):
723
+ # mask out all the tokens before the media
724
+ first_im_token = (token_type[idx] == 3).nonzero()
725
+ if len(first_im_token) == 0:
726
+ start = 0
727
+ else:
728
+ start = first_im_token[0]
729
+ text_mask_ = text_mask[idx].clone()
730
+ text_mask_[:start] = False
731
+ pure_text_query.append(text_query_[text_mask_])
732
+ new_text_masks.append(text_mask_)
733
+ text_mask = torch.stack(new_text_masks, dim=0)
734
+ else:
735
+ for idx, text_query_ in enumerate(text_query):
736
+ pure_text_query.append(text_query_[text_mask[idx]])
737
+
738
+ # 2. pad all the text tokens
739
+ text_query = torch.nn.utils.rnn.pad_sequence(pure_text_query, batch_first=True)
740
+ padding_attn_mask = torch.ones(text_query.shape[:-2], dtype=torch.bool, device=text_state.device)
741
+ for i, tensor in enumerate(pure_text_query):
742
+ padding_attn_mask[i, len(tensor):] = False # Mark padded elements as False
743
+
744
+ B_c, L_c = text_query.shape[:2]
745
+ D_head = self.head_dim
746
+
747
+ # obtain dynamic gate value
748
+ gate_value = self.cross_attn_gate_proj(text_state[text_mask]) # n, D
749
+ if "warmup" in self.gating_type:
750
+ gate_value = gate_value * self.cross_attn_warm_up_gate.tanh()
751
+
752
+ vision_features = vision_features.contiguous()
753
+ vision_features = self.cross_attn_kv_proj(vision_features)
754
+ text_query = text_query.transpose(1, 2)
755
+
756
+ vision_kv = rearrange(vision_features, 'BL Lv (H KV D) -> KV BL H Lv D', KV=2, H=self.num_key_value_heads)
757
+ vision_key = vision_kv[0].contiguous() # [b h s d]
758
+ vision_value = vision_kv[1].contiguous()
759
+
760
+ vision_key = repeat_kv(vision_key, self.num_key_value_groups)
761
+ vision_value = repeat_kv(vision_value, self.num_key_value_groups)
762
+
763
+ # expand the cross-attention mask to every query position
764
+ attention_mask = text2vision_cross_attn_mask[:, None, :].repeat(1, text_query.shape[2], 1)
765
+ vision_context = self.cross_attn_core_attention(text_query, vision_key, vision_value, attn_mask=attention_mask)
766
+
767
+ # mask out the output if a sample is pure text
768
+ vision_context = all_text_mask[:, None, None] * vision_context
769
+
770
+ # Apply dynamic gate
771
+ extended_attn_output = torch.zeros_like(text_state, dtype=text_state.dtype, device=text_state.device)
772
+ extended_attn_output[text_mask] = extended_attn_output[text_mask] + vision_context[padding_attn_mask] * gate_value
773
+ text_state = text_state + extended_attn_output
774
+ # NOTE (Min): equivalent to the commented line below; written this way to avoid errors under DeepSpeed ZeRO-3
775
+ # text_state[text_mask] = text_state[text_mask] + vision_context[padding_attn_mask] * gate_value
776
+
777
+ return text_state
778
+
779
+
780
+ def forward(
781
+ self,
782
+ hidden_states: torch.Tensor,
783
+ visual_hidden_states: torch.Tensor,
784
+ token_type: torch.Tensor,
785
+ attention_mask: Optional[torch.Tensor] = None,
786
+ text2visual_attention_mask: Optional[torch.Tensor] = None,
787
+ position_ids: Optional[torch.LongTensor] = None,
788
+ past_key_value: Optional[Cache] = None,
789
+ output_attentions: bool = False,
790
+ use_cache: bool = False,
791
+ cache_position: Optional[torch.LongTensor] = None,
792
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
793
+ ):
794
+ bsz, q_len, _ = hidden_states.size()
795
+
796
+ query_states = self.q_proj(hidden_states)
797
+ key_states = self.k_proj(hidden_states)
798
+ value_states = self.v_proj(hidden_states)
799
+
800
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
801
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
802
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
803
+
804
+ if position_embeddings is None:
805
+ # logger.warning_once(
806
+ # "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
807
+ # "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
808
+ # "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
809
+ # "removed and `position_embeddings` will be mandatory."
810
+ # )
811
+ cos, sin = self.rotary_emb(value_states, position_ids)
812
+ else:
813
+ cos, sin = position_embeddings
814
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
815
+
816
+ if past_key_value is not None:
817
+ # Only activate the slicing cache if the config has a `sliding_window` attribute
818
+ cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
819
+ kv_seq_len = key_states.shape[-2] + cache_position[0]
820
+ if (
821
+ getattr(self.config, "sliding_window", None) is not None
822
+ and kv_seq_len > self.config.sliding_window
823
+ and cache_has_contents
824
+ ):
825
+ slicing_tokens = 1 - self.config.sliding_window
826
+
827
+ past_key = past_key_value[self.layer_idx][0]
828
+ past_value = past_key_value[self.layer_idx][1]
829
+
830
+ past_key = past_key[:, :, slicing_tokens:, :].contiguous()
831
+ past_value = past_value[:, :, slicing_tokens:, :].contiguous()
832
+
833
+ if past_key.shape[-2] != self.config.sliding_window - 1:
834
+ raise ValueError(
835
+ f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
836
+ f" {past_key.shape}"
837
+ )
838
+
839
+ if attention_mask is not None:
840
+ attention_mask = attention_mask[:, slicing_tokens:]
841
+ attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
842
+
843
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
844
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
845
+
846
+ # repeat k/v heads if n_kv_heads < n_heads
847
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
848
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
849
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
850
+
851
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
852
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
853
+ # cast them back to float16 just to be sure everything works as expected.
854
+ input_dtype = query_states.dtype
855
+ if input_dtype == torch.float32:
856
+ if torch.is_autocast_enabled():
857
+ target_dtype = torch.get_autocast_gpu_dtype()
858
+ # Handle the case where the model is quantized
859
+ elif hasattr(self.config, "_pre_quantization_dtype"):
860
+ target_dtype = self.config._pre_quantization_dtype
861
+ else:
862
+ target_dtype = self.q_proj.weight.dtype
863
+
864
+ # logger.warning_once(
865
+ # f"The input hidden states seems to be silently casted in float32, this might be related to"
866
+ # f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
867
+ # f" {target_dtype}."
868
+ # )
869
+
870
+ query_states = query_states.to(target_dtype)
871
+ key_states = key_states.to(target_dtype)
872
+ value_states = value_states.to(target_dtype)
873
+
874
+ # Reshape to the expected shape for Flash Attention
875
+ query_states = query_states.transpose(1, 2)
876
+ key_states = key_states.transpose(1, 2)
877
+ value_states = value_states.transpose(1, 2)
878
+
879
+ if (
880
+ self.config.use_sliding_window
881
+ and getattr(self.config, "sliding_window", None) is not None
882
+ and self.layer_idx >= self.config.max_window_layers
883
+ ):
884
+ sliding_window = self.config.sliding_window
885
+ else:
886
+ sliding_window = None
887
+
888
+ attn_output = _flash_attention_forward(
889
+ query_states, # bs, n, head, head_dim
890
+ key_states,
891
+ value_states,
892
+ attention_mask,
893
+ q_len,
894
+ position_ids=position_ids,
895
+ dropout=dropout_rate,
896
+ sliding_window=sliding_window,
897
+ is_causal=self.is_causal,
898
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
899
+ )
900
+
901
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
902
+
903
+ # text-to-image cross-attention
904
+ ####
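+ # After the causal self-attention over the fast token stream, the hidden states are enriched with
+ # gated cross-attention into the slow visual tokens passed in as `visual_hidden_states`; the
+ # `cross_attention_implementation` string selects which tokens are allowed to attend to them.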
905
+ all_text_mask = (token_type == 3).sum(dim=-1).bool() # [bs,]; False indicates that the sample contains no image input
906
+ if self.cross_attention_implementation.startswith("vanilla"): # all tokens can attend to the slow tokens
907
+ attn_output = self.all2media_cross_attn(attn_output.permute(1, 0, 2),
908
+ query_states.permute(1, 0, 2, 3),
909
+ visual_hidden_states,
910
+ text2visual_attention_mask,
911
+ all_text_mask)
912
+ attn_output = attn_output.permute(1,0,2)
913
+
914
+ elif self.cross_attention_implementation.startswith("text-only-vanilla"): # only text tokens are allowed to attend the slow tokens
915
+ attn_output = self.onlytext2media_cross_attn(attn_output,
916
+ query_states,
917
+ visual_hidden_states,
918
+ token_type=token_type,
919
+ text2vision_cross_attn_mask=text2visual_attention_mask,
920
+ all_text_mask=all_text_mask
921
+ )
922
+ else:
923
+ raise NotImplementedError(f"cross-attention type {self.cross_attention_implementation} not implemented")
924
+ ####
925
+
926
+ attn_output = self.o_proj(attn_output)
927
+
928
+ if not output_attentions:
929
+ attn_weights = None
930
+
931
+ return attn_output, attn_weights, past_key_value
932
+
933
+
934
+ class Qwen2SdpaAttention(Qwen2Attention):
935
+ """
936
+ Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
937
+ `Qwen2Attention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
938
+ SDPA API.
939
+ """
940
+ # Adapted from Qwen2Attention.forward
941
+ def forward(
942
+ self,
943
+ hidden_states: torch.Tensor,
944
+ attention_mask: Optional[torch.Tensor] = None,
945
+ position_ids: Optional[torch.LongTensor] = None,
946
+ past_key_value: Optional[Cache] = None,
947
+ output_attentions: bool = False,
948
+ use_cache: bool = False,
949
+ cache_position: Optional[torch.LongTensor] = None,
950
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
951
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
952
+ if output_attentions:
953
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
954
+ # logger.warning_once(
955
+ # "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
956
+ # 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
957
+ # )
958
+ return super().forward(
959
+ hidden_states=hidden_states,
960
+ attention_mask=attention_mask,
961
+ position_ids=position_ids,
962
+ past_key_value=past_key_value,
963
+ output_attentions=output_attentions,
964
+ use_cache=use_cache,
965
+ )
966
+
967
+ bsz, q_len, _ = hidden_states.size()
968
+
969
+ query_states = self.q_proj(hidden_states)
970
+ key_states = self.k_proj(hidden_states)
971
+ value_states = self.v_proj(hidden_states)
972
+
973
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
974
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
975
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
976
+
977
+ if position_embeddings is None:
978
+ # logger.warning_once(
979
+ # "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
980
+ # "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
981
+ # "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
982
+ # "removed and `position_embeddings` will be mandatory."
983
+ # )
984
+ cos, sin = self.rotary_emb(value_states, position_ids)
985
+ else:
986
+ cos, sin = position_embeddings
987
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
988
+
989
+ if past_key_value is not None:
990
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
991
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
992
+
993
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
994
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
995
+
996
+ causal_mask = attention_mask
997
+ if attention_mask is not None: # no matter the length, we just slice it
998
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
999
+
1000
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
1001
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
1002
+ if query_states.device.type == "cuda" and attention_mask is not None:
1003
+ query_states = query_states.contiguous()
1004
+ key_states = key_states.contiguous()
1005
+ value_states = value_states.contiguous()
1006
+
1007
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
1008
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
1009
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
1010
+ is_causal = True if causal_mask is None and q_len > 1 else False
1011
+
1012
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
1013
+ query_states,
1014
+ key_states,
1015
+ value_states,
1016
+ attn_mask=causal_mask,
1017
+ dropout_p=self.attention_dropout if self.training else 0.0,
1018
+ is_causal=is_causal,
1019
+ )
1020
+
1021
+ attn_output = attn_output.transpose(1, 2).contiguous()
1022
+ attn_output = attn_output.view(bsz, q_len, self.hidden_size)
1023
+
1024
+ attn_output = self.o_proj(attn_output)
1025
+
1026
+ return attn_output, None, past_key_value
1027
+
1028
+ # TODO (Min): the SDPA hybrid attention below is not fully implemented yet
1029
+ class Qwen2HybridSdpaAttention(Qwen2SdpaAttention):
1030
+ """
1031
+ SDPA counterpart of `Qwen2HybridFlashAttention2`, intended to add the same gated text-to-vision
+ cross-attention on top of `Qwen2SdpaAttention`. This path is not fully implemented yet; the
+ flash-attention variant is the supported one.
1034
+ """
1035
+ def __init__(self,
1036
+ is_hyper_enabled,
1037
+ gating_type,
1038
+ cross_attn_implementation,
1039
+ *args, **kwargs):
1040
+ super().__init__(*args, **kwargs)
1041
+
1042
+ self.is_hyper_enabled = is_hyper_enabled
1043
+
1044
+ if self.is_hyper_enabled:
1045
+ self.gating_type = gating_type
1046
+ self.cross_attention_implementation = cross_attn_implementation
1047
+ self.cross_attn_kv_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim * 2, bias=True)
1048
+
1049
+ if gating_type.startswith("whole-dynamic"):
1050
+ if "tanh" in gating_type:
1051
+ self.cross_attn_gate_proj = nn.Sequential(
1052
+ nn.Linear(self.hidden_size, 1),
1053
+ nn.Tanh()
1054
+ )
1055
+ else:
1056
+ self.cross_attn_gate_proj = nn.Sequential(
1057
+ nn.Linear(self.hidden_size, 1),
1058
+ )
1059
+
1060
+ if gating_type.endswith("warmup"):
1061
+ self.cross_attn_warm_up_gate = torch.nn.Parameter(torch.zeros(1))
1062
+
1063
+ if "flashattn" in self.cross_attention_implementation:
1064
+ self.cross_attn_core_attention = FlashAttnCrossAttention(layer_number=-1, attention_dropout=self.attention_dropout)
1065
+ else:
1066
+ self.cross_attn_core_attention = ScaleDotProductCrossAttention(layer_number=-1, attention_dropout=self.attention_dropout)
1067
+
1068
+ def text2media_cross_attn(self,
1069
+ text_state,
1070
+ text_query,
1071
+ vision_features,
1072
+ text2vision_cross_attn_mask=None,
1073
+ all_text_mask=None):
1074
+ '''
1075
+ text_query: [s b h d]
1076
+ text_state: [s b d]
1077
+ vision_features: [num_vis, b, d]
1078
+ '''
1079
+ if vision_features is None or (self.is_hyper_enabled == False):
1080
+ return text_state
1081
+
1082
+ # obtain the dynamic gate value from the text hidden states
+ L_c, B_c = text_state.shape[:2]
+
+ # NOTE: this SDPA hybrid path is flagged as not fully implemented; the lines below only reference
+ # attributes that are actually defined in `__init__` and mirror the flash-attention variant
+ # (`all2media_cross_attn`) of the same computation.
+ gate_value = self.cross_attn_gate_proj(text_state) # [L, B, 1]
+ if "warmup" in self.gating_type:
+ gate_value = gate_value * self.cross_attn_warm_up_gate
+
+ vision_features = vision_features.contiguous()
+ vision_features = self.cross_attn_kv_proj(vision_features)
+ query_layer = rearrange(text_query, 'L B H D -> B H L D') # e.g. [25, 2, 32, 128]
+
+ vision_kv = rearrange(vision_features, 'BL Lv (H KV D) -> KV BL H Lv D', KV=2, H=self.num_key_value_heads)
+ vision_key = vision_kv[0].contiguous() # [b h s d]
+ vision_value = vision_kv[1].contiguous()
+
+ vision_key = repeat_kv(vision_key, self.num_key_value_groups)
+ vision_value = repeat_kv(vision_value, self.num_key_value_groups)
+
+ # expand the cross-attention mask (if provided) to every query position
+ attn_mask = None
+ if text2vision_cross_attn_mask is not None:
+ attn_mask = text2vision_cross_attn_mask[:, None, :].repeat(1, text_state.shape[0], 1)
+ vision_context = self.cross_attn_core_attention(query_layer, vision_key, vision_value, attn_mask=attn_mask).transpose(0, 1)
+
+ # mask out the output if a sample is pure text
+ if all_text_mask is not None:
+ vision_context = all_text_mask[None, :, None] * vision_context
+
+ # Apply dynamic gate: interpolate between the text states and the attended visual context
+ text_state = text_state * (1 - gate_value) + vision_context * gate_value
1109
+
1110
+ return text_state
1111
+ # Adapted from Qwen2Attention.forward
1112
+ def forward(
1113
+ self,
1114
+ hidden_states: torch.Tensor,
1115
+ visual_hidden_states: torch.Tensor,
1116
+ token_type: torch.Tensor,
1117
+ attention_mask: Optional[torch.Tensor] = None,
1118
+ text2visual_attention_mask: Optional[torch.Tensor] = None,
1119
+ position_ids: Optional[torch.LongTensor] = None,
1120
+ past_key_value: Optional[Cache] = None,
1121
+ output_attentions: bool = False,
1122
+ use_cache: bool = False,
1123
+ cache_position: Optional[torch.LongTensor] = None,
1124
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
1125
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1126
+ if output_attentions:
1127
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
1128
+ # logger.warning_once(
1129
+ # "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
1130
+ # 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
1131
+ # )
1132
+ return super().forward(
1133
+ hidden_states=hidden_states,
1134
+ attention_mask=attention_mask,
1135
+ position_ids=position_ids,
1136
+ past_key_value=past_key_value,
1137
+ output_attentions=output_attentions,
1138
+ use_cache=use_cache,
1139
+ )
1140
+
1141
+ bsz, q_len, _ = hidden_states.size()
1142
+
1143
+ query_states = self.q_proj(hidden_states)
1144
+ key_states = self.k_proj(hidden_states)
1145
+ value_states = self.v_proj(hidden_states)
1146
+
1147
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
1148
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
1149
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
1150
+
1151
+ if position_embeddings is None:
1152
+ # logger.warning_once(
1153
+ # "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
1154
+ # "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
1155
+ # "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
1156
+ # "removed and `position_embeddings` will be mandatory."
1157
+ # )
1158
+ cos, sin = self.rotary_emb(value_states, position_ids)
1159
+ else:
1160
+ cos, sin = position_embeddings
1161
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
1162
+
1163
+ if past_key_value is not None:
1164
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
1165
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1166
+
1167
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
1168
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
1169
+
1170
+ causal_mask = attention_mask
1171
+ if attention_mask is not None: # no matter the length, we just slice it
1172
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
1173
+
1174
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
1175
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
1176
+ if query_states.device.type == "cuda" and attention_mask is not None:
1177
+ query_states = query_states.contiguous()
1178
+ key_states = key_states.contiguous()
1179
+ value_states = value_states.contiguous()
1180
+
1181
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
1182
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
1183
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
1184
+ is_causal = True if causal_mask is None and q_len > 1 else False
1185
+
1186
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
1187
+ query_states,
1188
+ key_states,
1189
+ value_states,
1190
+ attn_mask=causal_mask,
1191
+ dropout_p=self.attention_dropout if self.training else 0.0,
1192
+ is_causal=is_causal,
1193
+ )
1194
+
1195
+ attn_output = attn_output.transpose(1, 2).contiguous()
1196
+ attn_output = attn_output.view(bsz, q_len, self.hidden_size)
1197
+
1198
+ # text-to-image cross-attention
1199
+ ####
1200
+ all_text_mask = (token_type == 3).sum(dim=-1).bool() # [bs,]; False indicates that the sample contains no image input
1201
+ if self.cross_attention_implementation.startswith("vanilla"):
1202
+ attn_output = self.text2media_cross_attn(attn_output.permute(1, 0, 2),
1203
+ query_states.permute(1, 0, 2, 3),
1204
+ visual_hidden_states,
1205
+ text2visual_attention_mask,
1206
+ all_text_mask)
1207
+ attn_output = attn_output.permute(1,0,2)
1208
+
1209
+ elif self.cross_attention_implementation.startswith("text-only-vanilla"):
1210
+ attn_output = self.onlytext2media_cross_attn(attn_output,
1211
+ query_states,
1212
+ visual_hidden_states,
1213
+ token_type=token_type,
1214
+ text2vision_cross_attn_mask=text2visual_attention_mask,
1215
+ all_text_mask=all_text_mask
1216
+ )
1217
+ else:
1218
+ raise NotImplementedError(f"cross-attention type {self.cross_attention_implementation} not implemented")
1219
+ ####
1220
+
1221
+ attn_output = self.o_proj(attn_output)
1222
+
1223
+ return attn_output, None, past_key_value
1224
+
1225
+
1226
+ QWEN2_ATTENTION_CLASSES = {
1227
+ "eager": Qwen2Attention,
1228
+ "flash_attention_2": Qwen2FlashAttention2,
1229
+ "sdpa": Qwen2SdpaAttention,
1230
+ }
1231
+
1232
+ QWEN2_HYBRID_ATTENTION_CLASSES = {
+ "flash_attention_2": Qwen2HybridFlashAttention2,
+ "sdpa": Qwen2HybridSdpaAttention, # not fully implemented yet; only flash attention is currently supported
1235
+ }
1236
+
1237
+
1238
+ class Qwen2DecoderLayer(nn.Module):
1239
+ def __init__(self, config, layer_idx: int):
1240
+ super().__init__()
1241
+ self.hidden_size = config.hidden_size
1242
+
1243
+ if config.sliding_window and config._attn_implementation != "flash_attention_2":
1244
+ # logger.warning_once(
1245
+ # f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
1246
+ # "unexpected results may be encountered."
1247
+ # )
1248
+ pass
1249
+ self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
1250
+
1251
+ self.mlp = Qwen2MLP(config)
1252
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1253
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1254
+
1255
+ def forward(
1256
+ self,
1257
+ hidden_states: torch.Tensor,
1258
+ attention_mask: Optional[torch.Tensor] = None,
1259
+ position_ids: Optional[torch.LongTensor] = None,
1260
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1261
+ output_attentions: Optional[bool] = False,
1262
+ use_cache: Optional[bool] = False,
1263
+ cache_position: Optional[torch.LongTensor] = None,
1264
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
1265
+ **kwargs,
1266
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1267
+ """
1268
+ Args:
1269
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
1270
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
1271
+ `(batch, sequence_length)` where padding elements are indicated by 0.
1272
+ output_attentions (`bool`, *optional*):
1273
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1274
+ returned tensors for more detail.
1275
+ use_cache (`bool`, *optional*):
1276
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
1277
+ (see `past_key_values`).
1278
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
1279
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
1280
+ Indices depicting the position of the input sequence tokens in the sequence.
1281
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
1282
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
1283
+ with `head_dim` being the embedding dimension of each attention head.
1284
+ kwargs (`dict`, *optional*):
1285
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
1286
+ into the model
1287
+ """
1288
+
1289
+ residual = hidden_states
1290
+
1291
+ hidden_states = self.input_layernorm(hidden_states)
1292
+
1293
+ # Self Attention
1294
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
1295
+ hidden_states=hidden_states,
1296
+ attention_mask=attention_mask,
1297
+ position_ids=position_ids,
1298
+ past_key_value=past_key_value,
1299
+ output_attentions=output_attentions,
1300
+ use_cache=use_cache,
1301
+ cache_position=cache_position,
1302
+ position_embeddings=position_embeddings,
1303
+ )
1304
+ hidden_states = residual + hidden_states
1305
+
1306
+ # Fully Connected
1307
+ residual = hidden_states
1308
+ hidden_states = self.post_attention_layernorm(hidden_states)
1309
+ hidden_states = self.mlp(hidden_states)
1310
+ hidden_states = residual + hidden_states
1311
+
1312
+ outputs = (hidden_states,)
1313
+
1314
+ if output_attentions:
1315
+ outputs += (self_attn_weights,)
1316
+
1317
+ if use_cache:
1318
+ outputs += (present_key_value,)
1319
+
1320
+ return outputs
1321
+
1322
+
1323
+ class Qwen2HybridDecoderLayer(nn.Module):
1324
+ def __init__(self,
1325
+ config,
1326
+ layer_idx: int,
1327
+ is_hyper_enabled=False,
1328
+ cross_attn_implementation="vanilla", # one of ['vanilla', 'text-only-vanilla']
1329
+ cross_attn_gating_type="channel-wise-dynamic-sigmoid"):
1330
+ super().__init__()
1331
+ self.is_hyper_enabled = is_hyper_enabled
1332
+
1333
+ self.hidden_size = config.hidden_size
1334
+
1335
+ if config.sliding_window and config._attn_implementation != "flash_attention_2":
1336
+ # logger.warning_once(
1337
+ # f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
1338
+ # "unexpected results may be encountered."
1339
+ # )
1340
+ pass
1341
+
1342
+ self.self_attn = QWEN2_HYBRID_ATTENTION_CLASSES[config._attn_implementation](config=config,
1343
+ layer_idx=layer_idx,
1344
+ is_hyper_enabled=is_hyper_enabled,
1345
+ cross_attn_implementation=cross_attn_implementation,
1346
+ gating_type=cross_attn_gating_type)
1347
+
1348
+
1349
+ self.mlp = Qwen2MLP(config)
1350
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1351
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1352
+
1353
+ self.gradient_checkpointing = False # gradient checkpointing is applied inside forward() to the attention and MLP blocks separately
1354
+
1355
+ # Conditioning mechanism adapted from the flamingo-mini implementation (https://github.com/dhansmair/flamingo-mini/)
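+ # The surrounding model is expected to call `condition_vis_x` on each hybrid layer to cache the slow
+ # visual tokens, the text-to-vision attention mask, and the token-type map before the decoder forward
+ # pass, and `clear_vis_x` afterwards so that no stale visual state leaks between calls.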
1356
+ def condition_vis_x(self,
1357
+ vis_x,
1358
+ cross_attn_mask=None,
1359
+ token_type=None):
1360
+
1361
+ self.vis_x = vis_x
1362
+ self.cross_attn_mask = cross_attn_mask
1363
+ self.media_locations = token_type
1364
+
1365
+ def clear_vis_x(self):
1366
+ self.vis_x = None
1367
+ self.cross_attn_mask = None
1368
+ self.media_locations = None
1369
+
1370
+ def mlp_forward(self, hidden_states):
1371
+ hidden_states = self.post_attention_layernorm(hidden_states)
1372
+ hidden_states = self.mlp(hidden_states)
1373
+ return hidden_states
1374
+
1375
+ def forward(
1376
+ self,
1377
+ hidden_states: torch.Tensor,
1378
+ attention_mask: Optional[torch.Tensor] = None,
1379
+ position_ids: Optional[torch.LongTensor] = None,
1380
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1381
+ output_attentions: Optional[bool] = False,
1382
+ use_cache: Optional[bool] = False,
1383
+ cache_position: Optional[torch.LongTensor] = None,
1384
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
1385
+ **kwargs,
1386
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1387
+ """
1388
+ Args:
1389
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
1390
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
1391
+ `(batch, sequence_length)` where padding elements are indicated by 0.
1392
+ output_attentions (`bool`, *optional*):
1393
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1394
+ returned tensors for more detail.
1395
+ use_cache (`bool`, *optional*):
1396
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
1397
+ (see `past_key_values`).
1398
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
1399
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
1400
+ Indices depicting the position of the input sequence tokens in the sequence.
1401
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
1402
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
1403
+ with `head_dim` being the embedding dimension of each attention head.
1404
+ kwargs (`dict`, *optional*):
1405
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
1406
+ into the model
1407
+ """
1408
+
1409
+ residual = hidden_states
1410
+
1411
+ hidden_states = self.input_layernorm(hidden_states)
1412
+
1413
+ # process image embedding
1414
+ visual_tokens = self.vis_x
1415
+ cross_attn_mask = self.cross_attn_mask
1416
+ token_type = self.media_locations
1417
+ visual_tokens = self.input_layernorm(visual_tokens)
1418
+
1419
+ # Self Attention
1420
+ if self.gradient_checkpointing and self.training:
1421
+ hidden_states, self_attn_weights, present_key_value = torch.utils.checkpoint.checkpoint(
1422
+ self.self_attn,
1423
+ hidden_states,
1424
+ visual_tokens,
1425
+ token_type,
1426
+ attention_mask,
1427
+ cross_attn_mask,
1428
+ position_ids,
1429
+ past_key_value,
1430
+ output_attentions,
1431
+ use_cache,
1432
+ cache_position,
1433
+ position_embeddings
1434
+ )
1435
+ else:
1436
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
1437
+ hidden_states=hidden_states,
1438
+ attention_mask=attention_mask,
1439
+ visual_hidden_states=visual_tokens,
1440
+ text2visual_attention_mask=cross_attn_mask,
1441
+ token_type=token_type,
1442
+ position_ids=position_ids,
1443
+ past_key_value=past_key_value,
1444
+ output_attentions=output_attentions,
1445
+ use_cache=use_cache,
1446
+ cache_position=cache_position,
1447
+ position_embeddings=position_embeddings,
1448
+ )
1449
+
1450
+ hidden_states = residual + hidden_states
1451
+
1452
+ # Fully Connected
1453
+ residual = hidden_states
1454
+ if self.gradient_checkpointing and self.training:
1455
+ hidden_states = torch.utils.checkpoint.checkpoint(
1456
+ self.mlp_forward,
1457
+ hidden_states)
1458
+ else:
1459
+ hidden_states = self.mlp_forward(hidden_states)
1460
+
1461
+ hidden_states = residual + hidden_states
1462
+
1463
+ outputs = (hidden_states,)
1464
+
1465
+ if output_attentions:
1466
+ outputs += (self_attn_weights,)
1467
+
1468
+ if use_cache:
1469
+ outputs += (present_key_value,)
1470
+
1471
+ return outputs
1472
+
1473
+
llava/model/language_model/llava_qwen2.py ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright 2024 Hao Zhang
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union, Dict
17
+ import torch
18
+ import torch.nn as nn
19
+ from torch.nn import CrossEntropyLoss
20
+
21
+ import transformers
22
+ from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM
23
+
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
28
+ from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM
29
+
30
+
31
+ class LlavaQwenConfig(Qwen2Config):
32
+ model_type = "llava_qwen"
33
+
34
+
35
+ class LlavaQwenModel(LlavaMetaModel, Qwen2Model):
36
+ config_class = LlavaQwenConfig
37
+
38
+ def __init__(self, config: Qwen2Config):
39
+ super(LlavaQwenModel, self).__init__(config)
40
+
41
+
42
+ class LlavaQwenForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
43
+ config_class = LlavaQwenConfig
44
+
45
+ def __init__(self, config):
46
+ Qwen2ForCausalLM.__init__(self, config)
47
+ config.model_type = "llava_qwen"
48
+ config.rope_scaling = None
49
+
50
+ self.model = LlavaQwenModel(config)
51
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
52
+ self.post_init()
53
+
54
+ def get_model(self):
55
+ return self.model
56
+
57
+ def forward(
58
+ self,
59
+ input_ids: torch.LongTensor = None,
60
+ attention_mask: Optional[torch.Tensor] = None,
61
+ position_ids: Optional[torch.LongTensor] = None,
62
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
63
+ inputs_embeds: Optional[torch.FloatTensor] = None,
64
+ labels: Optional[torch.LongTensor] = None,
65
+ use_cache: Optional[bool] = None,
66
+ output_attentions: Optional[bool] = None,
67
+ output_hidden_states: Optional[bool] = None,
68
+ images: Optional[torch.FloatTensor] = None,
69
+ image_sizes: Optional[List[List[int]]] = None,
70
+ return_dict: Optional[bool] = None,
71
+ modalities: Optional[List[str]] = ["image"],
72
+ dpo_forward: Optional[bool] = False,
73
+ cache_position=None,
74
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
75
+
76
+ if inputs_embeds is None:
77
+ (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes)
78
+
79
+ if dpo_forward:
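+ # For DPO-style training the model returns the raw logits together with the (multimodal-expanded)
+ # labels instead of a loss, presumably so the caller can compute the preference loss externally.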
80
+ outputs = self.model(
81
+ input_ids=input_ids,
82
+ attention_mask=attention_mask,
83
+ position_ids=position_ids,
84
+ past_key_values=past_key_values,
85
+ inputs_embeds=inputs_embeds,
86
+ use_cache=use_cache,
87
+ output_attentions=output_attentions,
88
+ output_hidden_states=output_hidden_states,
89
+ return_dict=return_dict,
90
+ )
91
+
92
+ hidden_states = outputs[0]
93
+ logits = self.lm_head(hidden_states)
94
+ return logits, labels
95
+
96
+ else:
97
+ return super().forward(
98
+ input_ids=input_ids,
99
+ attention_mask=attention_mask,
100
+ position_ids=position_ids,
101
+ past_key_values=past_key_values,
102
+ inputs_embeds=inputs_embeds,
103
+ labels=labels,
104
+ use_cache=use_cache,
105
+ output_attentions=output_attentions,
106
+ output_hidden_states=output_hidden_states,
107
+ return_dict=return_dict,
108
+ )
109
+
110
+ @torch.no_grad()
111
+ def generate(
112
+ self,
113
+ inputs: Optional[torch.Tensor] = None,
114
+ images: Optional[torch.Tensor] = None,
115
+ image_sizes: Optional[torch.Tensor] = None,
116
+ modalities: Optional[List[str]] = ["image"],
117
+ **kwargs,
118
+ ) -> Union[GenerateOutput, torch.LongTensor]:
119
+ position_ids = kwargs.pop("position_ids", None)
120
+ attention_mask = kwargs.pop("attention_mask", None)
121
+ if "inputs_embeds" in kwargs:
122
+ raise NotImplementedError("`inputs_embeds` is not supported")
123
+
124
+ if images is not None:
125
+ (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, image_sizes=image_sizes)
126
+ else:
127
+ inputs_embeds = self.get_model().embed_tokens(inputs)
128
+
129
+ return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs)
130
+
131
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
132
+ images = kwargs.pop("images", None)
133
+ image_sizes = kwargs.pop("image_sizes", None)
134
+ inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs)
135
+ if images is not None:
136
+ inputs["images"] = images
137
+ if image_sizes is not None:
138
+ inputs["image_sizes"] = image_sizes
139
+ return inputs
140
+
141
+
142
+ AutoConfig.register("llava_qwen", LlavaQwenConfig)
143
+ AutoModelForCausalLM.register(LlavaQwenConfig, LlavaQwenForCausalLM)
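+
+ # Illustrative usage sketch (not part of the original file): assumes a trained checkpoint and inputs
+ # already preprocessed in the LLaVA style, with image placeholders expanded to IMAGE_TOKEN_INDEX;
+ # the path, tensors, and sizes below are placeholders.
+ #
+ # model = LlavaQwenForCausalLM.from_pretrained("path/to/checkpoint", torch_dtype=torch.float16).cuda()
+ # output_ids = model.generate(inputs=input_ids, images=image_tensor, image_sizes=[[height, width]], max_new_tokens=128)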
llava/model/language_model/llava_qwen2_slowfast.py ADDED
@@ -0,0 +1,632 @@
1
+ # Copyright 2024 Hao Zhang
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union, Dict
17
+ import torch
18
+ import torch.nn as nn
19
+ from torch.nn import CrossEntropyLoss
20
+ from collections import OrderedDict
21
+
22
+ import transformers
23
+ # from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPast, CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+ from transformers import AutoConfig, AutoModelForCausalLM, Qwen2Config, Qwen2Model, Qwen2ForCausalLM
27
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
28
+
29
+ from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
30
+ from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
31
+ from llava.model.language_model.hybrid_decoder_layer import Qwen2DecoderLayer, Qwen2HybridDecoderLayer
32
+
33
+ class LlavaQwenSlowFastConfig(Qwen2Config):
34
+ model_type = "llava_qwen_slow_fast"
35
+
36
+
37
+ class LlavaQwenSlowFastModel(LlavaMetaModel, Qwen2Model):
38
+ config_class = LlavaQwenSlowFastConfig
39
+
40
+ def __init__(self, config: Qwen2Config):
41
+ super(LlavaQwenSlowFastModel, self).__init__(config)
42
+
43
+ # initialize the cross-attention layers
44
+ self.slow_branch_is_initialized = False
45
+
46
+ if hasattr(config, "cross_attn_every_n_layers"):
47
+ self.initialize_slow_branch_modules(config)
48
+
49
+ def initialize_slow_branch_modules(self, args):
50
+ if self.slow_branch_is_initialized:
51
+ return
52
+ # number of decoder layers
53
+ num_layers = len(self.layers)
54
+
55
+ cross_attn_every_n_layers = args.cross_attn_every_n_layers
56
+ cross_attn_gating_type = args.cross_attn_gating_type
57
+ cross_attn_implementation = args.cross_attn_implementation
58
+ cross_attn_max_layer_depth = getattr(args, "cross_attn_max_layer_depth", num_layers)
59
+ cross_attn_min_layer_depth = getattr(args, "cross_attn_min_layer_depth", 0)
60
+ if cross_attn_max_layer_depth is None:
61
+ cross_attn_max_layer_depth = num_layers
62
+ if cross_attn_min_layer_depth is None:
63
+ cross_attn_min_layer_depth = 0
64
+
65
+ self.config.cross_attn_every_n_layers = cross_attn_every_n_layers
66
+ self.config.cross_attn_implementation = cross_attn_implementation
67
+ self.config.cross_attn_gating_type = cross_attn_gating_type
68
+ self.config.cross_attn_max_layer_depth = cross_attn_max_layer_depth
69
+ self.config.cross_attn_min_layer_depth = cross_attn_min_layer_depth
70
+
71
+ # set pooling operations
72
+ tile_image_input = getattr(args, "tile_image_input", True) # tile all image inputs into a single video-like sequence
73
+ min_fast_frames = getattr(args, "min_fast_frames", 1) # force to sample at least `min_fast_frames` frames for fast visual tokens
74
+ if min_fast_frames is None:
75
+ min_fast_frames = 1
76
+
77
+ self.config.tile_image_input = tile_image_input
78
+ self.config.min_fast_frames = min_fast_frames
79
+
80
+ # generate layer index for the hybrid layer
81
+ hybrid_layer_idx = []
82
+ for i in range(cross_attn_min_layer_depth, cross_attn_max_layer_depth, cross_attn_every_n_layers):
83
+ hybrid_layer_idx.append(i)
84
+
85
+ # substitute the original decoder layer with hybrid layer
86
+ initialize_kv_from_lm = getattr(args, "initialize_cross_attn_kv_from_lm", False) # whether to use the LLM's pretrained k/v projections to initialize the cross-attention k/v projection weights
87
+ for idx in range(len(self.layers)):
88
+ if idx in hybrid_layer_idx:
89
+ original_decoder_layer = self.layers[idx]
90
+ hybrid_decoder_layer = Qwen2HybridDecoderLayer(self.config, layer_idx=idx, is_hyper_enabled=True, cross_attn_gating_type=cross_attn_gating_type, cross_attn_implementation=cross_attn_implementation)
91
+ _, unexpected_keys = hybrid_decoder_layer.load_state_dict(original_decoder_layer.state_dict(), strict=False) # may cause problems when using DeepSpeed ZeRO-3
92
+ if initialize_kv_from_lm and hasattr(hybrid_decoder_layer.self_attn, "cross_attn_kv_proj"):
93
+ kv_weight = torch.cat([original_decoder_layer.self_attn.k_proj.weight,
94
+ original_decoder_layer.self_attn.v_proj.weight], dim=0)
95
+ kv_bias = torch.cat([original_decoder_layer.self_attn.k_proj.bias,
96
+ original_decoder_layer.self_attn.v_proj.bias], dim=0)
97
+ new_state_dict = OrderedDict()
98
+ new_state_dict['weight'] = kv_weight
99
+ new_state_dict['bias'] = kv_bias
100
+ hybrid_decoder_layer.self_attn.cross_attn_kv_proj.load_state_dict(new_state_dict)
101
+ assert len(unexpected_keys) == 0
102
+ self.layers[idx] = hybrid_decoder_layer
103
+
104
+ # fast token config
105
+ self.config.fast_token_spatial_stride = args.fast_token_spatial_stride
106
+ self.config.fast_token_temporal_stride = args.fast_token_temporal_stride
107
+ self.config.fast_token_temporal_sampling_stride = args.fast_token_temporal_sampling_stride
108
+
109
+ self.slow_branch_is_initialized = True
110
+
111
+ def forward(
112
+ self,
113
+ input_ids: torch.LongTensor = None,
114
+ attention_mask: Optional[torch.Tensor] = None,
115
+ position_ids: Optional[torch.LongTensor] = None,
116
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
117
+ inputs_embeds: Optional[torch.FloatTensor] = None,
118
+ use_cache: Optional[bool] = None,
119
+ output_attentions: Optional[bool] = None,
120
+ output_hidden_states: Optional[bool] = None,
121
+ return_dict: Optional[bool] = None,
122
+ cache_position: Optional[torch.LongTensor] = None,
123
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
124
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
125
+ output_hidden_states = (
126
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
127
+ )
128
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
129
+
130
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
131
+
132
+ if (input_ids is None) ^ (inputs_embeds is not None):
133
+ raise ValueError(
134
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
135
+ )
136
+
137
+ if self.gradient_checkpointing and self.training:
138
+ if use_cache:
139
+ # logger.warning_once(
140
+ # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
141
+ # )
142
+ use_cache = False
143
+
144
+ # kept for BC (non `Cache` `past_key_values` inputs)
145
+ return_legacy_cache = False
146
+ if use_cache and not isinstance(past_key_values, Cache):
147
+ return_legacy_cache = True
148
+ if past_key_values is None:
149
+ past_key_values = DynamicCache()
150
+ else:
151
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
152
+ # logger.warning_once(
153
+ # "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
154
+ # "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
155
+ # "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
156
+ # )
157
+
158
+ if inputs_embeds is None:
159
+ inputs_embeds = self.embed_tokens(input_ids)
160
+
161
+ if cache_position is None:
162
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
163
+ cache_position = torch.arange(
164
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
165
+ )
166
+ if position_ids is None:
167
+ position_ids = cache_position.unsqueeze(0)
168
+
169
+ causal_mask = self._update_causal_mask(
170
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
171
+ )
172
+
173
+ hidden_states = inputs_embeds
174
+
175
+ # create position embeddings to be shared across the decoder layers
176
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
177
+
178
+ # decoder layers
179
+ all_hidden_states = () if output_hidden_states else None
180
+ all_self_attns = () if output_attentions else None
181
+ next_decoder_cache = None
182
+
183
+ for decoder_layer in self.layers:
184
+ if output_hidden_states:
185
+ all_hidden_states += (hidden_states,)
186
+
187
+ if self.gradient_checkpointing and self.training:
188
+ if not isinstance(decoder_layer, Qwen2HybridDecoderLayer):
189
+ layer_outputs = self._gradient_checkpointing_func(
190
+ decoder_layer.__call__,
191
+ hidden_states,
192
+ causal_mask,
193
+ position_ids,
194
+ past_key_values,
195
+ output_attentions,
196
+ use_cache,
197
+ cache_position,
198
+ position_embeddings,
199
+ )
200
+ else:
201
+ layer_outputs = decoder_layer(
202
+ hidden_states,
203
+ attention_mask=causal_mask,
204
+ position_ids=position_ids,
205
+ past_key_value=past_key_values,
206
+ output_attentions=output_attentions,
207
+ use_cache=use_cache,
208
+ cache_position=cache_position,
209
+ position_embeddings=position_embeddings,
210
+ )
211
+
212
+ hidden_states = layer_outputs[0]
213
+
214
+ if use_cache:
215
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
216
+
217
+ if output_attentions:
218
+ all_self_attns += (layer_outputs[1],)
219
+
220
+ hidden_states = self.norm(hidden_states)
221
+
222
+ # add hidden states from the last decoder layer
223
+ if output_hidden_states:
224
+ all_hidden_states += (hidden_states,)
225
+
226
+ next_cache = next_decoder_cache if use_cache else None
227
+ if return_legacy_cache:
228
+ next_cache = next_cache.to_legacy_cache()
229
+
230
+ if not return_dict:
231
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
232
+ return BaseModelOutputWithPast(
233
+ last_hidden_state=hidden_states,
234
+ past_key_values=next_cache,
235
+ hidden_states=all_hidden_states,
236
+ attentions=all_self_attns,
237
+ )
238
+
239
+
240
+
241
+ class LlavaQwenSlowFastForCausalLM(Qwen2ForCausalLM, LlavaMetaForCausalLM):
242
+ config_class = LlavaQwenSlowFastConfig
243
+
244
+ def __init__(self, config):
245
+ Qwen2ForCausalLM.__init__(self, config)
246
+ config.model_type = "llava_qwen_slow_fast"
247
+ config.rope_scaling = None
248
+
249
+ self.model = LlavaQwenSlowFastModel(config)
250
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
251
+ self.post_init()
252
+
253
+ def get_model(self):
254
+ return self.model
255
+
256
+ def _set_gradient_checkpointing(self, module, value=False):
257
+ if isinstance(module, Qwen2HybridDecoderLayer):
258
+ module.gradient_checkpointing = value
259
+
260
+ def forward(
261
+ self,
262
+ input_ids: torch.LongTensor = None,
263
+ attention_mask: Optional[torch.Tensor] = None,
264
+ position_ids: Optional[torch.LongTensor] = None,
265
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
266
+ inputs_embeds: Optional[torch.FloatTensor] = None,
267
+ labels: Optional[torch.LongTensor] = None,
268
+ use_cache: Optional[bool] = None,
269
+ output_attentions: Optional[bool] = None,
270
+ output_hidden_states: Optional[bool] = None,
271
+ images: Optional[torch.FloatTensor] = None,
272
+ image_sizes: Optional[List[List[int]]] = None,
273
+ return_dict: Optional[bool] = None,
274
+ modalities: Optional[List[str]] = ["image"],
275
+ cache_position=None,
276
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
277
+
278
+ if inputs_embeds is None:
279
+ (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes)
280
+
281
+ return super().forward(
282
+ input_ids=input_ids,
283
+ attention_mask=attention_mask,
284
+ position_ids=position_ids,
285
+ past_key_values=past_key_values,
286
+ inputs_embeds=inputs_embeds,
287
+ labels=labels,
288
+ use_cache=use_cache,
289
+ output_attentions=output_attentions,
290
+ output_hidden_states=output_hidden_states,
291
+ return_dict=return_dict,
292
+ )
293
+
294
+ @torch.no_grad()
295
+ def generate(
296
+ self,
297
+ inputs: Optional[torch.Tensor] = None,
298
+ images: Optional[torch.Tensor] = None,
299
+ image_sizes: Optional[torch.Tensor] = None,
300
+ modalities: Optional[List[str]] = ["image"],
301
+ **kwargs,
302
+ ) -> Union[GenerateOutput, torch.LongTensor]:
303
+ position_ids = kwargs.pop("position_ids", None)
304
+ attention_mask = kwargs.pop("attention_mask", None)
305
+ if "inputs_embeds" in kwargs:
306
+ raise NotImplementedError("`inputs_embeds` is not supported")
307
+
308
+ if images is not None:
309
+ (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, image_sizes=image_sizes)
310
+ else:
311
+ inputs_embeds = self.get_model().embed_tokens(inputs)
312
+
313
+ return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs)
314
+
315
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
316
+ images = kwargs.pop("images", None)
317
+ image_sizes = kwargs.pop("image_sizes", None)
318
+ inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs)
319
+ if images is not None:
320
+ inputs["images"] = images
321
+ if image_sizes is not None:
322
+ inputs["image_sizes"] = image_sizes
323
+ return inputs
324
+
325
+ def sample_fast_frames(self,
326
+ total_frames,
327
+ stride,
328
+ min_frame_number):
329
+
330
+ all_indices_list = list(range(total_frames))
331
+
332
+ if total_frames < min_frame_number:
333
+ return all_indices_list
334
+
335
+ sampled_frames = max(total_frames // stride, min_frame_number)
336
+ stride = total_frames / sampled_frames
337
+
338
+ fast_indices = [min(int(i * stride), total_frames-1) for i in range(sampled_frames)]
339
+
340
+ return fast_indices
341
+
342
+ def split_slow_fast_tokens(self,
343
+ visual_tokens,
344
+ temporal_sampling_stride=1,
345
+ spatial_stride=1,
346
+ temporal_stride=1):
347
+ # TODO: Min: this function is very messy and can be simplified.
348
+ if isinstance(visual_tokens, torch.Tensor):
349
+ # for all image inputs, only perform spatial pooling
350
+ b, n, c = visual_tokens.shape
351
+ h = w = int(n**0.5)
352
+ fast_visual_tokens = nn.functional.avg_pool2d(visual_tokens.reshape(b, h, w, c).permute(0, 3, 1, 2),
353
+ kernel_size=spatial_stride,
354
+ stride=spatial_stride).flatten(2,3).transpose(1,2)
355
+ return fast_visual_tokens, visual_tokens
356
+ else:
357
+ fast_visual_tokens = []
358
+ for sample_ in visual_tokens:
359
+ t, n, c = sample_.shape
360
+ if t > 1: # is a video
361
+ T_downsampling_rate = temporal_sampling_stride * temporal_stride
362
+
363
+ if t % T_downsampling_rate != 0:
364
+ padding_size = (T_downsampling_rate - t % T_downsampling_rate) % T_downsampling_rate
365
+ # Pad on the first dimension (sequence length) with zeros
366
+ sample_ = nn.functional.pad(sample_, (0, 0, 0, 0, 0, padding_size)) # (dim_pad_left, dim_pad_right, T_pad_left, T_pad_right)
367
+
368
+ # 1. temporal direct sampling
369
+ if temporal_sampling_stride > 1:
370
+ fast_token_indices = self.sample_fast_frames(total_frames=t,
371
+ stride=temporal_sampling_stride,
372
+ min_frame_number=self.config.min_fast_frames)
373
+ else:
374
+ fast_token_indices = list(range(t))
375
+
376
+ sample_ = torch.stack([sample_[idx] for idx in fast_token_indices], dim=0)
377
+ b, n, c = sample_.shape
378
+ h = w = int(n**0.5)
379
+ sample_ = sample_.reshape(b, h, w, c).permute(0, 3, 1, 2)
380
+
381
+ # 2. temporal average pooling
382
+ if temporal_stride > 1:
383
+ if (sample_.shape[0] // temporal_stride) >= self.config.min_fast_frames:
384
+ sample_ = nn.functional.avg_pool3d(sample_.transpose(0, 1), kernel_size=(temporal_stride, 1, 1)).transpose(0, 1)
385
+ else:
386
+ h_, w_ = sample_.shape[-2:]
387
+ output_frames_num = min(sample_.shape[0], self.config.min_fast_frames)
388
+ sample_ = nn.functional.adaptive_avg_pool3d(sample_.transpose(0, 1), output_size=(output_frames_num, h_, w_)).transpose(0, 1)
389
+
390
+ # 3. spatial pooling
391
+ if spatial_stride > 1:
392
+ sample_ = nn.functional.avg_pool2d(sample_,
393
+ kernel_size=spatial_stride,
394
+ stride=spatial_stride)
395
+ sample_ = sample_.flatten(2,3).transpose(1,2)
396
+
397
+ else:
398
+ if spatial_stride > 1:
399
+ h = w = int(n**0.5)
400
+ sample_ = sample_.reshape(t, h, w, c).permute(0, 3, 1, 2)
401
+ sample_ = nn.functional.avg_pool2d(sample_,
402
+ kernel_size=spatial_stride,
403
+ stride=spatial_stride)
404
+ sample_ = sample_.flatten(2,3).transpose(1,2)
405
+
406
+ fast_visual_tokens.append(sample_.flatten(0, 1).contiguous())
407
+ slow_visual_tokens = [_.flatten(0, 1).contiguous() for _ in visual_tokens]
408
+
409
+ return fast_visual_tokens, slow_visual_tokens
410
+
411
+
412
+ def prepare_inputs_labels_for_multimodal(
413
+ self, input_ids, position_ids, attention_mask, past_key_values, labels,
414
+ images, image_sizes=None
415
+ ):
416
+ vision_tower = self.get_vision_tower()
417
+ if vision_tower is None or images is None or input_ids.shape[1] == 1:
418
+ # clear the visual tokens if current one is a pure text sample
419
+ if images is None and input_ids.shape[1] > 1:
420
+ for layer in self.get_decoder().layers:
421
+ if hasattr(layer, "clear_vis_x"):
422
+ layer.clear_vis_x()
423
+
424
+ token_types = torch.ones_like(input_ids, dtype=input_ids.dtype, device=input_ids.device)
425
+ for layer in self.get_decoder().layers:
426
+ if hasattr(layer, "condition_vis_x"):
427
+ layer.media_locations = token_types
428
+
429
+ return input_ids, position_ids, attention_mask, past_key_values, None, labels
430
+
431
+ # handle image input
432
+ images = [image if len(image.shape) == 4 else image.unsqueeze(0) for image in images] # list [ [T, C, H, W], ]
433
+ feature_split_size = [x.shape[0] for x in images]
434
+ all_features, feature_split_size = self.encode_images(torch.cat(images, dim=0), feature_split_size)
435
+
436
+ raw_image_features = torch.split(all_features, feature_split_size, dim=0)
437
+ image_features = []
438
+ for sample_feat in raw_image_features: # initial spatial pooling for all video tokens
439
+ if sample_feat.shape[0] > 1 and self.config.mm_video_pooling_stride > 1:
440
+ b, n, c = sample_feat.shape
441
+ h = w = int(n**0.5)
442
+ sample_feat = nn.functional.avg_pool2d(sample_feat.reshape(b, h, w, c).permute(0, 3, 1, 2),
443
+ kernel_size=self.config.mm_video_pooling_stride,
444
+ stride=self.config.mm_video_pooling_stride).flatten(2,3).transpose(1,2)
445
+ image_features.append(sample_feat.contiguous())
446
+ del raw_image_features, all_features
447
+
448
+ ## generate fast and slow tokens
449
+ image_features, slow_image_features = self.split_slow_fast_tokens(image_features,
450
+ temporal_sampling_stride=self.config.fast_token_temporal_sampling_stride,
451
+ spatial_stride=self.config.fast_token_spatial_stride,
452
+ temporal_stride=self.config.fast_token_temporal_stride)
453
+
454
+ ## set cross-attention states
455
+ if isinstance(slow_image_features, (list, tuple)):
456
+ padded_tensors = torch.nn.utils.rnn.pad_sequence(slow_image_features, batch_first=True)
457
+ cross_attn_mask = torch.ones(padded_tensors.shape[:-1], dtype=torch.bool, device=padded_tensors.device)
458
+ for i, tensor in enumerate(slow_image_features):
459
+ cross_attn_mask[i, len(tensor):] = False # Mark padded elements as False
460
+ slow_image_features = padded_tensors
461
+ else:
462
+ cross_attn_mask = torch.ones(slow_image_features.shape[:-1], dtype=torch.bool, device=slow_image_features.device)
463
+
464
+ # TODO: image start / end is not implemented here to support pretraining.
465
+ if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
466
+ raise NotImplementedError
467
+
468
+ # Let's just add dummy tensors if they do not exist,
469
+ # it is a headache to deal with None all the time.
470
+ # But it is not ideal, and if you have a better idea,
471
+ # please open an issue / submit a PR, thanks.
472
+ _labels = labels
473
+ _position_ids = position_ids
474
+ _attention_mask = attention_mask
475
+ if attention_mask is None:
476
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
477
+ else:
478
+ attention_mask = attention_mask.bool()
479
+ if position_ids is None:
480
+ position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
481
+ if labels is None:
482
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
483
+
484
+ # remove the padding using attention_mask -- FIXME
485
+ _input_ids = input_ids
486
+ input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
487
+ labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
488
+
489
+ new_input_embeds = []
490
+ new_labels = []
491
+ cur_image_idx = 0
492
+ new_token_types = []
493
+ # NOTE: Min: we need to record the type of tokens so that we can split the tokens in the hybrid decoder layer
494
+ # Token type 1: user's input and system tokens, 2: response text tokens, 3: visual tokens, 4: invalid tokens (padding)
495
+
496
+ for batch_idx, cur_input_ids in enumerate(input_ids):
497
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
498
+ if num_images == 0:
499
+ cur_image_features = image_features[cur_image_idx]
500
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
501
+ cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
502
+ new_input_embeds.append(cur_input_embeds)
503
+ new_labels.append(labels[batch_idx])
504
+
505
+ cur_token_type = torch.full((cur_input_ids.shape[0],), 2, dtype=cur_input_ids[-1].dtype, device=cur_input_ids[-1].device)
506
+ cur_token_type[labels[batch_idx] == IGNORE_INDEX] = 1 # tokens with IGNORE_INDEX labels are treated as user input
507
+ new_token_types.append(cur_token_type)
508
+
509
+ cur_image_idx += 1
510
+ continue
511
+
512
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
513
+ cur_input_ids_noim = []
514
+ cur_labels = labels[batch_idx]
515
+ cur_labels_noim = []
516
+ cur_token_type_noim = []
517
+
518
+ for i in range(len(image_token_indices) - 1):
519
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
520
+ cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
521
+
522
+ cur_token = torch.full((cur_labels_noim[-1].shape[0],), 2, dtype=cur_input_ids_noim[-1].dtype, device=cur_input_ids_noim[-1].device)
523
+ cur_token[cur_labels[image_token_indices[i]+1:image_token_indices[i+1]] == IGNORE_INDEX] = 1 # ignored tokens are treated as user input
524
+ cur_token_type_noim.append(cur_token)
525
+
526
+ split_sizes = [x.shape[0] for x in cur_labels_noim]
527
+ cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
528
+ cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
529
+ cur_new_input_embeds = []
530
+ cur_new_labels = []
531
+ cur_new_token_type = []
532
+
533
+ for i in range(num_images + 1):
534
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
535
+ cur_new_labels.append(cur_labels_noim[i])
536
+ cur_new_token_type.append(cur_token_type_noim[i])
537
+
538
+ if i < num_images:
539
+ cur_image_features = image_features[cur_image_idx]
540
+ cur_image_idx += 1
541
+ cur_new_input_embeds.append(cur_image_features)
542
+ cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
543
+ cur_new_token_type.append(torch.full((cur_image_features.shape[0],), 3, device=cur_labels.device, dtype=cur_labels.dtype)) # insert image token type
544
+
545
+ cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
546
+
547
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
548
+ cur_new_labels = torch.cat(cur_new_labels)
549
+ cur_new_token_type = torch.cat(cur_new_token_type) ##
550
+
551
+ new_input_embeds.append(cur_new_input_embeds)
552
+ new_labels.append(cur_new_labels)
553
+ new_token_types.append(cur_new_token_type) ##
554
+
555
+ # Truncate sequences to max length as image embeddings can make the sequence longer
556
+ tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
557
+ if tokenizer_model_max_length is not None:
558
+ new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
559
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
560
+ new_token_types = [x[:tokenizer_model_max_length] for x in new_token_types]
561
+
562
+ # Combine them
563
+ max_len = max(x.shape[0] for x in new_input_embeds)
564
+ batch_size = len(new_input_embeds)
565
+
566
+ new_input_embeds_padded = []
567
+ new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
568
+ attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
569
+ position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
570
+ new_token_types_padded = torch.full((batch_size, max_len), 4, dtype=new_labels[0].dtype, device=new_labels[0].device) ## 4 is invalid token type (padding)
571
+
572
+ for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
573
+ cur_len = cur_new_embed.shape[0]
574
+ if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
575
+ new_input_embeds_padded.append(torch.cat((
576
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
577
+ cur_new_embed
578
+ ), dim=0))
579
+ if cur_len > 0:
580
+ new_labels_padded[i, -cur_len:] = cur_new_labels
581
+ attention_mask[i, -cur_len:] = True
582
+ position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
583
+ new_token_types_padded[i, -cur_len:] = new_token_types[i] ##
584
+ else:
585
+ new_input_embeds_padded.append(torch.cat((
586
+ cur_new_embed,
587
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
588
+ ), dim=0))
589
+ if cur_len > 0:
590
+ new_labels_padded[i, :cur_len] = cur_new_labels
591
+ attention_mask[i, :cur_len] = True
592
+ position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
593
+ new_token_types_padded[i, :cur_len] = new_token_types[i]
594
+
595
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
596
+
597
+ if _labels is None:
598
+ new_labels = None
599
+ else:
600
+ new_labels = new_labels_padded
601
+
602
+ if _attention_mask is None:
603
+ attention_mask = None
604
+ else:
605
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
606
+
607
+ if _position_ids is None:
608
+ position_ids = None
609
+
610
+ # token type
611
+ token_types = new_token_types_padded
612
+ # send token type to cross-attn layers
613
+ if _input_ids is not None and _input_ids.shape[-1] == 1:
614
+ pass
615
+ else:
616
+ if slow_image_features is not None:
617
+ for layer in self.get_decoder().layers:
618
+ if hasattr(layer, "condition_vis_x"):
619
+ layer.condition_vis_x(slow_image_features,
620
+ cross_attn_mask,
621
+ token_type=token_types)
622
+ else:
623
+ for layer in self.get_decoder().layers:
624
+ if hasattr(layer, "clear_vis_x"):
625
+ layer.clear_vis_x()
626
+
627
+
628
+ return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
629
+
630
+
631
+ AutoConfig.register("llava_qwen_slow_fast", LlavaQwenSlowFastConfig)
632
+ AutoModelForCausalLM.register(LlavaQwenSlowFastConfig, LlavaQwenSlowFastForCausalLM)
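The slow/fast split implemented in split_slow_fast_tokens above gives the two visual streams very different budgets: the slow tokens keep every frame for cross-attention in the hybrid decoder layers, while the fast tokens are thinned by temporal sampling, temporal pooling, and spatial pooling before being spliced into the LLM input. A rough back-of-the-envelope sketch of the resulting sizes, with illustrative stride values and ignoring the padding / min_fast_frames edge cases handled in the code:

# Illustrative numbers only; the strides correspond to fast_token_temporal_sampling_stride,
# fast_token_temporal_stride and fast_token_spatial_stride in the config.
T, N = 64, 576                       # frames, visual tokens per frame after the projector
t_sample, t_pool, s_pool = 2, 2, 2   # temporal sampling / temporal pooling / spatial pooling strides
min_fast_frames = 1

fast_frames = max(max(T // t_sample, min_fast_frames) // t_pool, min_fast_frames)
fast_tokens = fast_frames * (N // (s_pool * s_pool))   # 16 * 144 = 2304 tokens in the LLM sequence
slow_tokens = T * N                                    # 64 * 576 = 36864 tokens kept for cross-attention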
llava/model/llava_arch.py ADDED
@@ -0,0 +1,355 @@
1
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
2
+
3
+ # Copyright 2023 Haotian Liu
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+
18
+ from abc import ABC, abstractmethod
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from .multimodal_encoder.builder import build_vision_tower
24
+ from .multimodal_projector.builder import build_vision_projector
25
+
26
+ from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
27
+
28
+ from llava.mm_utils import get_anyres_image_grid_shape
29
+
30
+
31
+ class LlavaMetaModel:
32
+
33
+ def __init__(self, config):
34
+ super(LlavaMetaModel, self).__init__(config)
35
+
36
+ if hasattr(config, "mm_vision_tower"):
37
+ self.vision_tower = build_vision_tower(config, delay_load=True)
38
+ fpn_input_dim = [] if not hasattr(self.vision_tower, "fpn_input_dim") else self.vision_tower.fpn_input_dim
39
+ self.mm_projector = build_vision_projector(config, fpn_input_dim=fpn_input_dim)
40
+
41
+ if 'unpad' in getattr(config, 'mm_patch_merge_type', ''):
42
+ self.image_newline = nn.Parameter(
43
+ torch.empty(config.hidden_size, dtype=self.dtype)
44
+ )
45
+
46
+ def get_vision_tower(self):
47
+ vision_tower = getattr(self, 'vision_tower', None)
48
+ if type(vision_tower) is list:
49
+ vision_tower = vision_tower[0]
50
+ return vision_tower
51
+
52
+ def initialize_vision_modules(self, model_args, fsdp=None):
53
+ vision_tower = model_args.vision_tower
54
+ mm_vision_select_layer = model_args.mm_vision_select_layer
55
+ mm_vision_select_feature = model_args.mm_vision_select_feature
56
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
57
+ mm_patch_merge_type = model_args.mm_patch_merge_type
58
+ mm_video_pooling_stride = model_args.mm_video_pooling_stride
59
+
60
+ self.config.mm_vision_tower = vision_tower
61
+
62
+ if self.get_vision_tower() is None:
63
+ vision_tower = build_vision_tower(model_args)
64
+
65
+ if fsdp is not None and len(fsdp) > 0:
66
+ self.vision_tower = [vision_tower]
67
+ else:
68
+ self.vision_tower = vision_tower
69
+ else:
70
+ if fsdp is not None and len(fsdp) > 0:
71
+ vision_tower = self.vision_tower[0]
72
+ else:
73
+ vision_tower = self.vision_tower
74
+ vision_tower.load_model()
75
+
76
+ self.config.use_mm_proj = True
77
+ self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
78
+ self.config.mm_hidden_size = vision_tower.hidden_size
79
+ self.config.mm_vision_select_layer = mm_vision_select_layer
80
+ self.config.mm_vision_select_feature = mm_vision_select_feature
81
+ self.config.mm_patch_merge_type = mm_patch_merge_type
82
+ self.config.mm_video_pooling_stride = mm_video_pooling_stride
83
+
84
+ if getattr(self, 'mm_projector', None) is None:
85
+ fpn_input_dim = [] if not hasattr(self.vision_tower, "fpn_input_dim") else self.vision_tower.fpn_input_dim
86
+ self.mm_projector = build_vision_projector(self.config, fpn_input_dim=fpn_input_dim)
87
+
88
+ if 'unpad' in mm_patch_merge_type:
89
+ embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
90
+ self.image_newline = nn.Parameter(
91
+ torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
92
+ )
93
+ else:
94
+ # In case it is frozen by LoRA
95
+ for p in self.mm_projector.parameters():
96
+ p.requires_grad = True
97
+
98
+ if pretrain_mm_mlp_adapter is not None:
99
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
100
+ def get_w(weights, keyword):
101
+ return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
102
+
103
+ self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
104
+
105
+
106
+ def unpad_image(tensor, original_size):
107
+ """
108
+ Unpads a PyTorch tensor of a padded and resized image.
109
+
110
+ Args:
111
+ tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
112
+ original_size (tuple): The original size of the image (height, width).
113
+
114
+ Returns:
115
+ torch.Tensor: The unpadded image tensor.
116
+ """
117
+ original_width, original_height = original_size
118
+ current_height, current_width = tensor.shape[1:]
119
+
120
+ original_aspect_ratio = original_width / original_height
121
+ current_aspect_ratio = current_width / current_height
122
+
123
+ if original_aspect_ratio > current_aspect_ratio:
124
+ scale_factor = current_width / original_width
125
+ new_height = int(original_height * scale_factor)
126
+ padding = (current_height - new_height) // 2
127
+ unpadded_tensor = tensor[:, padding:current_height - padding, :]
128
+ else:
129
+ scale_factor = current_height / original_height
130
+ new_width = int(original_width * scale_factor)
131
+ padding = (current_width - new_width) // 2
132
+ unpadded_tensor = tensor[:, :, padding:current_width - padding]
133
+
134
+ return unpadded_tensor
135
+
136
+
137
+ class LlavaMetaForCausalLM(ABC):
138
+
139
+ @abstractmethod
140
+ def get_model(self):
141
+ pass
142
+
143
+ def get_vision_tower(self):
144
+ return self.get_model().get_vision_tower()
145
+
146
+ def encode_images(self, images, feat_split_size=None):
147
+ image_features = self.get_model().get_vision_tower()(images)
148
+
149
+ if "st" in self.config.mm_projector_type:
150
+ # need temporal correlations
151
+ if feat_split_size is None:
152
+ feat_split_size = [1] * images.shape[0]
153
+ image_features = image_features.split(feat_split_size)
154
+ image_features = self.get_model().mm_projector(image_features)
155
+ feat_split_size = [_.shape[0] for _ in image_features]
156
+ image_features = torch.cat(image_features, dim=0)
157
+
158
+ else:
159
+ image_features = self.get_model().mm_projector(image_features)
160
+ return image_features, feat_split_size
161
+
162
+ def prepare_inputs_labels_for_multimodal(
163
+ self, input_ids, position_ids, attention_mask, past_key_values, labels,
164
+ images, image_sizes=None
165
+ ):
166
+ vision_tower = self.get_vision_tower()
167
+ if vision_tower is None or images is None or input_ids.shape[1] == 1:
168
+ return input_ids, position_ids, attention_mask, past_key_values, None, labels
169
+
170
+ if type(images) is list: # for each element, either video tensor [T, C, H, W], or image tensor [C, H, W]
171
+
172
+ images = [image if len(image.shape) == 4 else image.unsqueeze(0) for image in images] # list [ [T, C, H, W], ]
173
+ feature_split_size = [x.shape[0] for x in images]
174
+ all_features, feature_split_size = self.encode_images(torch.cat(images, dim=0), feature_split_size)
175
+ image_features_raw = torch.split(all_features, feature_split_size, dim=0)
176
+
177
+ image_features = []
178
+ for sample_feat in image_features_raw:
179
+ if sample_feat.shape[0] > 1 and self.config.mm_video_pooling_stride > 1: # is video and use different pooling
180
+ b, n, c = sample_feat.shape
181
+ h = w = int(n**0.5)
182
+ sample_feat = nn.functional.avg_pool2d(sample_feat.reshape(b, h, w, c).permute(0, 3, 1, 2),
183
+ kernel_size=self.config.mm_video_pooling_stride,
184
+ stride=self.config.mm_video_pooling_stride).flatten(2,3).transpose(1,2)
185
+
186
+ image_features.append(sample_feat.flatten(0,1).contiguous())
187
+
188
+ else:
189
+ image_features, feature_split_size = self.encode_images(images)
190
+
191
+ # TODO: image start / end is not implemented here to support pretraining.
192
+ if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
193
+ raise NotImplementedError
194
+
195
+ # Let's just add dummy tensors if they do not exist,
196
+ # it is a headache to deal with None all the time.
197
+ # But it is not ideal, and if you have a better idea,
198
+ # please open an issue / submit a PR, thanks.
199
+ _labels = labels
200
+ _position_ids = position_ids
201
+ _attention_mask = attention_mask
202
+ if attention_mask is None:
203
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
204
+ else:
205
+ attention_mask = attention_mask.bool()
206
+ if position_ids is None:
207
+ position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
208
+ if labels is None:
209
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
210
+
211
+ # remove the padding using attention_mask -- FIXME
212
+ _input_ids = input_ids
213
+ input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
214
+ labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
215
+
216
+ new_input_embeds = []
217
+ new_labels = []
218
+ cur_image_idx = 0
219
+ for batch_idx, cur_input_ids in enumerate(input_ids):
220
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
221
+ if num_images == 0:
222
+ cur_image_features = image_features[cur_image_idx]
223
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
224
+ cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
225
+ new_input_embeds.append(cur_input_embeds)
226
+ new_labels.append(labels[batch_idx])
227
+ cur_image_idx += 1
228
+ continue
229
+
230
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
231
+ cur_input_ids_noim = []
232
+ cur_labels = labels[batch_idx]
233
+ cur_labels_noim = []
234
+ for i in range(len(image_token_indices) - 1):
235
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
236
+ cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
237
+ split_sizes = [x.shape[0] for x in cur_labels_noim]
238
+ cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
239
+ cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
240
+ cur_new_input_embeds = []
241
+ cur_new_labels = []
242
+
243
+ for i in range(num_images + 1):
244
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
245
+ cur_new_labels.append(cur_labels_noim[i])
246
+ if i < num_images:
247
+ cur_image_features = image_features[cur_image_idx]
248
+ cur_image_idx += 1
249
+ cur_new_input_embeds.append(cur_image_features)
250
+ cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
251
+
252
+ cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
253
+
254
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
255
+ cur_new_labels = torch.cat(cur_new_labels)
256
+
257
+ new_input_embeds.append(cur_new_input_embeds)
258
+ new_labels.append(cur_new_labels)
259
+
260
+ # Truncate sequences to max length as image embeddings can make the sequence longer
261
+ tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
262
+ if tokenizer_model_max_length is not None:
263
+ new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
264
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
265
+
266
+ # Combine them
267
+ max_len = max(x.shape[0] for x in new_input_embeds)
268
+ batch_size = len(new_input_embeds)
269
+
270
+ new_input_embeds_padded = []
271
+ new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
272
+ attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
273
+ position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
274
+
275
+ for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
276
+ cur_len = cur_new_embed.shape[0]
277
+ if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
278
+ new_input_embeds_padded.append(torch.cat((
279
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
280
+ cur_new_embed
281
+ ), dim=0))
282
+ if cur_len > 0:
283
+ new_labels_padded[i, -cur_len:] = cur_new_labels
284
+ attention_mask[i, -cur_len:] = True
285
+ position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
286
+ else:
287
+ new_input_embeds_padded.append(torch.cat((
288
+ cur_new_embed,
289
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
290
+ ), dim=0))
291
+ if cur_len > 0:
292
+ new_labels_padded[i, :cur_len] = cur_new_labels
293
+ attention_mask[i, :cur_len] = True
294
+ position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
295
+
296
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
297
+
298
+ if _labels is None:
299
+ new_labels = None
300
+ else:
301
+ new_labels = new_labels_padded
302
+
303
+ if _attention_mask is None:
304
+ attention_mask = None
305
+ else:
306
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
307
+
308
+ if _position_ids is None:
309
+ position_ids = None
310
+
311
+ return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
312
+
313
+ def initialize_vision_tokenizer(self, model_args, tokenizer):
314
+ if model_args.mm_use_im_patch_token:
315
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
316
+ self.resize_token_embeddings(len(tokenizer))
317
+
318
+ if model_args.mm_use_im_start_end:
319
+ num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
320
+ self.resize_token_embeddings(len(tokenizer))
321
+
322
+ if num_new_tokens > 0:
323
+ input_embeddings = self.get_input_embeddings().weight.data
324
+ output_embeddings = self.get_output_embeddings().weight.data
325
+
326
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
327
+ dim=0, keepdim=True)
328
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
329
+ dim=0, keepdim=True)
330
+
331
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
332
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
333
+
334
+ if model_args.tune_mm_mlp_adapter:
335
+ for p in self.get_input_embeddings().parameters():
336
+ p.requires_grad = True
337
+ for p in self.get_output_embeddings().parameters():
338
+ p.requires_grad = False
339
+
340
+ if model_args.pretrain_mm_mlp_adapter:
341
+ mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
342
+ embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
343
+ assert num_new_tokens == 2
344
+ if input_embeddings.shape == embed_tokens_weight.shape:
345
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
346
+ elif embed_tokens_weight.shape[0] == num_new_tokens:
347
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight
348
+ else:
349
+ raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
350
+ elif model_args.mm_use_im_patch_token:
351
+ if model_args.tune_mm_mlp_adapter:
352
+ for p in self.get_input_embeddings().parameters():
353
+ p.requires_grad = False
354
+ for p in self.get_output_embeddings().parameters():
355
+ p.requires_grad = False
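prepare_inputs_labels_for_multimodal above replaces every IMAGE_TOKEN_INDEX placeholder with the projected image features and masks the inserted span with IGNORE_INDEX in the labels. A stripped-down, single-sample sketch of that splicing step (illustrative shapes; the token ids double as stand-in labels for brevity, not part of this commit):

import torch

IMAGE_TOKEN_INDEX, IGNORE_INDEX = -200, -100
input_ids = torch.tensor([1, 2, IMAGE_TOKEN_INDEX, 3, 4])
text_embeds = torch.randn(5, 8)        # stand-in for embed_tokens over the text positions
image_feats = torch.randn(144, 8)      # stand-in for one image's projected features

pos = (input_ids == IMAGE_TOKEN_INDEX).nonzero().item()
embeds = torch.cat([text_embeds[:pos], image_feats, text_embeds[pos + 1:]])
labels = torch.cat([input_ids[:pos],
                    torch.full((image_feats.shape[0],), IGNORE_INDEX),
                    input_ids[pos + 1:]])
assert embeds.shape[0] == labels.shape[0] == 4 + 144   # text slots + image slots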
llava/model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,29 @@
1
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
2
+
3
+ import os
4
+ from .clip_encoder import CLIPVisionTower, SiglipVisionTower
5
+ from copy import deepcopy
6
+ from .convnext_encoder import ConvNextVisionTower
7
+
8
+ def build_vision_tower(vision_tower_cfg, **kwargs):
9
+ vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
10
+
11
+ if "clip" in vision_tower and vision_tower.startswith("openai"):
12
+ is_absolute_path_exists = os.path.exists(vision_tower)
13
+ if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower:
14
+ return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
15
+ raise ValueError(f'Unknown vision tower: {vision_tower}')
16
+
17
+ elif "siglip" in vision_tower:
18
+ vision_tower_cfg.input_image_size = 384
19
+ return SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
20
+
21
+ elif vision_tower == "convnext-576":
22
+ ## ConvNeXt
23
+ convnext_args = deepcopy(vision_tower_cfg)
24
+ convnext_args.freeze_vision = False
25
+ convnext_args.input_image_size = 576
26
+ convnext_vision_tower = "convnext_xxlarge.clip_laion2b_soup" # hardcode
27
+ return ConvNextVisionTower(convnext_vision_tower, convnext_args)
28
+
29
+ raise ValueError(f'Unknown vision tower: {vision_tower}')
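build_vision_tower dispatches purely on the tower name string, so a config namespace carrying the fields the chosen tower reads is enough to construct it. A hypothetical call sketch (the checkpoint name and field values are placeholders, not taken from this commit):

from types import SimpleNamespace
from llava.model.multimodal_encoder.builder import build_vision_tower

cfg = SimpleNamespace(
    mm_vision_tower="google/siglip-so400m-patch14-384",   # any name containing "siglip" takes the SigLIP branch
    mm_vision_select_layer=-2,
    mm_vision_select_feature="patch",
)
tower = build_vision_tower(cfg, delay_load=True)   # SiglipVisionTower; the builder forces input_image_size to 384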
llava/model/multimodal_encoder/clip_encoder.py ADDED
@@ -0,0 +1,190 @@
1
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from transformers import (
6
+ CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig,
7
+ SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig
8
+ )
9
+
10
+
11
+ class CLIPVisionTower(nn.Module):
12
+ def __init__(self, vision_tower, args, delay_load=False):
13
+ super().__init__()
14
+
15
+ self.is_loaded = False
16
+
17
+ self.vision_tower_name = vision_tower
18
+ self.select_layer = args.mm_vision_select_layer
19
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
20
+
21
+ if not delay_load:
22
+ self.load_model()
23
+ elif getattr(args, 'unfreeze_mm_vision_tower', False):
24
+ self.load_model()
25
+ else:
26
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
27
+
28
+ def load_model(self, device_map=None):
29
+ if self.is_loaded:
30
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
31
+ return
32
+
33
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
34
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
35
+ self.vision_tower.requires_grad_(False)
36
+
37
+ self.is_loaded = True
38
+
39
+ def feature_select(self, image_forward_outs):
40
+ image_features = image_forward_outs.hidden_states[self.select_layer]
41
+ if self.select_feature == 'patch':
42
+ image_features = image_features[:, 1:]
43
+ elif self.select_feature == 'cls_patch':
44
+ image_features = image_features
45
+ else:
46
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
47
+ return image_features
48
+
49
+ @torch.no_grad()
50
+ def forward(self, images):
51
+ if type(images) is list:
52
+ image_features = []
53
+ for image in images:
54
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
55
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
56
+ image_features.append(image_feature)
57
+ else:
58
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
59
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
60
+
61
+ return image_features
62
+
63
+ @property
64
+ def dummy_feature(self):
65
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
66
+
67
+ @property
68
+ def dtype(self):
69
+ return self.vision_tower.dtype
70
+
71
+ @property
72
+ def device(self):
73
+ return self.vision_tower.device
74
+
75
+ @property
76
+ def config(self):
77
+ if self.is_loaded:
78
+ return self.vision_tower.config
79
+ else:
80
+ return self.cfg_only
81
+
82
+ @property
83
+ def hidden_size(self):
84
+ return self.config.hidden_size
85
+
86
+ @property
87
+ def num_patches_per_side(self):
88
+ return self.config.image_size // self.config.patch_size
89
+
90
+ @property
91
+ def num_patches(self):
92
+ return (self.config.image_size // self.config.patch_size) ** 2
93
+
94
+ @property
95
+ def image_size(self):
96
+ return self.config.image_size
97
+
98
+
99
+ class SiglipVisionTower(nn.Module):
100
+
101
+ def __init__(self, vision_tower, args, delay_load=False):
102
+ super().__init__()
103
+
104
+ self.is_loaded = False
105
+ self.vision_tower_name = vision_tower
106
+ self.select_layer = args.mm_vision_select_layer
107
+ self.input_image_size = args.input_image_size
108
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
109
+ self.is_loaded = False
110
+
111
+ if not delay_load:
112
+ self.load_model()
113
+ else:
114
+ self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name)
115
+
116
+
117
+ def load_model(self, device_map=None):
118
+ if self.is_loaded:
119
+ return
120
+ self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
121
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
122
+ self.image_processor.crop_size = {'height':self.input_image_size, 'width':self.input_image_size}
123
+ self.is_loaded = True
124
+
125
+ def feature_select(self, image_forward_outs, dtype):
126
+ image_features = image_forward_outs.hidden_states
127
+ if self.select_feature == 'patch':
128
+ image_features = image_features[self.select_layer].to(dtype)
129
+ elif self.select_feature == 'list':
130
+ image_features = [feature.to(dtype) for feature in image_features[::7]]
131
+ else:
132
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
133
+ return image_features
134
+
135
+ def forward(self, images):
136
+ if type(images) is list:
137
+ image_features = []
138
+ for image in images:
139
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
140
+ image_feature = self.feature_select(image_forward_out, image.dtype)
141
+ image_features.append(image_feature)
142
+ else:
143
+ batch_size = images.shape[0]
144
+ chunk_size = 256
145
+ image_features = []
146
+
147
+ for i in range(0, batch_size, chunk_size):
148
+ chunk = images[i:i+chunk_size].to(device=self.device, dtype=self.dtype)
149
+ chunk_forward_outs = self.vision_tower(chunk, output_hidden_states=True)
150
+ chunk_features = self.feature_select(chunk_forward_outs, images.dtype)
151
+ image_features.append(chunk_features)
152
+
153
+ image_features = torch.cat(image_features, dim=0)
154
+
155
+ return image_features
156
+
157
+ @property
158
+ def dummy_feature(self):
159
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
160
+
161
+ @property
162
+ def dtype(self):
163
+ return self.vision_tower.dtype
164
+
165
+ @property
166
+ def device(self):
167
+ return self.vision_tower.device
168
+
169
+ @property
170
+ def config(self):
171
+ if self.is_loaded:
172
+ return self.vision_tower.config
173
+ else:
174
+ return self.cfg_only
175
+
176
+ @property
177
+ def hidden_size(self):
178
+ return self.config.hidden_size
179
+
180
+ @property
181
+ def num_patches(self):
182
+ return (self.config.image_size // self.config.patch_size) ** 2
183
+
184
+ @property
185
+ def num_patches_per_side(self):
186
+ return self.config.image_size // self.config.patch_size
187
+
188
+ @property
189
+ def image_size(self):
190
+ return self.config.image_size
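One detail worth noting in SiglipVisionTower.forward above is the chunked batch path: large frame batches (e.g. long videos) are encoded in slices of 256 images and concatenated, rather than in a single pass. The pattern, reduced to a minimal sketch (the encoder callable and sizes are placeholders):

import torch

def chunked_encode(encoder, frames, chunk_size=256):
    # encode `frames` (B, C, H, W) in slices to bound peak activation memory
    feats = [encoder(frames[i:i + chunk_size]) for i in range(0, frames.shape[0], chunk_size)]
    return torch.cat(feats, dim=0)

# e.g. 600 frames -> three encoder calls over 256, 256 and 88 frames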
llava/model/multimodal_encoder/convnext_encoder.py ADDED
@@ -0,0 +1,143 @@
1
+ # This file is modified from https://github.com/luogen1996/LLaVA-HR
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import CLIPImageProcessor
6
+ from .vision_models.convnext import convnext_xxlarge
7
+ from torch.utils.checkpoint import checkpoint
8
+ from llava.utils import load_state_dict_into_model
9
+
10
+ import safetensors
11
+ from collections import OrderedDict
12
+
13
+ cfg={
14
+ "crop_size": 256,
15
+ "do_center_crop": True,
16
+ "do_normalize": True,
17
+ "do_resize": True,
18
+ "feature_extractor_type": "CLIPFeatureExtractor",
19
+ "image_mean": [
20
+ 0.48145466,
21
+ 0.4578275,
22
+ 0.40821073
23
+ ],
24
+ "image_std": [
25
+ 0.26862954,
26
+ 0.26130258,
27
+ 0.27577711
28
+ ],
29
+ "resample": 3,
30
+ "size": 256
31
+ }
32
+
33
+ class ConvNextVisionTower(nn.Module):
34
+ def __init__(self, vision_tower, args, delay_load=False):
35
+ super().__init__()
36
+
37
+ self.is_loaded = False
38
+ self.freeze_vision=args.freeze_vision
39
+ self.input_image_size=args.input_image_size
40
+ self.vision_tower_name = vision_tower
41
+ self.select_layer = -1 # hardcode
42
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
43
+
44
+ self.load_model()
45
+
46
+ def load_model(self):
47
+ self.image_processor = CLIPImageProcessor(**cfg)
48
+ if 'xxlarge' in self.vision_tower_name:
49
+ self.vision_tower = convnext_xxlarge(self.vision_tower_name)
50
+ setattr(self.vision_tower, 'hidden_size', 3072)
51
+
52
+ # load weights manually to avoid deepspeed issue
53
+ # encoder_ckpt = safetensors.torch.load_file("checkpoints/convnext-xxl-clip.safetensors", device="cpu")
54
+ # new_dict = OrderedDict()
55
+ # for k, v in encoder_ckpt.items():
56
+ # if "gamma" in k:
57
+ # k = k.replace("gamma", "weight")
58
+ # new_dict[k] = v
59
+ # encoder_ckpt = new_dict
60
+
61
+ # load_state_dict_into_model(model_to_load=self.vision_tower,
62
+ # state_dict=encoder_ckpt)
63
+
64
+ else:
65
+ raise NotImplementedError
66
+
67
+ if self.freeze_vision:
68
+ self.vision_tower.requires_grad_(False)
69
+
70
+ # Hardcode
71
+ for s in self.vision_tower.stages:
72
+ s.grad_checkpointing = True
73
+
74
+ if self.input_image_size is not None:
75
+ self.image_processor.size=self.input_image_size
76
+ self.image_processor.crop_size={
77
+ 'height':self.input_image_size,
78
+ 'width': self.input_image_size
79
+ }
80
+
81
+ self.is_loaded = True
82
+
83
+ def feature_select(self, image_forward_outs):
84
+ image_features = image_forward_outs[self.select_layer]
85
+ return image_features
86
+
87
+ def forward_features(self, x):
88
+ x = self.vision_tower.stem(x)
89
+ image_forward_out=[]
90
+ for blk in self.vision_tower.stages:
91
+ x = blk(x)
92
+ b,c,h,w=x.shape
93
+ image_forward_out.append(x.view(b,c,-1).transpose(1,2))
94
+ return image_forward_out
95
+
96
+ def forward(self, images):
97
+ if self.freeze_vision:
98
+ with torch.no_grad():
99
+ image_features = self._forward_images(images)
100
+ else:
101
+ image_features = self._forward_images(images)
102
+
103
+ return image_features
104
+
105
+ def _forward_images(self, images):
106
+
107
+ image_forward_outs = self.forward_features(images.to(device=self.device, dtype=self.dtype))
108
+ image_features = self.feature_select(image_forward_outs)
109
+
110
+ return image_features
111
+
112
+ @property
113
+ def dummy_feature(self):
114
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
115
+
116
+ @property
117
+ def dtype(self):
118
+ return next(self.vision_tower.parameters()).dtype
119
+
120
+ @property
121
+ def device(self):
122
+ return next(self.vision_tower.parameters()).device
123
+
124
+ @property
125
+ def config(self):
126
+ raise NotImplementedError
127
+ pass
128
+
129
+ @property
130
+ def num_attention_heads(self):
131
+ # as constant
132
+ return 16
133
+ @property
134
+ def num_layers(self):
135
+ # as constant
136
+ return 4
137
+ @property
138
+ def hidden_size(self):
139
+ return self.vision_tower.hidden_size
140
+
141
+ @property
142
+ def num_patches(self):
143
+ return (cfg['image_size'] // self.patch_embed.patch_size[0]) ** 2
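ConvNextVisionTower.forward_features above returns one flattened token map per ConvNeXt stage, and feature_select with select_layer = -1 keeps only the last (stride-32) stage, whose channel width matches the hidden_size of 3072 set in load_model. A shape sketch for a 576x576 input, assuming the usual convnext_xxlarge stage widths of 384/768/1536/3072 (an assumption about the timm model, not read from this commit):

# Illustrative only; widths and strides are assumptions about convnext_xxlarge.
stage_dims, stage_strides = [384, 768, 1536, 3072], [4, 8, 16, 32]
H = 576
for c, s in zip(stage_dims, stage_strides):
    print(c, (H // s) ** 2)   # last entry: 3072 channels, 18*18 = 324 tokens -> the selected features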
llava/model/multimodal_encoder/vision_models/__init__.py ADDED
File without changes
llava/model/multimodal_encoder/vision_models/convnext.py ADDED
@@ -0,0 +1,1109 @@
1
+ """ ConvNeXt
2
+
3
+ Papers:
4
+ * `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
5
+ @Article{liu2022convnet,
6
+ author = {Zhuang Liu and Hanzi Mao and Chao-Yuan Wu and Christoph Feichtenhofer and Trevor Darrell and Saining Xie},
7
+ title = {A ConvNet for the 2020s},
8
+ journal = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
9
+ year = {2022},
10
+ }
11
+
12
+ * `ConvNeXt-V2 - Co-designing and Scaling ConvNets with Masked Autoencoders` - https://arxiv.org/abs/2301.00808
13
+ @article{Woo2023ConvNeXtV2,
14
+ title={ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders},
15
+ author={Sanghyun Woo and Shoubhik Debnath and Ronghang Hu and Xinlei Chen and Zhuang Liu and In So Kweon and Saining Xie},
16
+ year={2023},
17
+ journal={arXiv preprint arXiv:2301.00808},
18
+ }
19
+
20
+ Original code and weights from:
21
+ * https://github.com/facebookresearch/ConvNeXt, original copyright below
22
+ * https://github.com/facebookresearch/ConvNeXt-V2, original copyright below
23
+
24
+ Model defs atto, femto, pico, nano and _ols / _hnf variants are timm originals.
25
+
26
+ Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
27
+ """
28
+ # ConvNeXt
29
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
30
+ # All rights reserved.
31
+ # This source code is licensed under the MIT license
32
+
33
+ # ConvNeXt-V2
34
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
35
+ # All rights reserved.
36
+ # This source code is licensed under the license found in the
37
+ # LICENSE file in the root directory of this source tree (Attribution-NonCommercial 4.0 International (CC BY-NC 4.0))
38
+ # No code was used directly from ConvNeXt-V2, however the weights are CC BY-NC 4.0 so beware if using commercially.
39
+
40
+ from collections import OrderedDict
41
+ from functools import partial
42
+ from typing import Callable, Optional, Tuple, Union
43
+
44
+ import torch
45
+ import torch.nn as nn
46
+
47
+ from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
48
+ from timm.layers import trunc_normal_, AvgPool2dSame, DropPath, Mlp, GlobalResponseNormMlp, \
49
+ LayerNorm2d, LayerNorm, create_conv2d, get_act_layer, make_divisible, to_ntuple
50
+ from timm.layers import NormMlpClassifierHead, ClassifierHead
51
+ from timm.models._builder import build_model_with_cfg
52
+ from timm.models._manipulate import named_apply, checkpoint_seq
53
+ from timm.models._registry import generate_default_cfgs, register_model, register_model_deprecations
54
+
55
+ __all__ = ['ConvNeXt'] # model_registry will add each entrypoint fn to this
56
+
57
+
58
+ class Downsample(nn.Module):
59
+
60
+ def __init__(self, in_chs, out_chs, stride=1, dilation=1):
61
+ super().__init__()
62
+ avg_stride = stride if dilation == 1 else 1
63
+ if stride > 1 or dilation > 1:
64
+ avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
65
+ self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
66
+ else:
67
+ self.pool = nn.Identity()
68
+
69
+ if in_chs != out_chs:
70
+ self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
71
+ else:
72
+ self.conv = nn.Identity()
73
+
74
+ def forward(self, x):
75
+ x = self.pool(x)
76
+ x = self.conv(x)
77
+ return x
78
+
79
+
80
+ class ConvNeXtBlock(nn.Module):
81
+ """ ConvNeXt Block
82
+ There are two equivalent implementations:
83
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
84
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
85
+
86
+ Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate
87
+ choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear
88
+ is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW.
89
+ """
90
+
91
+ def __init__(
92
+ self,
93
+ in_chs: int,
94
+ out_chs: Optional[int] = None,
95
+ kernel_size: int = 7,
96
+ stride: int = 1,
97
+ dilation: Union[int, Tuple[int, int]] = (1, 1),
98
+ mlp_ratio: float = 4,
99
+ conv_mlp: bool = False,
100
+ conv_bias: bool = True,
101
+ use_grn: bool = False,
102
+ ls_init_value: Optional[float] = 1e-6,
103
+ act_layer: Union[str, Callable] = 'gelu',
104
+ norm_layer: Optional[Callable] = None,
105
+ drop_path: float = 0.,
106
+ ):
107
+ """
108
+
109
+ Args:
110
+ in_chs: Block input channels.
111
+ out_chs: Block output channels (same as in_chs if None).
112
+ kernel_size: Depthwise convolution kernel size.
113
+ stride: Stride of depthwise convolution.
114
+ dilation: Tuple specifying input and output dilation of block.
115
+ mlp_ratio: MLP expansion ratio.
116
+ conv_mlp: Use 1x1 convolutions for MLP and a NCHW compatible norm layer if True.
117
+ conv_bias: Apply bias for all convolution (linear) layers.
118
+ use_grn: Use GlobalResponseNorm in MLP (from ConvNeXt-V2)
119
+ ls_init_value: Layer-scale init values, layer-scale applied if not None.
120
+ act_layer: Activation layer.
121
+ norm_layer: Normalization layer (defaults to LN if not specified).
122
+ drop_path: Stochastic depth probability.
123
+ """
124
+ super().__init__()
125
+ out_chs = out_chs or in_chs
126
+ dilation = to_ntuple(2)(dilation)
127
+ act_layer = get_act_layer(act_layer)
128
+ if not norm_layer:
129
+ norm_layer = LayerNorm2d if conv_mlp else LayerNorm
130
+ mlp_layer = partial(GlobalResponseNormMlp if use_grn else Mlp, use_conv=conv_mlp)
131
+ self.use_conv_mlp = conv_mlp
132
+ self.conv_dw = create_conv2d(
133
+ in_chs,
134
+ out_chs,
135
+ kernel_size=kernel_size,
136
+ stride=stride,
137
+ dilation=dilation[0],
138
+ depthwise=True,
139
+ bias=conv_bias,
140
+ )
141
+ self.norm = norm_layer(out_chs)
142
+ self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
143
+ self.weight = nn.Parameter(ls_init_value * torch.ones(out_chs)) if ls_init_value is not None else None
144
+ if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
145
+ self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation[0])
146
+ else:
147
+ self.shortcut = nn.Identity()
148
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
149
+
150
+ def forward(self, x):
151
+ shortcut = x
152
+ x = self.conv_dw(x)
153
+ if self.use_conv_mlp:
154
+ x = self.norm(x)
155
+ x = self.mlp(x)
156
+ else:
157
+ x = x.permute(0, 2, 3, 1)
158
+ x = self.norm(x)
159
+ x = self.mlp(x)
160
+ x = x.permute(0, 3, 1, 2)
161
+ if self.weight is not None:
162
+ x = x.mul(self.weight.reshape(1, -1, 1, 1))
163
+
164
+ x = self.drop_path(x) + self.shortcut(shortcut)
165
+ return x
166
+
167
+
168
+ class ConvNeXtStage(nn.Module):
169
+
170
+ def __init__(
171
+ self,
172
+ in_chs,
173
+ out_chs,
174
+ kernel_size=7,
175
+ stride=2,
176
+ depth=2,
177
+ dilation=(1, 1),
178
+ drop_path_rates=None,
179
+ ls_init_value=1.0,
180
+ conv_mlp=False,
181
+ conv_bias=True,
182
+ use_grn=False,
183
+ act_layer='gelu',
184
+ norm_layer=None,
185
+ norm_layer_cl=None
186
+ ):
187
+ super().__init__()
188
+ self.grad_checkpointing = False
189
+
190
+ if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
191
+ ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
192
+ pad = 'same' if dilation[1] > 1 else 0 # same padding needed if dilation used
193
+ self.downsample = nn.Sequential(
194
+ norm_layer(in_chs),
195
+ create_conv2d(
196
+ in_chs,
197
+ out_chs,
198
+ kernel_size=ds_ks,
199
+ stride=stride,
200
+ dilation=dilation[0],
201
+ padding=pad,
202
+ bias=conv_bias,
203
+ ),
204
+ )
205
+ in_chs = out_chs
206
+ else:
207
+ self.downsample = nn.Identity()
208
+
209
+ drop_path_rates = drop_path_rates or [0.] * depth
210
+ stage_blocks = []
211
+ for i in range(depth):
212
+ stage_blocks.append(ConvNeXtBlock(
213
+ in_chs=in_chs,
214
+ out_chs=out_chs,
215
+ kernel_size=kernel_size,
216
+ dilation=dilation[1],
217
+ drop_path=drop_path_rates[i],
218
+ ls_init_value=ls_init_value,
219
+ conv_mlp=conv_mlp,
220
+ conv_bias=conv_bias,
221
+ use_grn=use_grn,
222
+ act_layer=act_layer,
223
+ norm_layer=norm_layer if conv_mlp else norm_layer_cl,
224
+ ))
225
+ in_chs = out_chs
226
+ self.blocks = nn.Sequential(*stage_blocks)
227
+
228
+ def forward(self, x):
229
+ x = self.downsample(x)
230
+ if self.grad_checkpointing and self.training and not torch.jit.is_scripting():
231
+ x = checkpoint_seq(self.blocks, x)
232
+ else:
233
+ x = self.blocks(x)
234
+ return x
235
+
236
+
237
+ class ConvNeXt(nn.Module):
238
+ r""" ConvNeXt
239
+ A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
240
+ """
241
+
242
+ def __init__(
243
+ self,
244
+ in_chans: int = 3,
245
+ num_classes: int = 1000,
246
+ global_pool: str = 'avg',
247
+ output_stride: int = 32,
248
+ depths: Tuple[int, ...] = (3, 3, 9, 3),
249
+ dims: Tuple[int, ...] = (96, 192, 384, 768),
250
+ kernel_sizes: Union[int, Tuple[int, ...]] = 7,
251
+ ls_init_value: Optional[float] = 1e-6,
252
+ stem_type: str = 'patch',
253
+ patch_size: int = 4,
254
+ head_init_scale: float = 1.,
255
+ head_norm_first: bool = False,
256
+ head_hidden_size: Optional[int] = None,
257
+ conv_mlp: bool = False,
258
+ conv_bias: bool = True,
259
+ use_grn: bool = False,
260
+ act_layer: Union[str, Callable] = 'gelu',
261
+ norm_layer: Optional[Union[str, Callable]] = None,
262
+ norm_eps: Optional[float] = None,
263
+ drop_rate: float = 0.,
264
+ drop_path_rate: float = 0.,
265
+ ):
266
+ """
267
+ Args:
268
+ in_chans: Number of input image channels.
269
+ num_classes: Number of classes for classification head.
270
+ global_pool: Global pooling type.
271
+ output_stride: Output stride of network, one of (8, 16, 32).
272
+ depths: Number of blocks at each stage.
273
+ dims: Feature dimension at each stage.
274
+ kernel_sizes: Depthwise convolution kernel-sizes for each stage.
275
+ ls_init_value: Init value for Layer Scale, disabled if None.
276
+ stem_type: Type of stem.
277
+ patch_size: Stem patch size for patch stem.
278
+ head_init_scale: Init scaling value for classifier weights and biases.
279
+ head_norm_first: Apply normalization before global pool + head.
280
+ head_hidden_size: Size of MLP hidden layer in head if not None and head_norm_first == False.
281
+ conv_mlp: Use 1x1 conv in MLP, improves speed for small networks w/ chan last.
282
+ conv_bias: Use bias layers w/ all convolutions.
283
+ use_grn: Use Global Response Norm (ConvNeXt-V2) in MLP.
284
+ act_layer: Activation layer type.
285
+ norm_layer: Normalization layer type.
286
+ drop_rate: Head pre-classifier dropout rate.
287
+ drop_path_rate: Stochastic depth drop rate.
288
+ """
289
+ super().__init__()
290
+ assert output_stride in (8, 16, 32)
291
+ kernel_sizes = to_ntuple(4)(kernel_sizes)
292
+ if norm_layer is None:
293
+ norm_layer = LayerNorm2d
294
+ norm_layer_cl = norm_layer if conv_mlp else LayerNorm
295
+ if norm_eps is not None:
296
+ norm_layer = partial(norm_layer, eps=norm_eps)
297
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
298
+ else:
299
+ assert conv_mlp,\
300
+ 'If a norm_layer is specified, conv MLP must be used so all norm layers expect rank-4, channels-first input'
301
+ norm_layer_cl = norm_layer
302
+ if norm_eps is not None:
303
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
304
+
305
+ self.num_classes = num_classes
306
+ self.drop_rate = drop_rate
307
+ self.feature_info = []
308
+
309
+ assert stem_type in ('patch', 'overlap', 'overlap_tiered')
310
+ if stem_type == 'patch':
311
+ # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
312
+ self.stem = nn.Sequential(
313
+ nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias),
314
+ norm_layer(dims[0]),
315
+ )
316
+ stem_stride = patch_size
317
+ else:
318
+ mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0]
319
+ self.stem = nn.Sequential(
320
+ nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias),
321
+ nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias),
322
+ norm_layer(dims[0]),
323
+ )
324
+ stem_stride = 4
325
+
326
+ self.stages = nn.Sequential()
327
+ dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
328
+ stages = []
329
+ prev_chs = dims[0]
330
+ curr_stride = stem_stride
331
+ dilation = 1
332
+ # 4 feature resolution stages, each consisting of multiple residual blocks
333
+ for i in range(4):
334
+ stride = 2 if curr_stride == 2 or i > 0 else 1
335
+ if curr_stride >= output_stride and stride > 1:
336
+ dilation *= stride
337
+ stride = 1
338
+ curr_stride *= stride
339
+ first_dilation = 1 if dilation in (1, 2) else 2
340
+ out_chs = dims[i]
341
+ stages.append(ConvNeXtStage(
342
+ prev_chs,
343
+ out_chs,
344
+ kernel_size=kernel_sizes[i],
345
+ stride=stride,
346
+ dilation=(first_dilation, dilation),
347
+ depth=depths[i],
348
+ drop_path_rates=dp_rates[i],
349
+ ls_init_value=ls_init_value,
350
+ conv_mlp=conv_mlp,
351
+ conv_bias=conv_bias,
352
+ use_grn=use_grn,
353
+ act_layer=act_layer,
354
+ norm_layer=norm_layer,
355
+ norm_layer_cl=norm_layer_cl,
356
+ ))
357
+ prev_chs = out_chs
358
+ # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
359
+ self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{i}')]
360
+ self.stages = nn.Sequential(*stages)
361
+ self.num_features = prev_chs
362
+
363
+ # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets
364
+ # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
365
+ if head_norm_first:
366
+ assert not head_hidden_size
367
+ self.norm_pre = norm_layer(self.num_features)
368
+ self.head = ClassifierHead(
369
+ self.num_features,
370
+ num_classes,
371
+ pool_type=global_pool,
372
+ drop_rate=self.drop_rate,
373
+ )
374
+ else:
375
+ self.norm_pre = nn.Identity()
376
+ self.head = NormMlpClassifierHead(
377
+ self.num_features,
378
+ num_classes,
379
+ hidden_size=head_hidden_size,
380
+ pool_type=global_pool,
381
+ drop_rate=self.drop_rate,
382
+ norm_layer=norm_layer,
383
+ act_layer='gelu',
384
+ )
385
+ # named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
386
+
387
+ @torch.jit.ignore
388
+ def group_matcher(self, coarse=False):
389
+ return dict(
390
+ stem=r'^stem',
391
+ blocks=r'^stages\.(\d+)' if coarse else [
392
+ (r'^stages\.(\d+)\.downsample', (0,)), # blocks
393
+ (r'^stages\.(\d+)\.blocks\.(\d+)', None),
394
+ (r'^norm_pre', (99999,))
395
+ ]
396
+ )
397
+
398
+ @torch.jit.ignore
399
+ def set_grad_checkpointing(self, enable=True):
400
+ for s in self.stages:
401
+ s.grad_checkpointing = enable
402
+
403
+ @torch.jit.ignore
404
+ def get_classifier(self):
405
+ return self.head.fc
406
+
407
+ def reset_classifier(self, num_classes=0, global_pool=None):
408
+ self.head.reset(num_classes, global_pool)
409
+
410
+ def forward_features(self, x):
411
+ x = self.stem(x)
412
+ x = self.stages(x)
413
+ x = self.norm_pre(x)
414
+ return x
415
+
416
+ def forward_head(self, x, pre_logits: bool = False):
417
+ return self.head(x, pre_logits=True) if pre_logits else self.head(x)
418
+
419
+ def forward(self, x):
420
+ x = self.forward_features(x)
421
+ x = self.forward_head(x)
422
+ return x
423
+
424
+
425
+ def _init_weights(module, name=None, head_init_scale=1.0):
426
+ if isinstance(module, nn.Conv2d):
427
+ trunc_normal_(module.weight, std=.02)
428
+ if module.bias is not None:
429
+ nn.init.zeros_(module.bias)
430
+ elif isinstance(module, nn.Linear):
431
+ trunc_normal_(module.weight, std=.02)
432
+ nn.init.zeros_(module.bias)
433
+ if name and 'head.' in name:
434
+ module.weight.data.mul_(head_init_scale)
435
+ module.bias.data.mul_(head_init_scale)
436
+
437
+
438
+ def checkpoint_filter_fn(state_dict, model):
439
+ """ Remap FB checkpoints -> timm """
440
+ if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
442
+ out_dict = {k.replace('gamma', 'weight'): v for k, v in state_dict.items()}
443
+ return out_dict # non-FB checkpoint
444
+ if 'model' in state_dict:
445
+ state_dict = state_dict['model']
446
+
447
+ out_dict = {}
448
+ if 'visual.trunk.stem.0.weight' in state_dict:
449
+ out_dict = {k.replace('visual.trunk.', '').replace('gamma', 'weight'): v for k, v in state_dict.items() if
450
+ k.startswith('visual.trunk.')}
451
+
452
+ if 'visual.head.proj.weight' in state_dict:
453
+ out_dict['head.fc.weight'] = state_dict['visual.head.proj.weight']
454
+ out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.proj.weight'].shape[0])
455
+ elif 'visual.head.mlp.fc1.weight' in state_dict:
456
+ out_dict['head.pre_logits.fc.weight'] = state_dict['visual.head.mlp.fc1.weight']
457
+ out_dict['head.pre_logits.fc.bias'] = state_dict['visual.head.mlp.fc1.bias']
458
+ out_dict['head.fc.weight'] = state_dict['visual.head.mlp.fc2.weight']
459
+ out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.mlp.fc2.weight'].shape[0])
460
+ return out_dict
461
+
462
+ import re
463
+ for k, v in state_dict.items():
464
+ k = k.replace('downsample_layers.0.', 'stem.')
465
+ k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
466
+ k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
467
+ k = k.replace('dwconv', 'conv_dw')
468
+ k = k.replace('pwconv', 'mlp.fc')
469
+ if 'grn' in k:
470
+ k = k.replace('grn.beta', 'mlp.grn.bias')
471
+ k = k.replace('grn.gamma', 'mlp.grn.weight')
472
+ v = v.reshape(v.shape[-1])
473
+ k = k.replace('head.', 'head.fc.')
474
+ if k.startswith('norm.'):
475
+ k = k.replace('norm', 'head.norm')
476
+ if v.ndim == 2 and 'head' not in k:
477
+ model_shape = model.state_dict()[k].shape
478
+ v = v.reshape(model_shape)
479
+ k = k.replace('gamma', 'weight')
480
+ out_dict[k] = v
481
+
482
+ return out_dict
483
+
484
+
485
+ def _create_convnext(variant, pretrained=False, **kwargs):
486
+ if kwargs.get('pretrained_cfg', '') == 'fcmae':
487
+ # NOTE fcmae pretrained weights have no classifier or final norm-layer (`head.norm`)
488
+ # This is workaround loading with num_classes=0 w/o removing norm-layer.
489
+ kwargs.setdefault('pretrained_strict', False)
490
+
491
+ model = build_model_with_cfg(
492
+ ConvNeXt, variant, pretrained,
493
+ pretrained_filter_fn=checkpoint_filter_fn,
494
+ feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True),
495
+ **kwargs)
496
+ return model
497
+
498
+
499
+ def _cfg(url='', **kwargs):
500
+ return {
501
+ 'url': url,
502
+ 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
503
+ 'crop_pct': 0.875, 'interpolation': 'bicubic',
504
+ 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
505
+ 'first_conv': 'stem.0', 'classifier': 'head.fc',
506
+ **kwargs
507
+ }
508
+
509
+
510
+ def _cfgv2(url='', **kwargs):
511
+ return {
512
+ 'url': url,
513
+ 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
514
+ 'crop_pct': 0.875, 'interpolation': 'bicubic',
515
+ 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
516
+ 'first_conv': 'stem.0', 'classifier': 'head.fc',
517
+ 'license': 'cc-by-nc-4.0', 'paper_ids': 'arXiv:2301.00808',
518
+ 'paper_name': 'ConvNeXt-V2: Co-designing and Scaling ConvNets with Masked Autoencoders',
519
+ 'origin_url': 'https://github.com/facebookresearch/ConvNeXt-V2',
520
+ **kwargs
521
+ }
522
+
523
+
524
+ default_cfgs = generate_default_cfgs({
525
+ # timm specific variants
526
+ 'convnext_tiny.in12k_ft_in1k': _cfg(
527
+ hf_hub_id='timm/',
528
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
529
+ 'convnext_small.in12k_ft_in1k': _cfg(
530
+ hf_hub_id='timm/',
531
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
532
+
533
+ 'convnext_atto.d2_in1k': _cfg(
534
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_d2-01bb0f51.pth',
535
+ hf_hub_id='timm/',
536
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
537
+ 'convnext_atto_ols.a2_in1k': _cfg(
538
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_ols_a2-78d1c8f3.pth',
539
+ hf_hub_id='timm/',
540
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
541
+ 'convnext_femto.d1_in1k': _cfg(
542
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_d1-d71d5b4c.pth',
543
+ hf_hub_id='timm/',
544
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
545
+ 'convnext_femto_ols.d1_in1k': _cfg(
546
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_ols_d1-246bf2ed.pth',
547
+ hf_hub_id='timm/',
548
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
549
+ 'convnext_pico.d1_in1k': _cfg(
550
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_d1-10ad7f0d.pth',
551
+ hf_hub_id='timm/',
552
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
553
+ 'convnext_pico_ols.d1_in1k': _cfg(
554
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_ols_d1-611f0ca7.pth',
555
+ hf_hub_id='timm/',
556
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
557
+ 'convnext_nano.in12k_ft_in1k': _cfg(
558
+ hf_hub_id='timm/',
559
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
560
+ 'convnext_nano.d1h_in1k': _cfg(
561
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_d1h-7eb4bdea.pth',
562
+ hf_hub_id='timm/',
563
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
564
+ 'convnext_nano_ols.d1h_in1k': _cfg(
565
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_ols_d1h-ae424a9a.pth',
566
+ hf_hub_id='timm/',
567
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
568
+ 'convnext_tiny_hnf.a2h_in1k': _cfg(
569
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_tiny_hnf_a2h-ab7e9df2.pth',
570
+ hf_hub_id='timm/',
571
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
572
+
573
+ 'convnext_tiny.in12k_ft_in1k_384': _cfg(
574
+ hf_hub_id='timm/',
575
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
576
+ 'convnext_small.in12k_ft_in1k_384': _cfg(
577
+ hf_hub_id='timm/',
578
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
579
+
580
+ 'convnext_nano.in12k': _cfg(
581
+ hf_hub_id='timm/',
582
+ crop_pct=0.95, num_classes=11821),
583
+ 'convnext_tiny.in12k': _cfg(
584
+ hf_hub_id='timm/',
585
+ crop_pct=0.95, num_classes=11821),
586
+ 'convnext_small.in12k': _cfg(
587
+ hf_hub_id='timm/',
588
+ crop_pct=0.95, num_classes=11821),
589
+
590
+ 'convnext_tiny.fb_in22k_ft_in1k': _cfg(
591
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_224.pth',
592
+ hf_hub_id='timm/',
593
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
594
+ 'convnext_small.fb_in22k_ft_in1k': _cfg(
595
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_224.pth',
596
+ hf_hub_id='timm/',
597
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
598
+ 'convnext_base.fb_in22k_ft_in1k': _cfg(
599
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth',
600
+ hf_hub_id='timm/',
601
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
602
+ 'convnext_large.fb_in22k_ft_in1k': _cfg(
603
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth',
604
+ hf_hub_id='timm/',
605
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
606
+ 'convnext_xlarge.fb_in22k_ft_in1k': _cfg(
607
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth',
608
+ hf_hub_id='timm/',
609
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
610
+
611
+ 'convnext_tiny.fb_in1k': _cfg(
612
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
613
+ hf_hub_id='timm/',
614
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
615
+ 'convnext_small.fb_in1k': _cfg(
616
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
617
+ hf_hub_id='timm/',
618
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
619
+ 'convnext_base.fb_in1k': _cfg(
620
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
621
+ hf_hub_id='timm/',
622
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
623
+ 'convnext_large.fb_in1k': _cfg(
624
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
625
+ hf_hub_id='timm/',
626
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
627
+
628
+ 'convnext_tiny.fb_in22k_ft_in1k_384': _cfg(
629
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_384.pth',
630
+ hf_hub_id='timm/',
631
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
632
+ 'convnext_small.fb_in22k_ft_in1k_384': _cfg(
633
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_384.pth',
634
+ hf_hub_id='timm/',
635
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
636
+ 'convnext_base.fb_in22k_ft_in1k_384': _cfg(
637
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth',
638
+ hf_hub_id='timm/',
639
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
640
+ 'convnext_large.fb_in22k_ft_in1k_384': _cfg(
641
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth',
642
+ hf_hub_id='timm/',
643
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
644
+ 'convnext_xlarge.fb_in22k_ft_in1k_384': _cfg(
645
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth',
646
+ hf_hub_id='timm/',
647
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
648
+
649
+ 'convnext_tiny.fb_in22k': _cfg(
650
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth",
651
+ hf_hub_id='timm/',
652
+ num_classes=21841),
653
+ 'convnext_small.fb_in22k': _cfg(
654
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth",
655
+ hf_hub_id='timm/',
656
+ num_classes=21841),
657
+ 'convnext_base.fb_in22k': _cfg(
658
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth",
659
+ hf_hub_id='timm/',
660
+ num_classes=21841),
661
+ 'convnext_large.fb_in22k': _cfg(
662
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth",
663
+ hf_hub_id='timm/',
664
+ num_classes=21841),
665
+ 'convnext_xlarge.fb_in22k': _cfg(
666
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth",
667
+ hf_hub_id='timm/',
668
+ num_classes=21841),
669
+
670
+ 'convnextv2_nano.fcmae_ft_in22k_in1k': _cfgv2(
671
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_224_ema.pt',
672
+ hf_hub_id='timm/',
673
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
674
+ 'convnextv2_nano.fcmae_ft_in22k_in1k_384': _cfgv2(
675
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_384_ema.pt',
676
+ hf_hub_id='timm/',
677
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
678
+ 'convnextv2_tiny.fcmae_ft_in22k_in1k': _cfgv2(
679
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_224_ema.pt",
680
+ hf_hub_id='timm/',
681
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
682
+ 'convnextv2_tiny.fcmae_ft_in22k_in1k_384': _cfgv2(
683
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_384_ema.pt",
684
+ hf_hub_id='timm/',
685
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
686
+ 'convnextv2_base.fcmae_ft_in22k_in1k': _cfgv2(
687
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_224_ema.pt",
688
+ hf_hub_id='timm/',
689
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
690
+ 'convnextv2_base.fcmae_ft_in22k_in1k_384': _cfgv2(
691
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_384_ema.pt",
692
+ hf_hub_id='timm/',
693
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
694
+ 'convnextv2_large.fcmae_ft_in22k_in1k': _cfgv2(
695
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_224_ema.pt",
696
+ hf_hub_id='timm/',
697
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
698
+ 'convnextv2_large.fcmae_ft_in22k_in1k_384': _cfgv2(
699
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_384_ema.pt",
700
+ hf_hub_id='timm/',
701
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
702
+ 'convnextv2_huge.fcmae_ft_in22k_in1k_384': _cfgv2(
703
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_384_ema.pt",
704
+ hf_hub_id='timm/',
705
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
706
+ 'convnextv2_huge.fcmae_ft_in22k_in1k_512': _cfgv2(
707
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_512_ema.pt",
708
+ hf_hub_id='timm/',
709
+ input_size=(3, 512, 512), pool_size=(15, 15), crop_pct=1.0, crop_mode='squash'),
710
+
711
+ 'convnextv2_atto.fcmae_ft_in1k': _cfgv2(
712
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt',
713
+ hf_hub_id='timm/',
714
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
715
+ 'convnextv2_femto.fcmae_ft_in1k': _cfgv2(
716
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_femto_1k_224_ema.pt',
717
+ hf_hub_id='timm/',
718
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
719
+ 'convnextv2_pico.fcmae_ft_in1k': _cfgv2(
720
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_pico_1k_224_ema.pt',
721
+ hf_hub_id='timm/',
722
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
723
+ 'convnextv2_nano.fcmae_ft_in1k': _cfgv2(
724
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_nano_1k_224_ema.pt',
725
+ hf_hub_id='timm/',
726
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
727
+ 'convnextv2_tiny.fcmae_ft_in1k': _cfgv2(
728
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_tiny_1k_224_ema.pt",
729
+ hf_hub_id='timm/',
730
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
731
+ 'convnextv2_base.fcmae_ft_in1k': _cfgv2(
732
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_base_1k_224_ema.pt",
733
+ hf_hub_id='timm/',
734
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
735
+ 'convnextv2_large.fcmae_ft_in1k': _cfgv2(
736
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_large_1k_224_ema.pt",
737
+ hf_hub_id='timm/',
738
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
739
+ 'convnextv2_huge.fcmae_ft_in1k': _cfgv2(
740
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_huge_1k_224_ema.pt",
741
+ hf_hub_id='timm/',
742
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
743
+
744
+ 'convnextv2_atto.fcmae': _cfgv2(
745
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_atto_1k_224_fcmae.pt',
746
+ hf_hub_id='timm/',
747
+ num_classes=0),
748
+ 'convnextv2_femto.fcmae': _cfgv2(
749
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_femto_1k_224_fcmae.pt',
750
+ hf_hub_id='timm/',
751
+ num_classes=0),
752
+ 'convnextv2_pico.fcmae': _cfgv2(
753
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_pico_1k_224_fcmae.pt',
754
+ hf_hub_id='timm/',
755
+ num_classes=0),
756
+ 'convnextv2_nano.fcmae': _cfgv2(
757
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_nano_1k_224_fcmae.pt',
758
+ hf_hub_id='timm/',
759
+ num_classes=0),
760
+ 'convnextv2_tiny.fcmae': _cfgv2(
761
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_tiny_1k_224_fcmae.pt",
762
+ hf_hub_id='timm/',
763
+ num_classes=0),
764
+ 'convnextv2_base.fcmae': _cfgv2(
765
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_base_1k_224_fcmae.pt",
766
+ hf_hub_id='timm/',
767
+ num_classes=0),
768
+ 'convnextv2_large.fcmae': _cfgv2(
769
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_large_1k_224_fcmae.pt",
770
+ hf_hub_id='timm/',
771
+ num_classes=0),
772
+ 'convnextv2_huge.fcmae': _cfgv2(
773
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_huge_1k_224_fcmae.pt",
774
+ hf_hub_id='timm/',
775
+ num_classes=0),
776
+
777
+ 'convnextv2_small.untrained': _cfg(),
778
+
779
+ # CLIP weights, fine-tuned on in1k or in12k + in1k
780
+ 'convnext_base.clip_laion2b_augreg_ft_in12k_in1k': _cfg(
781
+ hf_hub_id='timm/',
782
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
783
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
784
+ 'convnext_base.clip_laion2b_augreg_ft_in12k_in1k_384': _cfg(
785
+ hf_hub_id='timm/',
786
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
787
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
788
+ 'convnext_large_mlp.clip_laion2b_soup_ft_in12k_in1k_320': _cfg(
789
+ hf_hub_id='timm/',
790
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
791
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0),
792
+ 'convnext_large_mlp.clip_laion2b_soup_ft_in12k_in1k_384': _cfg(
793
+ hf_hub_id='timm/',
794
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
795
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
796
+
797
+ 'convnext_base.clip_laion2b_augreg_ft_in1k': _cfg(
798
+ hf_hub_id='timm/',
799
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
800
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
801
+ 'convnext_base.clip_laiona_augreg_ft_in1k_384': _cfg(
802
+ hf_hub_id='timm/',
803
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
804
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
805
+ 'convnext_large_mlp.clip_laion2b_augreg_ft_in1k': _cfg(
806
+ hf_hub_id='timm/',
807
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
808
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0
809
+ ),
810
+ 'convnext_large_mlp.clip_laion2b_augreg_ft_in1k_384': _cfg(
811
+ hf_hub_id='timm/',
812
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
813
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'
814
+ ),
815
+ 'convnext_xxlarge.clip_laion2b_soup_ft_in1k': _cfg(
816
+ hf_hub_id='timm/',
817
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
818
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
819
+
820
+ 'convnext_base.clip_laion2b_augreg_ft_in12k': _cfg(
821
+ hf_hub_id='timm/',
822
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
823
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
824
+ 'convnext_large_mlp.clip_laion2b_soup_ft_in12k_320': _cfg(
825
+ hf_hub_id='timm/',
826
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
827
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0),
828
+ 'convnext_large_mlp.clip_laion2b_augreg_ft_in12k_384': _cfg(
829
+ hf_hub_id='timm/',
830
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
831
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
832
+ 'convnext_large_mlp.clip_laion2b_soup_ft_in12k_384': _cfg(
833
+ hf_hub_id='timm/',
834
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
835
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
836
+ 'convnext_xxlarge.clip_laion2b_soup_ft_in12k': _cfg(
837
+ hf_hub_id='timm/',
838
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
839
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
840
+
841
+ # CLIP original image tower weights
842
+ 'convnext_base.clip_laion2b': _cfg(
843
+ hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K',
844
+ hf_hub_filename='open_clip_pytorch_model.bin',
845
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
846
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
847
+ 'convnext_base.clip_laion2b_augreg': _cfg(
848
+ hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg',
849
+ hf_hub_filename='open_clip_pytorch_model.bin',
850
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
851
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
852
+ 'convnext_base.clip_laiona': _cfg(
853
+ hf_hub_id='laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K',
854
+ hf_hub_filename='open_clip_pytorch_model.bin',
855
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
856
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
857
+ 'convnext_base.clip_laiona_320': _cfg(
858
+ hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K',
859
+ hf_hub_filename='open_clip_pytorch_model.bin',
860
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
861
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640),
862
+ 'convnext_base.clip_laiona_augreg_320': _cfg(
863
+ hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg',
864
+ hf_hub_filename='open_clip_pytorch_model.bin',
865
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
866
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640),
867
+ 'convnext_large_mlp.clip_laion2b_augreg': _cfg(
868
+ hf_hub_id='laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg',
869
+ hf_hub_filename='open_clip_pytorch_model.bin',
870
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
871
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=768),
872
+ 'convnext_large_mlp.clip_laion2b_ft_320': _cfg(
873
+ hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft',
874
+ hf_hub_filename='open_clip_pytorch_model.bin',
875
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
876
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768),
877
+ 'convnext_large_mlp.clip_laion2b_ft_soup_320': _cfg(
878
+ hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup',
879
+ hf_hub_filename='open_clip_pytorch_model.bin',
880
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
881
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768),
882
+ 'convnext_xxlarge.clip_laion2b_soup': _cfg(
883
+ hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup',
884
+ hf_hub_filename='open_clip_pytorch_model.bin',
885
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
886
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
887
+ 'convnext_xxlarge.clip_laion2b_rewind': _cfg(
888
+ hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind',
889
+ hf_hub_filename='open_clip_pytorch_model.bin',
890
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
891
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
892
+ })
893
+
894
+
895
+ # @register_model
896
+ # def convnext_atto(pretrained=False, **kwargs) -> ConvNeXt:
897
+ # # timm femto variant (NOTE: still tweaking depths, will vary between 3-4M param, current is 3.7M
898
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(40, 80, 160, 320), conv_mlp=True)
899
+ # model = _create_convnext('convnext_atto', pretrained=pretrained, **dict(model_args, **kwargs))
900
+ # return model
901
+
902
+
903
+ # @register_model
904
+ # def convnext_atto_ols(pretrained=False, **kwargs) -> ConvNeXt:
905
+ # # timm femto variant with overlapping 3x3 conv stem, wider than non-ols femto above, current param count 3.7M
906
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(40, 80, 160, 320), conv_mlp=True, stem_type='overlap_tiered')
907
+ # model = _create_convnext('convnext_atto_ols', pretrained=pretrained, **dict(model_args, **kwargs))
908
+ # return model
909
+
910
+
911
+ # @register_model
912
+ # def convnext_femto(pretrained=False, **kwargs) -> ConvNeXt:
913
+ # # timm femto variant
914
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(48, 96, 192, 384), conv_mlp=True)
915
+ # model = _create_convnext('convnext_femto', pretrained=pretrained, **dict(model_args, **kwargs))
916
+ # return model
917
+
918
+
919
+ # @register_model
920
+ # def convnext_femto_ols(pretrained=False, **kwargs) -> ConvNeXt:
921
+ # # timm femto variant
922
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(48, 96, 192, 384), conv_mlp=True, stem_type='overlap_tiered')
923
+ # model = _create_convnext('convnext_femto_ols', pretrained=pretrained, **dict(model_args, **kwargs))
924
+ # return model
925
+
926
+
927
+ # @register_model
928
+ # def convnext_pico(pretrained=False, **kwargs) -> ConvNeXt:
929
+ # # timm pico variant
930
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(64, 128, 256, 512), conv_mlp=True)
931
+ # model = _create_convnext('convnext_pico', pretrained=pretrained, **dict(model_args, **kwargs))
932
+ # return model
933
+
934
+
935
+ # @register_model
936
+ # def convnext_pico_ols(pretrained=False, **kwargs) -> ConvNeXt:
937
+ # # timm nano variant with overlapping 3x3 conv stem
938
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(64, 128, 256, 512), conv_mlp=True, stem_type='overlap_tiered')
939
+ # model = _create_convnext('convnext_pico_ols', pretrained=pretrained, **dict(model_args, **kwargs))
940
+ # return model
941
+
942
+
943
+ # @register_model
944
+ # def convnext_nano(pretrained=False, **kwargs) -> ConvNeXt:
945
+ # # timm nano variant with standard stem and head
946
+ # model_args = dict(depths=(2, 2, 8, 2), dims=(80, 160, 320, 640), conv_mlp=True)
947
+ # model = _create_convnext('convnext_nano', pretrained=pretrained, **dict(model_args, **kwargs))
948
+ # return model
949
+
950
+
951
+ # @register_model
952
+ # def convnext_nano_ols(pretrained=False, **kwargs) -> ConvNeXt:
953
+ # # experimental nano variant with overlapping conv stem
954
+ # model_args = dict(depths=(2, 2, 8, 2), dims=(80, 160, 320, 640), conv_mlp=True, stem_type='overlap')
955
+ # model = _create_convnext('convnext_nano_ols', pretrained=pretrained, **dict(model_args, **kwargs))
956
+ # return model
957
+
958
+
959
+ # @register_model
960
+ # def convnext_tiny_hnf(pretrained=False, **kwargs) -> ConvNeXt:
961
+ # # experimental tiny variant with norm before pooling in head (head norm first)
962
+ # model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), head_norm_first=True, conv_mlp=True)
963
+ # model = _create_convnext('convnext_tiny_hnf', pretrained=pretrained, **dict(model_args, **kwargs))
964
+ # return model
965
+
966
+
967
+ # @register_model
968
+ # def convnext_tiny(pretrained=False, **kwargs) -> ConvNeXt:
969
+ # model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768))
970
+ # model = _create_convnext('convnext_tiny', pretrained=pretrained, **dict(model_args, **kwargs))
971
+ # return model
972
+
973
+
974
+ # @register_model
975
+ # def convnext_small(pretrained=False, **kwargs) -> ConvNeXt:
976
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768])
977
+ # model = _create_convnext('convnext_small', pretrained=pretrained, **dict(model_args, **kwargs))
978
+ # return model
979
+
980
+ # @register_model
981
+ # def convnext_base_clip(pretrained='', **kwargs) -> ConvNeXt:
982
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024])
983
+ # model = _create_convnext(pretrained, pretrained=True, **dict(model_args, **kwargs))
984
+ # return model
985
+
986
+ # @register_model
987
+ # def convnext_base(pretrained=False, **kwargs) -> ConvNeXt:
988
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024])
989
+ # model = _create_convnext('convnext_base', pretrained=pretrained, **dict(model_args, **kwargs))
990
+ # return model
991
+
992
+
993
+ # @register_model
994
+ # def convnext_large(pretrained=False, **kwargs) -> ConvNeXt:
995
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536])
996
+ # model = _create_convnext('convnext_large', pretrained=pretrained, **dict(model_args, **kwargs))
997
+ # return model
998
+
999
+
1000
+ # @register_model
1001
+ # def convnext_large_mlp(pretrained=False, **kwargs) -> ConvNeXt:
1002
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], head_hidden_size=1536)
1003
+ # model = _create_convnext('convnext_large_mlp', pretrained=pretrained, **dict(model_args, **kwargs))
1004
+ # return model
1005
+
1006
+
1007
+ # @register_model
1008
+ # def convnext_xlarge(pretrained=False, **kwargs) -> ConvNeXt:
1009
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048])
1010
+ # model = _create_convnext('convnext_xlarge', pretrained=pretrained, **dict(model_args, **kwargs))
1011
+ # return model
1012
+
1013
+
1014
+ # @register_model
1015
+ def convnext_xxlarge(pretrained=False, **kwargs) -> ConvNeXt:
1016
+ model_args = dict(depths=[3, 4, 30, 3], dims=[384, 768, 1536, 3072], norm_eps=kwargs.pop('norm_eps', 1e-5))
1017
+ # model = _create_convnext('convnext_xxlarge', pretrained=pretrained, **dict(model_args, **kwargs))
1018
+ model = _create_convnext('convnext_xxlarge', pretrained=False, **dict(model_args, **kwargs))
1019
+ return model
1020
+
1021
+
1022
+ # @register_model
1023
+ # def convnextv2_atto(pretrained=False, **kwargs) -> ConvNeXt:
1024
+ # # timm femto variant (NOTE: still tweaking depths, will vary between 3-4M param, current is 3.7M
1025
+ # model_args = dict(
1026
+ # depths=(2, 2, 6, 2), dims=(40, 80, 160, 320), use_grn=True, ls_init_value=None, conv_mlp=True)
1027
+ # model = _create_convnext('convnextv2_atto', pretrained=pretrained, **dict(model_args, **kwargs))
1028
+ # return model
1029
+
1030
+
1031
+ # @register_model
1032
+ # def convnextv2_femto(pretrained=False, **kwargs) -> ConvNeXt:
1033
+ # # timm femto variant
1034
+ # model_args = dict(
1035
+ # depths=(2, 2, 6, 2), dims=(48, 96, 192, 384), use_grn=True, ls_init_value=None, conv_mlp=True)
1036
+ # model = _create_convnext('convnextv2_femto', pretrained=pretrained, **dict(model_args, **kwargs))
1037
+ # return model
1038
+
1039
+
1040
+ # @register_model
1041
+ # def convnextv2_pico(pretrained=False, **kwargs) -> ConvNeXt:
1042
+ # # timm pico variant
1043
+ # model_args = dict(
1044
+ # depths=(2, 2, 6, 2), dims=(64, 128, 256, 512), use_grn=True, ls_init_value=None, conv_mlp=True)
1045
+ # model = _create_convnext('convnextv2_pico', pretrained=pretrained, **dict(model_args, **kwargs))
1046
+ # return model
1047
+
1048
+
1049
+ # @register_model
1050
+ # def convnextv2_nano(pretrained=False, **kwargs) -> ConvNeXt:
1051
+ # # timm nano variant with standard stem and head
1052
+ # model_args = dict(
1053
+ # depths=(2, 2, 8, 2), dims=(80, 160, 320, 640), use_grn=True, ls_init_value=None, conv_mlp=True)
1054
+ # model = _create_convnext('convnextv2_nano', pretrained=pretrained, **dict(model_args, **kwargs))
1055
+ # return model
1056
+
1057
+
1058
+ # @register_model
1059
+ # def convnextv2_tiny(pretrained=False, **kwargs) -> ConvNeXt:
1060
+ # model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), use_grn=True, ls_init_value=None)
1061
+ # model = _create_convnext('convnextv2_tiny', pretrained=pretrained, **dict(model_args, **kwargs))
1062
+ # return model
1063
+
1064
+
1065
+ # @register_model
1066
+ # def convnextv2_small(pretrained=False, **kwargs) -> ConvNeXt:
1067
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], use_grn=True, ls_init_value=None)
1068
+ # model = _create_convnext('convnextv2_small', pretrained=pretrained, **dict(model_args, **kwargs))
1069
+ # return model
1070
+
1071
+
1072
+ # @register_model
1073
+ # def convnextv2_base(pretrained=False, **kwargs) -> ConvNeXt:
1074
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], use_grn=True, ls_init_value=None)
1075
+ # model = _create_convnext('convnextv2_base', pretrained=pretrained, **dict(model_args, **kwargs))
1076
+ # return model
1077
+
1078
+
1079
+ # @register_model
1080
+ # def convnextv2_large(pretrained=False, **kwargs) -> ConvNeXt:
1081
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], use_grn=True, ls_init_value=None)
1082
+ # model = _create_convnext('convnextv2_large', pretrained=pretrained, **dict(model_args, **kwargs))
1083
+ # return model
1084
+
1085
+
1086
+ # @register_model
1087
+ # def convnextv2_huge(pretrained=False, **kwargs) -> ConvNeXt:
1088
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[352, 704, 1408, 2816], use_grn=True, ls_init_value=None)
1089
+ # model = _create_convnext('convnextv2_huge', pretrained=pretrained, **dict(model_args, **kwargs))
1090
+ # return model
1091
+
1092
+
1093
+ # register_model_deprecations(__name__, {
1094
+ # 'convnext_tiny_in22ft1k': 'convnext_tiny.fb_in22k_ft_in1k',
1095
+ # 'convnext_small_in22ft1k': 'convnext_small.fb_in22k_ft_in1k',
1096
+ # 'convnext_base_in22ft1k': 'convnext_base.fb_in22k_ft_in1k',
1097
+ # 'convnext_large_in22ft1k': 'convnext_large.fb_in22k_ft_in1k',
1098
+ # 'convnext_xlarge_in22ft1k': 'convnext_xlarge.fb_in22k_ft_in1k',
1099
+ # 'convnext_tiny_384_in22ft1k': 'convnext_tiny.fb_in22k_ft_in1k_384',
1100
+ # 'convnext_small_384_in22ft1k': 'convnext_small.fb_in22k_ft_in1k_384',
1101
+ # 'convnext_base_384_in22ft1k': 'convnext_base.fb_in22k_ft_in1k_384',
1102
+ # 'convnext_large_384_in22ft1k': 'convnext_large.fb_in22k_ft_in1k_384',
1103
+ # 'convnext_xlarge_384_in22ft1k': 'convnext_xlarge.fb_in22k_ft_in1k_384',
1104
+ # 'convnext_tiny_in22k': 'convnext_tiny.fb_in22k',
1105
+ # 'convnext_small_in22k': 'convnext_small.fb_in22k',
1106
+ # 'convnext_base_in22k': 'convnext_base.fb_in22k',
1107
+ # 'convnext_large_in22k': 'convnext_large.fb_in22k',
1108
+ # 'convnext_xlarge_in22k': 'convnext_xlarge.fb_in22k',
1109
+ # })
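A small sanity check of checkpoint_filter_fn above (illustrative only, not part of this commit): for a non-FB / OpenCLIP-style checkpoint that already contains norm_pre.weight, the function simply renames legacy 'gamma' layer-scale parameters to 'weight', the same remapping the commented-out manual loader in the vision tower performs.

# Illustrative only. The early-return branch of checkpoint_filter_fn never touches `model`,
# so None is passed to avoid instantiating the very large network just for this check.
import torch

fake_ckpt = {
    'norm_pre.weight': torch.ones(3072),
    'stages.0.blocks.0.gamma': torch.ones(384),   # legacy layer-scale parameter name
}
remapped = checkpoint_filter_fn(fake_ckpt, model=None)
print(sorted(remapped))   # ['norm_pre.weight', 'stages.0.blocks.0.weight']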
llava/model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,53 @@
1
+ # Based on https://github.com/haotian-liu/LLaVA.
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import re
6
+
7
+ class IdentityMap(nn.Module):
8
+ def __init__(self):
9
+ super().__init__()
10
+
11
+ def forward(self, x, *args, **kwargs):
12
+ return x
13
+
14
+ @property
15
+ def config(self):
16
+ return {"mm_projector_type": 'identity'}
17
+
18
+
19
+ class SimpleResBlock(nn.Module):
20
+ def __init__(self, channels):
21
+ super().__init__()
22
+ self.pre_norm = nn.LayerNorm(channels)
23
+
24
+ self.proj = nn.Sequential(
25
+ nn.Linear(channels, channels),
26
+ nn.GELU(),
27
+ nn.Linear(channels, channels)
28
+ )
29
+ def forward(self, x):
30
+ x = self.pre_norm(x)
31
+ return x + self.proj(x)
32
+
33
+
34
+ def build_vision_projector(config, delay_load=False, fpn_input_dim=[], **kwargs):
35
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
36
+ # if getattr(config, 'mm_use_4_vision_tokens', False):
37
+ # mm_hidden_size = config.mm_hidden_size * 4
38
+ # else:
39
+ mm_hidden_size = config.mm_hidden_size
40
+ if projector_type == 'linear':
41
+ return nn.Linear(mm_hidden_size, config.hidden_size)
42
+ mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
43
+ if mlp_gelu_match:
44
+ mlp_depth = int(mlp_gelu_match.group(1))
45
+ modules = [nn.Linear(mm_hidden_size, config.hidden_size)]
46
+ for _ in range(1, mlp_depth):
47
+ modules.append(nn.GELU())
48
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
49
+ return nn.Sequential(*modules)
50
+ if projector_type == 'identity':
51
+ return IdentityMap()
52
+
53
+ raise ValueError(f'Unknown projector type: {projector_type}')
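For reference, a minimal sketch of how build_vision_projector resolves the mlpNx_gelu pattern (illustrative only, not part of this commit; SimpleNamespace stands in for the real config, and the 4096 LLM hidden size is an assumption):

# Illustrative only -- attribute names follow what build_vision_projector reads.
from types import SimpleNamespace

cfg = SimpleNamespace(mm_projector_type='mlp2x_gelu', mm_hidden_size=3072, hidden_size=4096)
projector = build_vision_projector(cfg)
print(projector)   # Sequential(Linear(3072->4096), GELU(), Linear(4096->4096))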
llava/train/llava_trainer.py ADDED
@@ -0,0 +1,321 @@
1
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
2
+
3
+ import os
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from torch.utils.data import Sampler
8
+
9
+ from transformers import Trainer
10
+ from transformers.trainer import (
11
+ is_sagemaker_mp_enabled,
12
+ get_parameter_names,
13
+ has_length,
14
+ ALL_LAYERNORM_LAYERS,
15
+ logger,
16
+ )
17
+ from typing import List, Optional
18
+
19
+
20
+ def maybe_zero_3(param, ignore_status=False, name=None):
21
+ from deepspeed import zero
22
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
23
+ if hasattr(param, "ds_id"):
24
+ if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
25
+ if not ignore_status:
26
+ print(name, 'no ignore status')
27
+ with zero.GatheredParameters([param]):
28
+ param = param.data.detach().cpu().clone()
29
+ else:
30
+ param = param.detach().cpu().clone()
31
+ return param
32
+
33
+
34
+ def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
35
+ to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
36
+ to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()}
37
+ return to_return
38
+
39
+
40
+ def split_to_even_chunks(indices, lengths, num_chunks):
41
+ """
42
+ Split a list of indices into `num_chunks` chunks of roughly equal total length.
43
+ """
44
+
45
+ if len(indices) % num_chunks != 0:
46
+ return [indices[i::num_chunks] for i in range(num_chunks)]
47
+
48
+ num_indices_per_chunk = len(indices) // num_chunks
49
+
50
+ chunks = [[] for _ in range(num_chunks)]
51
+ chunks_lengths = [0 for _ in range(num_chunks)]
52
+ for index in indices:
53
+ shortest_chunk = chunks_lengths.index(min(chunks_lengths))
54
+ chunks[shortest_chunk].append(index)
55
+ chunks_lengths[shortest_chunk] += lengths[index]
56
+ if len(chunks[shortest_chunk]) == num_indices_per_chunk:
57
+ chunks_lengths[shortest_chunk] = float("inf")
58
+
59
+ return chunks
60
+
61
+
62
+ def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
63
+ # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
64
+ assert all(l != 0 for l in lengths), "Should not have zero length."
65
+ if all(l > 0 for l in lengths) or all(l < 0 for l in lengths):
66
+ # all samples are in the same modality
67
+ return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)
68
+ mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])
69
+ lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])
70
+
71
+ mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)]
72
+ lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)]
73
+ megabatch_size = world_size * batch_size
74
+ mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
75
+ lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]
76
+
77
+ last_mm = mm_megabatches[-1]
78
+ last_lang = lang_megabatches[-1]
79
+ additional_batch = last_mm + last_lang
80
+ megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
81
+ megabatch_indices = torch.randperm(len(megabatches), generator=generator)
82
+ megabatches = [megabatches[i] for i in megabatch_indices]
83
+
84
+ if len(additional_batch) > 0:
85
+ megabatches.append(sorted(additional_batch))
86
+
87
+ return [i for megabatch in megabatches for i in megabatch]
88
+
89
+
90
+ def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
91
+ # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
92
+ indices = torch.randperm(len(lengths), generator=generator)
93
+ megabatch_size = world_size * batch_size
94
+ megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
95
+ megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]
96
+ megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches]
97
+
98
+ return [i for megabatch in megabatches for batch in megabatch for i in batch]
99
+
100
+
101
+ class LengthGroupedSampler(Sampler):
102
+ r"""
103
+ Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
104
+ keeping a bit of randomness.
105
+ """
106
+
107
+ def __init__(
108
+ self,
109
+ batch_size: int,
110
+ world_size: int,
111
+ lengths: Optional[List[int]] = None,
112
+ generator=None,
113
+ group_by_modality: bool = False,
114
+ ):
115
+ if lengths is None:
116
+ raise ValueError("Lengths must be provided.")
117
+
118
+ self.batch_size = batch_size
119
+ self.world_size = world_size
120
+ self.lengths = lengths
121
+ self.generator = generator
122
+ self.group_by_modality = group_by_modality
123
+
124
+ def __len__(self):
125
+ return len(self.lengths)
126
+
127
+ def __iter__(self):
128
+ if self.group_by_modality:
129
+ indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
130
+ else:
131
+ indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
132
+ return iter(indices)
133
+
134
+
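A toy illustration of the length-grouping idea behind LengthGroupedSampler: shuffle, cut into world_size * batch_size "megabatches", then sort each megabatch by length so per-rank chunks see samples of similar size. The lengths below are made up for the sketch.

# Toy sketch of the megabatch construction used by get_length_grouped_indices.
import torch

lengths = [5, 50, 7, 48, 6, 52, 9, 47]        # toy sequence lengths
batch_size, world_size = 2, 2
megabatch_size = batch_size * world_size

g = torch.Generator().manual_seed(0)
indices = torch.randperm(len(lengths), generator=g).tolist()
megabatches = [indices[i:i + megabatch_size] for i in range(0, len(indices), megabatch_size)]
megabatches = [sorted(m, key=lambda i: lengths[i], reverse=True) for m in megabatches]
print(megabatches)  # each megabatch is sorted longest-first, which keeps padding low per rank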
135
+ class LlavaTrainer(Trainer):
136
+
137
+ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
138
+ if self.train_dataset is None or not has_length(self.train_dataset):
139
+ return None
140
+
141
+ if self.args.group_by_modality_length:
142
+ lengths = self.train_dataset.modality_lengths
143
+ return LengthGroupedSampler(
144
+ self.args.train_batch_size,
145
+ world_size=self.args.world_size * self.args.gradient_accumulation_steps,
146
+ lengths=lengths,
147
+ group_by_modality=True,
148
+ )
149
+ else:
150
+ return super()._get_train_sampler()
151
+
152
+ def create_optimizer(self):
153
+ """
154
+ Setup the optimizer.
155
+
156
+ We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
157
+ Trainer's init through `optimizers`, or subclass and override this method in a subclass.
158
+ """
159
+ if is_sagemaker_mp_enabled():
160
+ return super().create_optimizer()
161
+
162
+ opt_model = self.model
163
+
164
+ if self.optimizer is None:
165
+ decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
166
+ decay_parameters = [name for name in decay_parameters if "bias" not in name]
167
+ if self.args.mm_projector_lr is not None:
168
+ projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name]
169
+ optimizer_grouped_parameters = [
170
+ {
171
+ "params": [
172
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and p.requires_grad)
173
+ ],
174
+ "weight_decay": self.args.weight_decay,
175
+ },
176
+ {
177
+ "params": [
178
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and p.requires_grad)
179
+ ],
180
+ "weight_decay": 0.0,
181
+ },
182
+ {
183
+ "params": [
184
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in projector_parameters and p.requires_grad)
185
+ ],
186
+ "weight_decay": self.args.weight_decay,
187
+ "lr": self.args.mm_projector_lr,
188
+ },
189
+ {
190
+ "params": [
191
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in projector_parameters and p.requires_grad)
192
+ ],
193
+ "weight_decay": 0.0,
194
+ "lr": self.args.mm_projector_lr,
195
+ },
196
+ ]
197
+
198
+ elif self.args.cross_attention_layer_lr:
199
+ cross_attn_parameters = [name for name, _ in opt_model.named_parameters() if "cross_attn_" in name]
200
+ optimizer_grouped_parameters = [
201
+ {
202
+ "params": [
203
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in cross_attn_parameters and p.requires_grad)
204
+ ],
205
+ "weight_decay": self.args.weight_decay,
206
+ },
207
+ {
208
+ "params": [
209
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in cross_attn_parameters and p.requires_grad)
210
+ ],
211
+ "weight_decay": 0.0,
212
+ },
213
+ {
214
+ "params": [
215
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in cross_attn_parameters and p.requires_grad)
216
+ ],
217
+ "weight_decay": self.args.weight_decay,
218
+ "lr": self.args.cross_attention_layer_lr,
219
+ },
220
+ {
221
+ "params": [
222
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in cross_attn_parameters and p.requires_grad)
223
+ ],
224
+ "weight_decay": 0.0,
225
+ "lr": self.args.cross_attention_layer_lr,
226
+ },
227
+ ]
228
+
229
+ elif self.args.vision_tower_lr:
230
+ vision_tower_parameters = [name for name, _ in opt_model.named_parameters() if "vision_tower" in name]
231
+ optimizer_grouped_parameters = [
232
+ {
233
+ "params": [
234
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in vision_tower_parameters and p.requires_grad)
235
+ ],
236
+ "weight_decay": self.args.weight_decay,
237
+ },
238
+ {
239
+ "params": [
240
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in vision_tower_parameters and p.requires_grad)
241
+ ],
242
+ "weight_decay": 0.0,
243
+ },
244
+ {
245
+ "params": [
246
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in vision_tower_parameters and p.requires_grad)
247
+ ],
248
+ "weight_decay": self.args.weight_decay,
249
+ "lr": self.args.vision_tower_lr,
250
+ },
251
+ {
252
+ "params": [
253
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in vision_tower_parameters and p.requires_grad)
254
+ ],
255
+ "weight_decay": 0.0,
256
+ "lr": self.args.vision_tower_lr,
257
+ },
258
+ ]
259
+
260
+ else:
261
+ optimizer_grouped_parameters = [
262
+ {
263
+ "params": [
264
+ p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
265
+ ],
266
+ "weight_decay": self.args.weight_decay,
267
+ },
268
+ {
269
+ "params": [
270
+ p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
271
+ ],
272
+ "weight_decay": 0.0,
273
+ },
274
+ ]
275
+
276
+ optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
277
+
278
+ self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
279
+ if optimizer_cls.__name__ == "Adam8bit":
280
+ import bitsandbytes
281
+
282
+ manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
283
+
284
+ skipped = 0
285
+ for module in opt_model.modules():
286
+ if isinstance(module, nn.Embedding):
287
+ skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
288
+ logger.info(f"skipped {module}: {skipped/2**20}M params")
289
+ manager.register_module_override(module, "weight", {"optim_bits": 32})
290
+ logger.debug(f"bitsandbytes: will optimize {module} in fp32")
291
+ logger.info(f"skipped: {skipped/2**20}M params")
292
+
293
+ return self.optimizer
294
+
295
+ # Min: save all the model parameters even during pretraining
296
+ # def _save_checkpoint(self, model, trial, metrics=None):
297
+ # if getattr(self.args, 'tune_mm_mlp_adapter', False):
298
+ # from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
299
+ # checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
300
+
301
+ # run_dir = self._get_output_dir(trial=trial)
302
+ # output_dir = os.path.join(run_dir, checkpoint_folder)
303
+
304
+ # # Only save Adapter
305
+ # keys_to_match = ['mm_projector', 'vision_resampler']
306
+ # if getattr(self.args, "use_im_start_end", False):
307
+ # keys_to_match.extend(['embed_tokens', 'embed_in'])
308
+
309
+ # weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)
310
+
311
+ # if self.args.local_rank == 0 or self.args.local_rank == -1:
312
+ # self.model.config.save_pretrained(output_dir)
313
+ # torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
314
+ # else:
315
+ # super(LlavaTrainer, self)._save_checkpoint(model, trial, metrics)
316
+
317
+ # def _save(self, output_dir: Optional[str] = None, state_dict=None):
318
+ # if getattr(self.args, 'tune_mm_mlp_adapter', False):
319
+ # pass
320
+ # else:
321
+ # super(LlavaTrainer, self)._save(output_dir, state_dict)
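For reference, a minimal sketch of the parameter-group mechanism create_optimizer relies on to give the multimodal projector (or vision tower / cross-attention layers) its own learning rate. The module names and learning rates are assumptions for illustration, not values from this repo.

# Sketch of per-module learning rates via optimizer parameter groups.
import torch
import torch.nn as nn

model = nn.ModuleDict({
    "backbone": nn.Linear(16, 16),
    "mm_projector": nn.Linear(16, 16),   # assumed module name, mirroring the code above
})
base_lr, projector_lr, weight_decay = 1e-5, 1e-4, 0.0

param_groups = [
    {"params": [p for n, p in model.named_parameters() if "mm_projector" not in n],
     "lr": base_lr, "weight_decay": weight_decay},
    {"params": [p for n, p in model.named_parameters() if "mm_projector" in n],
     "lr": projector_lr, "weight_decay": weight_decay},
]
optimizer = torch.optim.AdamW(param_groups)
print([g["lr"] for g in optimizer.param_groups])  # [1e-05, 0.0001]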
llava/utils.py ADDED
@@ -0,0 +1,212 @@
1
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
2
+
3
+ import datetime
4
+ import time
5
+ import logging
6
+ import logging.handlers
7
+ import os
8
+ import sys
9
+
10
+ import requests
11
+ import torch
12
+ import transformers
13
+ from transformers.integrations import is_deepspeed_zero3_enabled
14
+
15
+ from llava.constants import LOGDIR
16
+
17
+ server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
18
+ moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
19
+
20
+ handler = None
21
+
22
+
23
+ def build_logger(logger_name, logger_filename):
24
+ global handler
25
+
26
+ formatter = logging.Formatter(
27
+ fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
28
+ datefmt="%Y-%m-%d %H:%M:%S",
29
+ )
30
+
31
+ # Set the format of root handlers
32
+ if not logging.getLogger().handlers:
33
+ logging.basicConfig(level=logging.INFO)
34
+ logging.getLogger().handlers[0].setFormatter(formatter)
35
+
36
+ # Redirect stdout and stderr to loggers
37
+ stdout_logger = logging.getLogger("stdout")
38
+ stdout_logger.setLevel(logging.INFO)
39
+ sl = StreamToLogger(stdout_logger, logging.INFO)
40
+ sys.stdout = sl
41
+
42
+ stderr_logger = logging.getLogger("stderr")
43
+ stderr_logger.setLevel(logging.ERROR)
44
+ sl = StreamToLogger(stderr_logger, logging.ERROR)
45
+ sys.stderr = sl
46
+
47
+ # Get logger
48
+ logger = logging.getLogger(logger_name)
49
+ logger.setLevel(logging.INFO)
50
+
51
+ # Add a file handler for all loggers
52
+ if handler is None:
53
+ os.makedirs(LOGDIR, exist_ok=True)
54
+ filename = os.path.join(LOGDIR, logger_filename)
55
+ handler = logging.handlers.TimedRotatingFileHandler(
56
+ filename, when='D', utc=True, encoding='UTF-8')
57
+ handler.setFormatter(formatter)
58
+
59
+ for name, item in logging.root.manager.loggerDict.items():
60
+ if isinstance(item, logging.Logger):
61
+ item.addHandler(handler)
62
+
63
+ return logger
64
+
65
+
66
+ class StreamToLogger(object):
67
+ """
68
+ Fake file-like stream object that redirects writes to a logger instance.
69
+ """
70
+ def __init__(self, logger, log_level=logging.INFO):
71
+ self.terminal = sys.stdout
72
+ self.logger = logger
73
+ self.log_level = log_level
74
+ self.linebuf = ''
75
+
76
+ def __getattr__(self, attr):
77
+ return getattr(self.terminal, attr)
78
+
79
+ def write(self, buf):
80
+ temp_linebuf = self.linebuf + buf
81
+ self.linebuf = ''
82
+ for line in temp_linebuf.splitlines(True):
83
+ # From the io.TextIOWrapper docs:
84
+ # On output, if newline is None, any '\n' characters written
85
+ # are translated to the system default line separator.
86
+ # By default sys.stdout.write() expects '\n' newlines and then
87
+ # translates them so this is still cross platform.
88
+ if line[-1] == '\n':
89
+ self.logger.log(self.log_level, line.rstrip())
90
+ else:
91
+ self.linebuf += line
92
+
93
+ def flush(self):
94
+ if self.linebuf != '':
95
+ self.logger.log(self.log_level, self.linebuf.rstrip())
96
+ self.linebuf = ''
97
+
98
+
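A small sketch of what StreamToLogger does once build_logger swaps it in for sys.stdout: writes are buffered and emitted through the logger one completed line at a time. It assumes the llava package from this commit is importable; the logger name and format are illustrative.

# Sketch only: exercise StreamToLogger directly instead of replacing sys.stdout.
import logging
from llava.utils import StreamToLogger

logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(name)s | %(message)s")
demo_logger = logging.getLogger("stdout-demo")

stream = StreamToLogger(demo_logger, logging.INFO)
stream.write("first line\npartial ")
stream.write("line completed\n")
stream.flush()
# -> INFO | stdout-demo | first line
# -> INFO | stdout-demo | partial line completed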
99
+ def disable_torch_init():
100
+ """
101
+ Disable the redundant torch default initialization to accelerate model creation.
102
+ """
103
+ import torch
104
+ setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
105
+ setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
106
+
107
+
108
+ def violates_moderation(text):
109
+ """
110
+ Check whether the text violates OpenAI moderation API.
111
+ """
112
+ url = "https://api.openai.com/v1/moderations"
113
+ headers = {"Content-Type": "application/json",
114
+ "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
115
+ text = text.replace("\n", "")
116
+ data = "{" + '"input": ' + f'"{text}"' + "}"
117
+ data = data.encode("utf-8")
118
+ try:
119
+ ret = requests.post(url, headers=headers, data=data, timeout=5)
120
+ flagged = ret.json()["results"][0]["flagged"]
121
+ except requests.exceptions.RequestException as e:
122
+ flagged = False
123
+ except KeyError as e:
124
+ flagged = False
125
+
126
+ return flagged
127
+
128
+
129
+ def pretty_print_semaphore(semaphore):
130
+ if semaphore is None:
131
+ return "None"
132
+ return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
133
+
134
+
135
+
136
+ @torch.no_grad()
137
+ def load_state_dict_into_model(model_to_load, state_dict, start_prefix=""):
138
+ # copied and altered from:
139
+ # https://github.com/huggingface/transformers/blob/9d35edbb30625489bf286a9b15aed0c5a3119c1c/src/transformers/modeling_utils.py#L650
140
+ # https://github.com/baaivision/EVA/blob/2ca37a8c0d82b9496754f3fa9c3966b4caa54d75/EVA-CLIP-18B/shinji/eva_clip/factory.py#L168
141
+
142
+ # copy state_dict so _load_from_state_dict can modify it
143
+ metadata = getattr(state_dict, "_metadata", None)
144
+ state_dict = state_dict.copy()
145
+ if metadata is not None:
146
+ state_dict._metadata = metadata
147
+ error_msgs = []
148
+ # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
149
+ # so we need to apply the function recursively.
150
+ def load(module: torch.nn.Module, prefix=""):
151
+ local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
152
+ args = (state_dict, prefix, local_metadata, True, [], [], error_msgs)
153
+ # Parameters of the module and its children are stored in `state_dict` under keys that start with `prefix`.
154
+ if is_deepspeed_zero3_enabled():
155
+ import deepspeed
156
+ with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0):
157
+ if torch.distributed.get_rank() == 0:
158
+ module._load_from_state_dict(*args)
159
+ else:
160
+ module._load_from_state_dict(*args)
161
+ for name, child in module._modules.items():
162
+ if child is not None:
163
+ load(child, prefix + name + ".")
164
+
165
+ load(model_to_load, prefix=start_prefix)
166
+ # Delete `state_dict` so it could be collected by GC earlier. Note that `state_dict` is a copy of the argument, so
167
+ # it's safe to delete it.
168
+ del state_dict
169
+ return error_msgs
170
+
171
+
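A minimal sketch of calling load_state_dict_into_model outside DeepSpeed: it returns a list of per-tensor error messages (e.g. shape mismatches) instead of raising on a strict load. The toy model and checkpoint below are illustrative.

# Sketch: load a state dict and inspect the collected error messages.
import torch.nn as nn
from llava.utils import load_state_dict_into_model

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
checkpoint = {k: v.clone() for k, v in model.state_dict().items()}

errors = load_state_dict_into_model(model, checkpoint)
print(errors)  # [] when every tensor loads cleanly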
172
+ class Timer:
173
+ def __init__(self):
174
+ self.start_time = None
175
+ self.elapsed_time = 0
176
+
177
+ def start(self):
178
+ self.start_time = time.time()
179
+
180
+ def reset(self):
181
+ self.start_time = None
182
+ self.elapsed_time = 0
183
+
184
+ def get_elapsed_time(self):
185
+ if self.start_time is not None:
186
+ return self.elapsed_time + (time.time() - self.start_time)
+ return self.elapsed_time  # timer not started yet
187
+
188
+
189
+ class TimeoutTerminateCallback(transformers.TrainerCallback):
190
+ def __init__(self, args, total_time_limit=240, pre_terminate_time=10):
191
+ self.training_args = args
192
+ self.total_time_limit = total_time_limit
193
+ self.pre_terminate_time = pre_terminate_time
194
+ self.timer = Timer()
195
+ self.timer.start()
196
+
197
+ if args.local_rank == 0:
198
+ print(f"Timer for terminate callback has been set.\nTotal limit: {total_time_limit}min\nPre terminate time: {pre_terminate_time}min")
199
+
200
+ self.time_to_kill = (total_time_limit - pre_terminate_time) * 60
201
+
202
+
203
+ def on_step_end(self, args, state, control, model, **kwargs):
204
+ elapsed_time = self.timer.get_elapsed_time()
205
+
206
+ if elapsed_time > self.time_to_kill:
207
+ if args.local_rank == 0:
208
+ print("Timeout, start to save checkpoint....")
209
+ control.should_save = True
210
+ control.should_training_stop = True
211
+
212
+ return control
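Finally, a sketch of the callback's decision logic in isolation: once elapsed time passes (total_time_limit - pre_terminate_time) minutes, it flips the Trainer control flags so the run checkpoints and stops. In practice the callback would be passed to a transformers Trainer via callbacks=[...]; the SimpleNamespace stand-in for the training args is an assumption for this sketch.

# Sketch: drive TimeoutTerminateCallback without a full training run.
from types import SimpleNamespace
from transformers import TrainerControl, TrainerState
from llava.utils import TimeoutTerminateCallback

args = SimpleNamespace(local_rank=0)
callback = TimeoutTerminateCallback(args, total_time_limit=240, pre_terminate_time=10)

control = callback.on_step_end(args, TrainerState(), TrainerControl(), model=None)
print(control.should_training_stop)  # False until ~230 minutes have elapsed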