diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..f2c74859d7e120622a212127ea3cbcb5dfb22caf
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..a26ede40ac4da2c448114b873bf8524cf9b5d21d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+demo/videos/ filter=lfs diff=lfs merge=lfs -text
+demo/videos/3.mp4 filter=lfs diff=lfs merge=lfs -text
+demo/videos/4.mp4 filter=lfs diff=lfs merge=lfs -text
+demo/videos/1.mp4 filter=lfs diff=lfs merge=lfs -text
+demo/videos/2.mp4 filter=lfs diff=lfs merge=lfs -text
+demo/images/4.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/5.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/6.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/8.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/1.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/2.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/3.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/7.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/LICENSE filter=lfs diff=lfs merge=lfs -text
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ceffc0625f1ea00cb339e68cae55cd95a2e4dac
--- /dev/null
+++ b/app.py
@@ -0,0 +1,562 @@
+import gradio as gr
+import numpy as np
+import torch
+from transformers import SamModel, SamProcessor
+from PIL import Image
+import os
+import cv2
+import argparse
+import sys
+# This is for making model initialization faster and has no effect since we are loading the weights
+sys.path.append('./')
+from videollama3 import disable_torch_init, model_init, mm_infer, get_model_output
+from videollama3.mm_utils import load_images
+from videollama3.mm_utils import load_video
+
+
+color_rgb = (1.0, 1.0, 1.0)
+color_rgbs = [
+    (1.0, 1.0, 1.0),
+    (1.0, 0.0, 0.0),
+    (0.0, 1.0, 1.0),
+    (0.0, 1.0, 0.0),
+    (0.0, 0.0, 1.0),
+    (1.0, 0.0, 1.0),
+    ]
+
+mask_list = []
+mask_raw_list = []
+mask_list_video = []
+mask_raw_list_video = []
+
+def extract_first_frame_from_video(video):
+    cap = cv2.VideoCapture(video)
+    success, frame = cap.read()
+    cap.release()
+    if success:
+        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+    return None
+
+def extract_points_from_mask(mask_pil):
+    mask = np.asarray(mask_pil)[..., 0]
+    coords = np.nonzero(mask)
+    coords = np.stack((coords[1], coords[0]), axis=1)
+
+    return coords
+
+def add_contour(img, mask, color=(1., 1., 1.)):
+    img = img.copy()
+
+    mask = mask.astype(np.uint8) * 255
+    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    cv2.drawContours(img, contours, -1, color, thickness=8)
+
+    return img
+
+def generate_masks(image):
+    global mask_list
+    global mask_raw_list
+    image['image'] = image['background'].convert('RGB')
+    # del image['background'], image['composite']
+    assert len(image['layers']) == 1, f"Expected 1 layer, got {len(image['layers'])}"
+
+    mask = Image.fromarray((np.asarray(image['layers'][0])[..., 3] > 0).astype(np.uint8) * 255).convert('RGB')
+    points = extract_points_from_mask(mask)
+    np.random.seed(0)
+    if points.shape[0] == 0:
+        raise gr.Error("No points selected")
+
+    points_selected_indices = np.random.choice(points.shape[0], size=min(points.shape[0], 8), replace=False)
+    points = points[points_selected_indices]
+    coords = [points.tolist()]
+    mask_np = apply_sam(image['image'], coords)
+
+    mask_raw_list.append(mask_np)
+    mask_image = Image.fromarray((mask_np[:,:,np.newaxis] * np.array(image['image'])).astype(np.uint8))
+
+    mask_list.append((mask_image, f""))
+    # Return a list containing the mask image.
+    image['layers'] = []
+    image['composite'] = image['background']
+    return mask_list, image
+
+
+def generate_masks_video(image):
+    global mask_list_video
+    global mask_raw_list_video
+    image['image'] = image['background'].convert('RGB')
+    # del image['background'], image['composite']
+    assert len(image['layers']) == 1, f"Expected 1 layer, got {len(image['layers'])}"
+
+    mask = Image.fromarray((np.asarray(image['layers'][0])[..., 3] > 0).astype(np.uint8) * 255).convert('RGB')
+    points = extract_points_from_mask(mask)
+    np.random.seed(0)
+    if points.shape[0] == 0:
+        raise gr.Error("No points selected")
+
+    points_selected_indices = np.random.choice(points.shape[0], size=min(points.shape[0], 8), replace=False)
+    points = points[points_selected_indices]
+    coords = [points.tolist()]
+    mask_np = apply_sam(image['image'], coords)
+
+    mask_raw_list_video.append(mask_np)
+    mask_image = Image.fromarray((mask_np[:,:,np.newaxis] * np.array(image['image'])).astype(np.uint8))
+
+    mask_list_video.append((mask_image, f""))
+    # Return a list containing the mask image.
+    image['layers'] = []
+    image['composite'] = image['background']
+    return mask_list_video, image
+
+
+def describe(image, mode, query, masks):
+    # Create an image object from the uploaded image
+    # print(image.keys())
+
+    image['image'] = image['background'].convert('RGB')
+    # del image['background'], image['composite']
+    assert len(image['layers']) == 1, f"Expected 1 layer, got {len(image['layers'])}"
+
+    # Handle both hex and rgba color formats
+
+    img_np = np.asarray(image['image']).astype(float) / 255.
+    if mode=='Caption':
+        mask = Image.fromarray((np.asarray(image['layers'][0])[..., 3] > 0).astype(np.uint8) * 255).convert('RGB')
+
+        points = extract_points_from_mask(mask)
+
+        np.random.seed(0)
+
+        if points.shape[0] == 0:
+            if len(masks)>1:
+                raise gr.Error("No points selected")
+
+        else:
+            # Randomly sample 8 points from the mask
+            # Follow DAM https://github.com/NVlabs/describe-anything
+            points_selected_indices = np.random.choice(points.shape[0], size=min(points.shape[0], 8), replace=False)
+            points = points[points_selected_indices]
+
+        coords = [points.tolist()]
+
+        mask_np = apply_sam(image['image'], coords)
+
+        masks = []
+        masks.append(mask_np)
+        mask_ids = [0]
+
+        img_with_contour_np = add_contour(img_np, mask_np, color=color_rgb)
+        img_with_contour_pil = Image.fromarray((img_with_contour_np * 255.).astype(np.uint8))
+    else:
+        masks = mask_raw_list
+        img_with_contour_np = img_np.copy()
+
+        mask_ids = []
+        for i, mask_np in enumerate(masks):
+            img_with_contour_np = add_contour(img_with_contour_np, mask_np, color=color_rgbs[i])
+            img_with_contour_pil = Image.fromarray((img_with_contour_np * 255.).astype(np.uint8))
+            mask_ids.append(0)
+
+    masks = np.stack(masks, axis=0)
+    masks = torch.from_numpy(masks).to(torch.uint8)
+
+
+    img = np.asarray(image['image'])
+
+
+    if mode == "Caption":
+        query = '<image>\nPlease describe the <region> in the image in detail.'
+    else:
+        if len(masks)==1:
+            prefix = "<image>\nThere is 1 region in the image: <region>. "
+        else:
+            prefix = f"<image>\nThere are {len(masks)} regions in the image: "
+            for i in range(len(masks)):
+                prefix += f"<region>, "
+            prefix = prefix[:-2]+'. '
+        query = prefix + query
+    # print(query)
+
+    image['layers'] = []
+    image['composite'] = image['background']
+
+    text = ""
+    yield img_with_contour_pil, text, image
+
+    for token in get_model_output(
+        [img],
+        query,
+        model=model,
+        tokenizer=tokenizer,
+        masks=masks,
+        mask_ids=mask_ids,
+        modal='image',
+        image_downsampling=1,
+        streaming=True,
+    ):
+        text += token
+        yield gr.update(), text, gr.update()
+
+
+def load_first_frame(video_path):
+    cap = cv2.VideoCapture(video_path)
+    ret, frame = cap.read()
+    cap.release()
+    if not ret:
+        raise gr.Error("Could not read the video file.")
+    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+    image = Image.fromarray(frame)
+    return image
+
+def describe_video(video_path, mode, query, annotated_frame, masks):
+    global mask_list_video
+    # Create a temporary directory to save extracted video frames
+    cap = cv2.VideoCapture(video_path)
+
+    video_tensor = load_video(video_path, fps=4, max_frames=768, frame_ids=[0])
+
+    annotated_frame['image'] = annotated_frame['background'].convert('RGB')
+
+    # Process the annotated frame from the image editor
+    if isinstance(annotated_frame, dict):
+        # Get the composite image with annotations
+        frame_img = annotated_frame.get("image", annotated_frame.get("background"))
+        if frame_img is None:
+            raise gr.Error("No valid annotation found in the image editor.")
+        frame_img = frame_img.convert("RGB")
+
+        # Get the annotation layer
+        if "layers" in annotated_frame and len(annotated_frame["layers"]) > 0:
+            mask = Image.fromarray((np.asarray(annotated_frame["layers"][0])[..., 3] > 0).astype(np.uint8) * 255).convert("RGB")
+        else:
+            mask = Image.new("RGB", frame_img.size, 0)
+    else:
+        frame_img = annotated_frame.convert("RGB")
+        mask = Image.new("RGB", frame_img.size, 0)
+
+    img_np = np.asarray(annotated_frame['image']).astype(float) / 255.
+    # Extract points from the annotated mask (using the first channel)
+    if mode == "Caption":
+        points = extract_points_from_mask(mask)
+        np.random.seed(0)
+        if points.shape[0] == 0:
+            raise gr.Error("No points were selected in the annotation.")
+        # Randomly select up to 8 points
+        # Follow DAM https://github.com/NVlabs/describe-anything
+        points_selected_indices = np.random.choice(points.shape[0], size=min(points.shape[0], 8), replace=False)
+        points = points[points_selected_indices]
+
+        # print(f"Selected points (to SAM): {points}")
+
+        coords = [points.tolist()]
+
+        mask_np = apply_sam(annotated_frame['image'], coords)
+
+        masks = []
+        masks.append(mask_np)
+        mask_ids = [0]
+
+        # img_with_contour_np = add_contour(img_np, mask_np, color=color_rgb)
+        # img_with_contour_pil = Image.fromarray((img_with_contour_np * 255.).astype(np.uint8))
+
+    else:
+        masks = mask_raw_list_video
+        img_with_contour_np = img_np.copy()
+
+        mask_ids = []
+        for i, mask_np in enumerate(masks):
+            # img_with_contour_np = add_contour(img_with_contour_np, mask_np, color=color_rgbs[i])
+            # img_with_contour_pil = Image.fromarray((img_with_contour_np * 255.).astype(np.uint8))
+            mask_ids.append(0)
+
+    masks = np.stack(masks, axis=0)
+    masks = torch.from_numpy(masks).to(torch.uint8)
+
+
+    if mode == "Caption":
+        query = '
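Note for reviewers: `apply_sam(image, coords)` is used throughout the code above, but its definition lies in the part of `app.py` not shown in this excerpt. For reference, below is a minimal sketch of what such a helper could look like with the `SamModel`/`SamProcessor` classes already imported at the top of the file. The checkpoint name, device handling, and best-of-three mask selection are assumptions for illustration, not necessarily what the actual implementation does.

```python
import torch
from transformers import SamModel, SamProcessor

# Assumed globals; the real app.py may create these elsewhere (e.g. at startup).
device = "cuda" if torch.cuda.is_available() else "cpu"
sam_model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)  # assumed checkpoint
sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")


def apply_sam(image, input_points):
    # `input_points` is the nested list built above ([[ [x, y], ... ]]):
    # one group of point prompts for a single object in a single image.
    inputs = sam_processor(image, input_points=input_points, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = sam_model(**inputs)

    # Rescale the low-resolution mask predictions back to the original image size.
    masks = sam_processor.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu(),
    )[0][0]                            # (3, H, W) candidate masks for this prompt
    scores = outputs.iou_scores[0, 0]  # predicted IoU for each candidate

    # Keep the highest-scoring candidate as a binary (H, W) mask.
    return masks[scores.argmax()].numpy()
```

The nested `coords = [points.tolist()]` built in `generate_masks` matches the `input_points` layout the processor expects for a single image with one prompted object, and the returned binary mask is what the callers above multiply against the image and stack into the `masks` tensor.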