Spaces: Running on Zero

init #1
by CircleRadon - opened

This view is limited to 50 files because it contains too many changes. See the raw diff here.
- .DS_Store +0 -0
- .gitattributes +14 -0
- app.py +562 -0
- demo/.DS_Store +0 -0
- demo/images/1.jpg +3 -0
- demo/images/2.jpg +3 -0
- demo/images/3.jpg +3 -0
- demo/images/4.jpg +3 -0
- demo/images/5.jpg +3 -0
- demo/images/6.jpg +3 -0
- demo/images/7.jpg +3 -0
- demo/images/8.jpg +3 -0
- demo/images/LICENSE +3 -0
- demo/videos/1.mp4 +3 -0
- demo/videos/2.mp4 +3 -0
- demo/videos/3.mp4 +3 -0
- demo/videos/4.mp4 +3 -0
- requirements.txt +48 -0
- videollama3/.DS_Store +0 -0
- videollama3/__init__.py +239 -0
- videollama3/constants.py +46 -0
- videollama3/infer.py +82 -0
- videollama3/mm_utils.py +704 -0
- videollama3/model/__init__.py +166 -0
- videollama3/model/__pycache__/__init__.cpython-310.pyc +0 -0
- videollama3/model/__pycache__/encoder.cpython-310.pyc +0 -0
- videollama3/model/__pycache__/processor.cpython-310.pyc +0 -0
- videollama3/model/__pycache__/projector.cpython-310.pyc +0 -0
- videollama3/model/__pycache__/region_encoder.cpython-310.pyc +0 -0
- videollama3/model/__pycache__/videollama3_arch.cpython-310.pyc +0 -0
- videollama3/model/__pycache__/videollama3_qwen2.cpython-310.pyc +0 -0
- videollama3/model/damovl_encoder/__init__.py +3 -0
- videollama3/model/damovl_encoder/__pycache__/__init__.cpython-310.pyc +0 -0
- videollama3/model/damovl_encoder/__pycache__/configuration_damovl_encoder.cpython-310.pyc +0 -0
- videollama3/model/damovl_encoder/__pycache__/image_processing.cpython-310.pyc +0 -0
- videollama3/model/damovl_encoder/__pycache__/modeling_damovl_encoder.cpython-310.pyc +0 -0
- videollama3/model/damovl_encoder/configuration_damovl_encoder.py +71 -0
- videollama3/model/damovl_encoder/image_processing.py +472 -0
- videollama3/model/damovl_encoder/modeling_damovl_encoder.py +542 -0
- videollama3/model/encoder.py +385 -0
- videollama3/model/processor.py +366 -0
- videollama3/model/projector.py +160 -0
- videollama3/model/qwen2vl_encoder/__init__.py +3 -0
- videollama3/model/qwen2vl_encoder/__pycache__/__init__.cpython-310.pyc +0 -0
- videollama3/model/qwen2vl_encoder/__pycache__/configuration_qwen2vl_encoder.cpython-310.pyc +0 -0
- videollama3/model/qwen2vl_encoder/__pycache__/image_processing.cpython-310.pyc +0 -0
- videollama3/model/qwen2vl_encoder/__pycache__/modeling_qwen2vl_encoder.cpython-310.pyc +0 -0
- videollama3/model/qwen2vl_encoder/configuration_qwen2vl_encoder.py +72 -0
- videollama3/model/qwen2vl_encoder/image_processing.py +469 -0
- videollama3/model/qwen2vl_encoder/modeling_qwen2vl_encoder.py +367 -0
.DS_Store
ADDED
Binary file (6.15 kB).
.gitattributes
CHANGED
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+demo/videos/ filter=lfs diff=lfs merge=lfs -text
+demo/videos/3.mp4 filter=lfs diff=lfs merge=lfs -text
+demo/videos/4.mp4 filter=lfs diff=lfs merge=lfs -text
+demo/videos/1.mp4 filter=lfs diff=lfs merge=lfs -text
+demo/videos/2.mp4 filter=lfs diff=lfs merge=lfs -text
+demo/images/4.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/5.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/6.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/8.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/1.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/2.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/3.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/7.jpg filter=lfs diff=lfs merge=lfs -text
+demo/images/LICENSE filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,562 @@
import gradio as gr
import numpy as np
import torch
from transformers import SamModel, SamProcessor
from PIL import Image
import os
import cv2
import argparse
import sys
# This is for making model initialization faster and has no effect since we are loading the weights
sys.path.append('./')
from videollama3 import disable_torch_init, model_init, mm_infer, get_model_output
from videollama3.mm_utils import load_images
from videollama3.mm_utils import load_video


color_rgb = (1.0, 1.0, 1.0)
color_rgbs = [
    (1.0, 1.0, 1.0),
    (1.0, 0.0, 0.0),
    (0.0, 1.0, 1.0),
    (0.0, 1.0, 0.0),
    (0.0, 0.0, 1.0),
    (1.0, 0.0, 1.0),
]

mask_list = []
mask_raw_list = []
mask_list_video = []
mask_raw_list_video = []

def extract_first_frame_from_video(video):
    cap = cv2.VideoCapture(video)
    success, frame = cap.read()
    cap.release()
    if success:
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    return None

def extract_points_from_mask(mask_pil):
    mask = np.asarray(mask_pil)[..., 0]
    coords = np.nonzero(mask)
    coords = np.stack((coords[1], coords[0]), axis=1)

    return coords

def add_contour(img, mask, color=(1., 1., 1.)):
    img = img.copy()

    mask = mask.astype(np.uint8) * 255
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(img, contours, -1, color, thickness=8)

    return img

def generate_masks(image):
    global mask_list
    global mask_raw_list
    image['image'] = image['background'].convert('RGB')
    # del image['background'], image['composite']
    assert len(image['layers']) == 1, f"Expected 1 layer, got {len(image['layers'])}"

    mask = Image.fromarray((np.asarray(image['layers'][0])[..., 3] > 0).astype(np.uint8) * 255).convert('RGB')
    points = extract_points_from_mask(mask)
    np.random.seed(0)
    if points.shape[0] == 0:
        raise gr.Error("No points selected")

    points_selected_indices = np.random.choice(points.shape[0], size=min(points.shape[0], 8), replace=False)
    points = points[points_selected_indices]
    coords = [points.tolist()]
    mask_np = apply_sam(image['image'], coords)

    mask_raw_list.append(mask_np)
    mask_image = Image.fromarray((mask_np[:,:,np.newaxis] * np.array(image['image'])).astype(np.uint8))

    mask_list.append((mask_image, f"<region{len(mask_list)}>"))
    # Return a list containing the mask image.
    image['layers'] = []
    image['composite'] = image['background']
    return mask_list, image


def generate_masks_video(image):
    global mask_list_video
    global mask_raw_list_video
    image['image'] = image['background'].convert('RGB')
    # del image['background'], image['composite']
    assert len(image['layers']) == 1, f"Expected 1 layer, got {len(image['layers'])}"

    mask = Image.fromarray((np.asarray(image['layers'][0])[..., 3] > 0).astype(np.uint8) * 255).convert('RGB')
    points = extract_points_from_mask(mask)
    np.random.seed(0)
    if points.shape[0] == 0:
        raise gr.Error("No points selected")

    points_selected_indices = np.random.choice(points.shape[0], size=min(points.shape[0], 8), replace=False)
    points = points[points_selected_indices]
    coords = [points.tolist()]
    mask_np = apply_sam(image['image'], coords)

    mask_raw_list_video.append(mask_np)
    mask_image = Image.fromarray((mask_np[:,:,np.newaxis] * np.array(image['image'])).astype(np.uint8))

    mask_list_video.append((mask_image, f"<object{len(mask_list_video)}>"))
    # Return a list containing the mask image.
    image['layers'] = []
    image['composite'] = image['background']
    return mask_list_video, image



def describe(image, mode, query, masks):
    # Create an image object from the uploaded image
    # print(image.keys())

    image['image'] = image['background'].convert('RGB')
    # del image['background'], image['composite']
    assert len(image['layers']) == 1, f"Expected 1 layer, got {len(image['layers'])}"

    # Handle both hex and rgba color formats

    img_np = np.asarray(image['image']).astype(float) / 255.
    if mode=='Caption':
        mask = Image.fromarray((np.asarray(image['layers'][0])[..., 3] > 0).astype(np.uint8) * 255).convert('RGB')

        points = extract_points_from_mask(mask)

        np.random.seed(0)

        if points.shape[0] == 0:
            if len(masks)>1:
                raise gr.Error("No points selected")

        else:
            # Randomly sample 8 points from the mask
            # Follow DAM https://github.com/NVlabs/describe-anything
            points_selected_indices = np.random.choice(points.shape[0], size=min(points.shape[0], 8), replace=False)
            points = points[points_selected_indices]

            coords = [points.tolist()]

            mask_np = apply_sam(image['image'], coords)

            masks = []
            masks.append(mask_np)
            mask_ids = [0]

            img_with_contour_np = add_contour(img_np, mask_np, color=color_rgb)
            img_with_contour_pil = Image.fromarray((img_with_contour_np * 255.).astype(np.uint8))
    else:
        masks = mask_raw_list
        img_with_contour_np = img_np.copy()

        mask_ids = []
        for i, mask_np in enumerate(masks):
            img_with_contour_np = add_contour(img_with_contour_np, mask_np, color=color_rgbs[i])
            img_with_contour_pil = Image.fromarray((img_with_contour_np * 255.).astype(np.uint8))
            mask_ids.append(0)

    masks = np.stack(masks, axis=0)
    masks = torch.from_numpy(masks).to(torch.uint8)


    img = np.asarray(image['image'])


    if mode == "Caption":
        query = '<image>\nPlease describe the <region> in the image in detail.'
    else:
        if len(masks)==1:
            prefix = "<image>\nThere is 1 region in the image: <region0> <region>. "
        else:
            prefix = f"<image>\nThere is {len(masks)} region in the image: "
            for i in range(len(masks)):
                prefix += f"<region{i}><region>, "
            prefix = prefix[:-2]+'. '
        query = prefix + query
    # print(query)

    image['layers'] = []
    image['composite'] = image['background']

    text = ""
    yield img_with_contour_pil, text, image

    for token in get_model_output(
        [img],
        query,
        model=model,
        tokenizer=tokenizer,
        masks=masks,
        mask_ids=mask_ids,
        modal='image',
        image_downsampling=1,
        streaming=True,
    ):
        text += token
        yield gr.update(), text, gr.update()


def load_first_frame(video_path):
    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()
    cap.release()
    if not ret:
        raise gr.Error("Could not read the video file.")
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = Image.fromarray(frame)
    return image

def describe_video(video_path, mode, query, annotated_frame, masks):
    global mask_list_video
    # Create a temporary directory to save extracted video frames
    cap = cv2.VideoCapture(video_path)

    video_tensor = load_video(video_path, fps=4, max_frames=768, frame_ids=[0])

    annotated_frame['image'] = annotated_frame['background'].convert('RGB')

    # Process the annotated frame from the image editor
    if isinstance(annotated_frame, dict):
        # Get the composite image with annotations
        frame_img = annotated_frame.get("image", annotated_frame.get("background"))
        if frame_img is None:
            raise gr.Error("No valid annotation found in the image editor.")
        frame_img = frame_img.convert("RGB")

        # Get the annotation layer
        if "layers" in annotated_frame and len(annotated_frame["layers"]) > 0:
            mask = Image.fromarray((np.asarray(annotated_frame["layers"][0])[..., 3] > 0).astype(np.uint8) * 255).convert("RGB")
        else:
            mask = Image.new("RGB", frame_img.size, 0)
    else:
        frame_img = annotated_frame.convert("RGB")
        mask = Image.new("RGB", frame_img.size, 0)

    img_np = np.asarray(annotated_frame['image']).astype(float) / 255.
    # Extract points from the annotated mask (using the first channel)
    if mode == "Caption":
        points = extract_points_from_mask(mask)
        np.random.seed(0)
        if points.shape[0] == 0:
            raise gr.Error("No points were selected in the annotation.")
        # Randomly select up to 8 points
        # Follow DAM https://github.com/NVlabs/describe-anything
        points_selected_indices = np.random.choice(points.shape[0], size=min(points.shape[0], 8), replace=False)
        points = points[points_selected_indices]

        # print(f"Selected points (to SAM): {points}")

        coords = [points.tolist()]

        mask_np = apply_sam(annotated_frame['image'], coords)

        masks = []
        masks.append(mask_np)
        mask_ids = [0]

        # img_with_contour_np = add_contour(img_np, mask_np, color=color_rgb)
        # img_with_contour_pil = Image.fromarray((img_with_contour_np * 255.).astype(np.uint8))

    else:
        masks = mask_raw_list_video
        img_with_contour_np = img_np.copy()

        mask_ids = []
        for i, mask_np in enumerate(masks):
            # img_with_contour_np = add_contour(img_with_contour_np, mask_np, color=color_rgbs[i])
            # img_with_contour_pil = Image.fromarray((img_with_contour_np * 255.).astype(np.uint8))
            mask_ids.append(0)


    masks = np.stack(masks, axis=0)
    masks = torch.from_numpy(masks).to(torch.uint8)


    if mode == "Caption":
        query = '<video>\nPlease describe the <region> in the video in detail.'
    else:
        if len(masks)==1:
            prefix = "<video>\nThere is 1 object in the video: <object0> <region>. "
        else:
            prefix = f"<video>\nThere is {len(masks)} objects in the video: "
            for i in range(len(masks)):
                prefix += f"<object{i}><region>, "
            prefix = prefix[:-2]+'. '
        query = prefix + query

    # Initialize empty text
    # text = description_generator
    annotated_frame['layers'] = []
    annotated_frame['composite'] = annotated_frame['background']

    if mode=="Caption":
        mask_list_video = []
        mask_image = Image.fromarray((mask_np[:,:,np.newaxis] * np.array(annotated_frame['image'])).astype(np.uint8))
        mask_list_video.append((mask_image, f"<object{len(mask_list_video)}>"))
    text = ""
    yield frame_img, text, mask_list_video

    for token in get_model_output(
        video_tensor,
        query,
        model=model,
        tokenizer=tokenizer,
        masks=masks,
        mask_ids=mask_ids,
        modal='video',
        streaming=True,
    ):
        text += token
        yield gr.update(), text, gr.update()



def apply_sam(image, input_points):
    inputs = sam_processor(image, input_points=input_points, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = sam_model(**inputs)

    masks = sam_processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())[0][0]
    scores = outputs.iou_scores[0, 0]

    mask_selection_index = scores.argmax()

    mask_np = masks[mask_selection_index].numpy()

    return mask_np

def clear_masks():
    global mask_list
    global mask_raw_list
    mask_list = []
    mask_raw_list = []
    return []


def clear_masks_video():
    global mask_list_video
    global mask_raw_list_video
    mask_list_video = []
    mask_raw_list_video = []
    return []


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="VideoRefer gradio demo")
    parser.add_argument("--model-path", type=str, default="DAMO-NLP-SG/VideoRefer-VideoLLaMA3-7B", help="Path to the model checkpoint")
    parser.add_argument("--prompt-mode", type=str, default="focal_prompt", help="Prompt mode")
    parser.add_argument("--conv-mode", type=str, default="v1", help="Conversation mode")
    parser.add_argument("--temperature", type=float, default=0.2, help="Sampling temperature")
    parser.add_argument("--top_p", type=float, default=0.5, help="Top-p for sampling")

    args_cli = parser.parse_args()
    print(args_cli.model_path)

    with gr.Blocks(theme=gr.themes.Soft(primary_hue="amber")) as demo:

        HEADER = ("""
        <div>
        <h1>VideoRefer X VideoLLaMA3 Demo</h1>
        <h5 style="margin: 0;">Feel free to click on anything that grabs your interest!</h5>
        <h5 style="margin: 0;">If this demo pleases you, please give us a star ⭐ on Github or 💖 on this space.</h5>
        </div>
        </div>
        <div style="display: flex; justify-content: left; margin-top: 10px;">
        <a href="https://arxiv.org/pdf/2501.00599"><img src="https://img.shields.io/badge/Arxiv-2501.00599-ECA8A7" style="margin-right: 5px;"></a>
        <a href="https://github.com/DAMO-NLP-SG/VideoRefer"><img src='https://img.shields.io/badge/Github-VideoRefer-F7C97E' style="margin-right: 5px;"></a>
        <a href="https://github.com/DAMO-NLP-SG/VideoLLaMA3"><img src='https://img.shields.io/badge/Github-VideoLLaMA3-9DC3E6' style="margin-right: 5px;"></a>
        </div>
        """)

        with gr.Row():
            with gr.Column():
                gr.HTML(HEADER)


        image_tips = """
### 💡 Tips:

🧸 Upload an image, and you can use the drawing tool✍️ to highlight the areas you're interested in.

🔖 For single-object caption mode, simply select the area and click the 'Generate Caption' button to receive a caption for the object.

🔔 In QA mode, you can generate multiple masks by clicking the 'Generate Mask' button multiple times. Afterward, use the corresponding object id to ask questions.

📌 Click the button 'Clear Masks' to clear the current generated masks.

"""

        video_tips = """
### 💡 Tips:
⚠️ For video mode, we only support masking on the first frame in this demo.

🧸 Upload a video, and you can use the drawing tool✍️ to highlight the areas you're interested in on the first frame.

🔖 For single-object caption mode, simply select the area and click the 'Generate Caption' button to receive a caption for the object.

🔔 In QA mode, you can generate multiple masks by clicking the 'Generate Mask' button multiple times. Afterward, use the corresponding object id to ask questions.

📌 Click the button 'Clear Masks' to clear the current generated masks.

"""


        with gr.TabItem("Image"):
            with gr.Row():
                with gr.Column():
                    image_input = gr.ImageEditor(
                        label="Image",
                        type="pil",
                        sources=['upload'],
                        brush=gr.Brush(colors=["#ED7D31"], color_mode="fixed", default_size=10),
                        eraser=True,
                        layers=False,
                        transforms=[],
                        height=300,
                    )
                    generate_mask_btn = gr.Button("1️⃣ Generate Mask", visible=False, variant="primary")
                    mode = gr.Radio(label="Mode", choices=["Caption", "QA"], value="Caption")
                    query = gr.Textbox(label="Question", value="What is the relationship between <region0> and <region1>?", interactive=True, visible=False)

                    submit_btn = gr.Button("Generate Caption", variant="primary")
                    submit_btn1 = gr.Button("2️⃣ Generate Answer", variant="primary", visible=False)
                    gr.Examples([f"./demo/images/{i+1}.jpg" for i in range(8)], inputs=image_input, label="Examples")

                with gr.Column():
                    mask_output = gr.Gallery(label="Referred Masks", object_fit='scale-down', visible=False)
                    output_image = gr.Image(label="Image with Mask", visible=True, height=400)
                    description = gr.Textbox(label="Output", visible=True)

                    clear_masks_btn = gr.Button("Clear Masks", variant="secondary", visible=False)
                    gr.Markdown(image_tips)

        with gr.TabItem("Video"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="Video")
                    # load_btn = gr.Button("🖼️ Load First Frame", variant="secondary")
                    first_frame = gr.ImageEditor(
                        label="Annotate First Frame",
                        type="pil",
                        sources=['upload'],
                        brush=gr.Brush(colors=["#ED7D31"], color_mode="fixed", default_size=10),
                        eraser=True,
                        layers=False,
                        transforms=[],
                        height=300,
                    )
                    generate_mask_btn_video = gr.Button("1️⃣ Generate Mask", visible=False, variant="primary")
                    gr.Examples([f"./demo/videos/{i+1}.mp4" for i in range(4)], inputs=video_input, label="Examples")

                with gr.Column():
                    mode_video = gr.Radio(label="Mode", choices=["Caption", "QA"], value="Caption")
                    mask_output_video = gr.Gallery(label="Referred Masks", object_fit='scale-down')

                    query_video = gr.Textbox(label="Question", value="What is the relationship between <object0> and <object1>?", interactive=True, visible=False)

                    submit_btn_video = gr.Button("Generate Caption", variant="primary")
                    submit_btn_video1 = gr.Button("2️⃣ Generate Answer", variant="primary", visible=False)
                    description_video = gr.Textbox(label="Output", visible=True)

                    clear_masks_btn_video = gr.Button("Clear Masks", variant="secondary")

                    gr.Markdown(video_tips)


        def toggle_query_and_generate_button(mode):
            query_visible = mode == "QA"
            caption_visible = mode == "Caption"
            global mask_list
            global mask_raw_list
            mask_list = []
            mask_raw_list = []
            return gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=caption_visible), gr.update(visible=caption_visible), [], ""

        video_input.change(load_first_frame, inputs=video_input, outputs=first_frame)

        mode.change(toggle_query_and_generate_button, inputs=mode, outputs=[query, generate_mask_btn, clear_masks_btn, submit_btn1, mask_output, output_image, submit_btn, mask_output, description])

        def toggle_query_and_generate_button_video(mode):
            query_visible = mode == "QA"
            caption_visible = mode == "Caption"
            global mask_list_video
            global mask_raw_list_video
            mask_list_video = []
            mask_raw_list_video = []
            return gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=query_visible), gr.update(visible=caption_visible), []


        mode_video.change(toggle_query_and_generate_button_video, inputs=mode_video, outputs=[query_video, generate_mask_btn_video, submit_btn_video1, submit_btn_video, mask_output_video])

        submit_btn.click(
            fn=describe,
            inputs=[image_input, mode, query],
            outputs=[output_image, description, image_input],
            api_name="describe"
        )

        submit_btn1.click(
            fn=describe,
            inputs=[image_input, mode, query],
            outputs=[output_image, description, image_input],
            api_name="describe"
        )

        generate_mask_btn.click(
            fn=generate_masks,
            inputs=[image_input],
            outputs=[mask_output, image_input]
        )

        generate_mask_btn_video.click(
            fn=generate_masks_video,
            inputs=[first_frame],
            outputs=[mask_output_video, first_frame]
        )

        clear_masks_btn.click(
            fn=clear_masks,
            outputs=[mask_output]
        )

        clear_masks_btn_video.click(
            fn=clear_masks_video,
            outputs=[mask_output_video]
        )

        submit_btn_video.click(
            fn=describe_video,
            inputs=[video_input, mode_video, query_video, first_frame],
            outputs=[first_frame, description_video, mask_output_video],
            api_name="describe_video"
        )

        submit_btn_video1.click(
            fn=describe_video,
            inputs=[video_input, mode_video, query_video, first_frame],
            outputs=[first_frame, description_video, mask_output_video],
            api_name="describe_video"
        )


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sam_model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
    sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")

    disable_torch_init()

    model, processor, tokenizer = model_init(args_cli.model_path)

    demo.launch()
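The scribble-to-mask step is the core of app.py: apply_sam sends the user-drawn points to facebook/sam-vit-huge through the transformers SamProcessor/SamModel pair and keeps the candidate mask with the highest predicted IoU. Below is a minimal standalone sketch of that same call path, using only calls that already appear in app.py; the image path and the two point coordinates are illustrative placeholders, not values from this PR.

import torch
from PIL import Image
from transformers import SamModel, SamProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sam_model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")

# Hypothetical test image and hand-picked (x, y) foreground points; app.py builds
# the same nested structure via coords = [points.tolist()].
image = Image.open("demo/images/1.jpg").convert("RGB")
input_points = [[[320, 240], [340, 260]]]

inputs = sam_processor(image, input_points=input_points, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = sam_model(**inputs)

# Resize the low-resolution mask logits back to the original image size,
# then keep the candidate with the highest predicted IoU, as apply_sam does.
masks = sam_processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(),
    inputs["original_sizes"].cpu(),
    inputs["reshaped_input_sizes"].cpu(),
)[0][0]
best = outputs.iou_scores[0, 0].argmax()
mask_np = masks[best].numpy()  # H x W boolean mask for the referred region
print(mask_np.shape, mask_np.sum())

Taking the argmax over iou_scores mirrors the demo's choice of keeping SAM's single most confident candidate rather than merging the three proposals.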
demo/.DS_Store
ADDED
Binary file (6.15 kB).
demo/images/1.jpg
ADDED
Git LFS Details
demo/images/2.jpg
ADDED
Git LFS Details
demo/images/3.jpg
ADDED
Git LFS Details
demo/images/4.jpg
ADDED
Git LFS Details
demo/images/5.jpg
ADDED
Git LFS Details
demo/images/6.jpg
ADDED
Git LFS Details
demo/images/7.jpg
ADDED
Git LFS Details
demo/images/8.jpg
ADDED
Git LFS Details
demo/images/LICENSE
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ac4c813c90895cdc79c71fdbd02715fd0c5505c24d95c5941747c904d6e93bc
size 149
demo/videos/1.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad78d268f6f1ad9a457a7768665157f74c20292136cefbf6bfc2a07de940dd0a
size 804232
demo/videos/2.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5eebbd330be490709c1b39cd1d82ae074f3fe275487bc6b77d2aa5cd74d40d05
size 1255466
demo/videos/3.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:946550c741c9dc515340ab93b203614094632191db0d8f9697bd580f4a271947
size 8743247
demo/videos/4.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1b06b309812947b909ce7b8eaaea94a9ca60a8452a33e3109f5f6ffb1dbf8ee6
size 1334796
requirements.txt
ADDED
@@ -0,0 +1,48 @@
--extra-index-url https://download.pytorch.org/whl/cu124
--extra-index-url https://download.pytorch.org/whl/cu121
--extra-index-url https://download.pytorch.org/whl/cu118
# basic dependencies
torch==2.4.0
torchvision==0.19.0
datasets==2.21.0
transformers==4.46.3
tokenizers==0.20.3
deepspeed==0.15.4
accelerate==1.0.1
peft==0.4.0
timm==1.0.3
numpy==1.24.4
# data processing
decord==0.6.0
imageio==2.34.0
imageio-ffmpeg==0.4.9
moviepy==1.0.3
scenedetect==0.6.3
opencv-python==4.6.0.66
pyarrow
pysubs2
ffmpeg-python
# misc
scikit-learn==1.2.2
huggingface_hub==0.23.4
sentencepiece==0.1.99
shortuuid
einops==0.6.1
einops-exts==0.0.4
bitsandbytes==0.43.3 # for cuda 124
pydantic>=2.0
markdown2[all]
gradio==5.34.0
gradio_client==1.10.3
httpx==0.24.1
requests
openai
uvicorn
fastapi
tensorboard
wandb
tabulate
Levenshtein
pycocotools==2.0.8
spaces
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
videollama3/.DS_Store
ADDED
Binary file (6.15 kB).
videollama3/__init__.py
ADDED
@@ -0,0 +1,239 @@
import os
import copy
import math
import warnings
import shutil
from functools import partial

import torch

from .model import load_pretrained_model
from .model.processor import Videollama3Processor
from .mm_utils import load_images, process_images, load_video, process_video, tokenizer_multimodal_token, get_model_name_from_path, KeywordsStoppingCriteria, resize_image_mask
from .constants import NUM_FRAMES, DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN, MODAL_INDEX_MAP, STREAM_START_TOKEN, STREAM_END_TOKEN
from videollama3.constants import REGION_TOKEN
from transformers import TextIteratorStreamer
from threading import Thread

def disable_torch_init():
    """
    Disable the redundant torch default initialization to accelerate model creation.
    """
    import torch
    setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
    setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)


def model_init(model_path=None, **kwargs):
    model_path = "DAMO-NLP-SG/VideoLLaMA2-7B" if model_path is None else model_path
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, **kwargs)

    if tokenizer.pad_token is None and tokenizer.unk_token is not None:
        tokenizer.pad_token = tokenizer.unk_token

    aspect_ratio = model.config.image_aspect_ratio if hasattr(model.config, "image_aspect_ratio") else "pad"
    image_size = model.config.image_size if hasattr(model.config, "image_size") else 384
    # NOTE: If num_frames is None, the frame sampling mode is "fps". If num_frames is not None, the frame sampling mode is "uniform".
    # num_frames = model.config.num_frames
    model.config.region_token_index = tokenizer.convert_tokens_to_ids(REGION_TOKEN)
    processor = {
        'image': load_images,
        'video': load_video,
        'text': None
    }

    return model, processor, tokenizer


def get_model_output(images_or_videos, instruct, model, tokenizer, modal='video', **kwargs):
    streaming = kwargs.pop('streaming', False)
    if streaming:
        return mm_infer(images_or_videos, instruct, model, tokenizer, modal, streaming=True, **kwargs)
    else:
        output = mm_infer(images_or_videos, instruct, model, tokenizer, modal, streaming=False, **kwargs)
        return next(output)


def mm_infer(images_or_videos, instruct, model, tokenizer, modal='video', **kwargs):
    """inference api of VideoLLaMA2 for video understanding.

    Args:
        model: VideoLLaMA2 model.
        images_or_videos (torch.Tensor): image tensor (1, C, H, W) / video tensor (T, C, H, W).
        instruct (str): text instruction for understanding video.
        tokenizer: tokenizer.
        do_sample (bool): whether to sample.
        modal (str): inference modality.
    Returns:
        str: response of the model.
    """
    mask_ids = kwargs.pop('mask_ids', None)
    masks = kwargs.pop('masks', None)
    streaming = kwargs.pop('streaming', False)
    if modal == 'image':
        modal_token = DEFAULT_IMAGE_TOKEN
        images = images_or_videos
        additional_frames = images.copy()
        timestamps = None
    elif modal == 'video':
        modal_token = DEFAULT_VIDEO_TOKEN
        images, timestamps, additional_frames = images_or_videos
    elif modal == 'text':
        modal_token = ''
    else:
        raise ValueError(f"Unsupported modal: {modal}")

    vlprocessor = Videollama3Processor(model.get_vision_encoder().image_processor, tokenizer)
    vlprocessor.tokenizer.add_tokens([DEFAULT_IMAGE_TOKEN, STREAM_START_TOKEN, STREAM_END_TOKEN], special_tokens=True)

    model.config.image_token_index = vlprocessor.tokenizer.convert_tokens_to_ids(DEFAULT_IMAGE_TOKEN)

    if masks is not None:
        additional_frames, masks, mask_nums = resize_image_mask(additional_frames, masks, mask_ids)

        for idx in range(len(mask_nums)):
            instruct = instruct.replace('<region>', "["+REGION_TOKEN*mask_nums[idx]+"]", 1)


        additional_images_dict = vlprocessor._process_image(additional_frames, image_downsampling=1)
        additional_images = additional_images_dict['images']
        # import pdb
        # pdb.set_trace()


        # flatten_patches1 = additional_images[0].reshape(26, 46, 3, -1)
        # from matplotlib import pyplot as plt
        # plt.imshow(flatten_patches1[:,:,:,0])
        # plt.savefig('16.png')

        additional_images_thws = additional_images_dict['grid_thws']
        additional_images = (additional_images, additional_images_thws)

    else:
        additional_images = None


    # 1. text preprocess (tag process & generate prompt).
    if isinstance(instruct, str):
        messages = [{'role': 'user', 'content': instruct}]
    elif isinstance(instruct, list):
        messages = copy.deepcopy(instruct)
    else:
        raise ValueError(f"Unsupported type of instruct: {type(instruct)}")

    if all(not modal_token in message["content"] for message in messages):
        warnings.warn(f"Image tag not found in the conversation, add it automatically at the beginning!")
        messages[0]["content"] = modal_token + messages[0]["content"]

    converted_messages = []
    for message in messages:
        chunks = message["content"].split(modal_token)
        converted_messages.append({
            "role": "user",
            "content": []
        })

        for chunk_idx in range(1, 2 * len(chunks)):
            if chunk_idx % 2 == 1:
                chunk = chunks[chunk_idx // 2].strip()
                converted_messages[-1]["content"].append({"type": "text", "text": chunk}) if chunk else None
            else:
                if modal == 'image':
                    converted_messages[-1]["content"].append({"type": "image"})
                elif modal == 'video':
                    converted_messages[-1]["content"].append({"type": "video", "num_frames": len(images), "time": timestamps})

    messages = converted_messages

    # 2. vision preprocess (load & transform image or video).
    if model.config.model_type in ['videollama3_mistral', 'videollama3_mixtral']:
        system_message = [
            {'role': 'system', 'content': (
                """<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature."""
                """\n"""
                """If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>""")
            }
        ]
    else:
        system_message = []

    image_downsampling = kwargs.get('image_downsampling', model.config.spatial_merge_size)
    # TODO: attention mask?
    messages = system_message + messages
    data_dict = vlprocessor(
        images=images,
        text=messages,
        image_downsampling=image_downsampling,
        return_tensors="pt",
    )

    torch_dtype = model.config.torch_dtype if hasattr(model.config, "torch_dtype") else torch.float16

    images = [x.to(torch_dtype).cuda(non_blocking=True) for x in data_dict["images"]]
    grid_thws = [x.cuda(non_blocking=True) for x in data_dict["grid_thws"]]

    # 3. generate response according to visual signals and prompts.
    keywords = [tokenizer.eos_token]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, data_dict["input_ids"])
    stop_str = tokenizer.eos_token

    do_sample = kwargs.get('do_sample', False)
    temperature = kwargs.get('temperature', 0.2 if do_sample else 0.0)
    top_p = kwargs.get('top_p', 0.9)
    max_new_tokens = kwargs.get('max_new_tokens', 2048)
    if not streaming:
        with torch.inference_mode():
            output_ids = model.generate(
                # input_ids,
                # attention_mask=attention_masks,
                # images=images,
                data_dict["input_ids"].cuda(),
                attention_mask=data_dict["attention_mask"].cuda(),
                images=[(modal, images, grid_thws)],
                do_sample=do_sample,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                top_p=top_p,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                pad_token_id=tokenizer.eos_token_id,
                additional_images=[additional_images],
                masks=[masks],
            )

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

        yield outputs

    else:
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            inputs=data_dict["input_ids"].cuda(),
            attention_mask=data_dict["attention_mask"].cuda(),
            images=[(modal, images, grid_thws)],
            do_sample=do_sample,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
            pad_token_id=tokenizer.eos_token_id,
            additional_images=[additional_images],
            masks=[masks],
            streamer=streamer
        )

        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        generated_text = ""
        for new_text in streamer:
            generated_text += new_text
            if stop_str in generated_text:
                generated_text = generated_text[:generated_text.find(stop_str)]
                break
            yield new_text

        thread.join()
videollama3/constants.py
ADDED
@@ -0,0 +1,46 @@
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100

# Image arguments
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

# Video arguments
VIDEO_TOKEN_INDEX = -201
DEFAULT_VIDEO_TOKEN = "<video>"
NUM_FRAMES = 128
MAX_FRAMES = 768
NUM_FRAMES_PER_SECOND = 1

# Region arguments
REGION_TOKEN = "<REGION>"

# Audio arguments
AUDIO_TOKEN_INDEX = -202
DEFAULT_AUDIO_TOKEN = "<audio>"

# Stream arguments
STREAM_START_TOKEN = "<|stream_start|>"
STREAM_END_TOKEN = "<|stream_end|>"
STREAM_IMAGE_TOKEN = "<stream_image>"
STREAM_FPS = 2
STREAM_IMAGE_SIZE = 224
STREAM_DOWNSAMPLING = 4
STREAM_MAX_FRAMES = 400

MODAL_INDEX_MAP = {
    "<image>": -200,
    "<video>": -201,
    "<audio>": -202,
}

subimage_token_num=196
videollama3/infer.py
ADDED
@@ -0,0 +1,82 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import os
import torch
import sys
sys.path.append('./')
from videollama3 import disable_torch_init, model_init, mm_infer, get_model_output
from videollama3.mm_utils import load_video

import numpy as np
from PIL import Image

def infer_image(model, tokenizer):
    image_path = 'demo/images/1.jpg'
    image = Image.open(image_path)
    image_data = np.array(image)

    question = '<image>\nPlease describe the <region> in the image in detail.'

    mask = np.load('demo/masks/demo0.npy')
    masks = []
    masks.append(mask)
    masks = np.array(masks)
    masks = torch.from_numpy(masks).to(torch.uint8)

    mask_ids = [0]*len(masks)

    output = get_model_output(
        [image_data],
        question,
        model=model,
        tokenizer=tokenizer,
        masks=masks,
        mask_ids=mask_ids,
        modal='image',
        image_downsampling=1,
    )
    print(output)

def infer_video(model, tokenizer):
    video_path = 'demo/videos/1.mp4'
    question = '<video>\nPlease describe the <region> in the video in detail.'

    frame_idx = 0 # mask from the first frame
    video_tensor = load_video(video_path, fps=1, max_frames=768, frame_ids=[frame_idx])

    mask = np.load('demo/masks/demo1.npy')
    masks = []
    masks.append(mask)
    masks = np.array(masks)
    masks = torch.from_numpy(masks).to(torch.uint8)

    mask_ids = [0]*len(masks)

    output = get_model_output(
        video_tensor,
        question,
        model=model,
        tokenizer=tokenizer,
        masks=masks,
        mask_ids=mask_ids,
        modal='video',
    )
    print(output)

def main():
    disable_torch_init()

    # fill in the model path here
    model_path = '/mnt/workspace/workgroup/yuanyq/code/videollama3/ProjectX_region/work_dirs/VideoRefer-VideoLLaMA3-7B'
    model, processor, tokenizer = model_init(model_path)

    # image
    infer_image(model, tokenizer)

    # video
    infer_video(model, tokenizer)


if __name__=='__main__':
    main()
videollama3/mm_utils.py
ADDED
@@ -0,0 +1,704 @@
import ast
import os
import re
import math
import base64
import traceback
from io import BytesIO
from typing import Optional

import torch
import torchvision.transforms.functional as VF
import torch.nn.functional as F
import numpy as np
from transformers import StoppingCriteria

import cv2
import imageio
import ffmpeg
from PIL import Image
from decord import VideoReader, cpu

from .constants import NUM_FRAMES, MAX_FRAMES, NUM_FRAMES_PER_SECOND, MODAL_INDEX_MAP, DEFAULT_IMAGE_TOKEN
from pycocotools import mask as maskUtils

def resize_image_mask(images, masks, mask_ids, patch_size=14):
    resize_images = []
    resize_masks = []
    mask_nums = []
    for i, mask in enumerate(masks):
        image = images[mask_ids[i]]
        h, w = image.shape[:2]
        if mask.sum()==0:
            print('mask is none...')
            mask = torch.ones((h, w))
        rows, cols = np.where(mask == 1)

        min_row, max_row = rows.min(), rows.max()
        min_col, max_col = cols.min(), cols.max()

        bbox = (max(0,min_row-patch_size*2), max(0,min_col-patch_size*2), min(h-1, max_row+patch_size*2), min(w-1, max_col+patch_size*2))
        mask_h = bbox[2] - bbox[0]
        mask_w = bbox[3] - bbox[1]
        cropping_img = image[bbox[0]: bbox[2], bbox[1]: bbox[3], :]
        cropping_mask = mask[bbox[0]: bbox[2], bbox[1]: bbox[3]]

        scale_rate = math.ceil(math.sqrt(1960/mask.sum()))
        if scale_rate==1:
            if (mask.sum()/196)>100:
                scale_rate = math.sqrt((mask.sum()/196)/100)
                scale_rate = 1/scale_rate
        resize_h = math.ceil((mask_h*scale_rate)/patch_size) * patch_size
        resize_w = math.ceil((mask_w*scale_rate)/patch_size) * patch_size

        resize_img = cv2.resize(cropping_img, (resize_w, resize_h))
        resize_mask = F.interpolate(cropping_mask[None, None], size=(resize_h//patch_size, resize_w//patch_size), mode='bilinear', align_corners=False)[0,0]
        mask_nums.append(min(10, int(resize_mask.sum())))

        resize_images.append(resize_img)
        resize_masks.append(resize_mask)

    return resize_images, resize_masks, mask_nums

def reshape_images_to_raw_grid(mm_features_raw, grid_thws):
    start_idx=0
    reshaped_features = []
    for thw_group in grid_thws:
        for tensor_thw in thw_group:
            _, H, W = tensor_thw.squeeze().tolist()
            num_elements = H * W

            split_tensor = mm_features_raw[start_idx:start_idx + num_elements].view(H, W, -1)
            reshaped_features.append(split_tensor)

            start_idx += num_elements
    assert len(mm_features_raw)==start_idx
    return reshaped_features

def annToMask(mask_ann, h=None, w=None):
    if isinstance(mask_ann, list):
        rles = maskUtils.frPyObjects(mask_ann, h, w)
        rle = maskUtils.merge(rles)
    elif isinstance(mask_ann['counts'], list):
        # uncompressed RLE
        rle = maskUtils.frPyObjects(mask_ann, h, w)
    else:
        # rle
        rle = mask_ann
    mask = maskUtils.decode(rle)
    return mask

def chunk_list(input_list, chunk_size):
    return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]


def load_image_from_base64(image):
    return Image.open(BytesIO(base64.b64decode(image)))


def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result


def grid_divide(image, cell_size):
    """
    Divides an image into grid of a specified size.

    Args:
        image (PIL.Image.Image): The input image.
        cell_size (int): The size of each cell.

    Returns:
        list: A list of PIL.Image.Image objects representing the patches.
    """
    grid = []
    width, height = image.size
    for i in range(0, height, cell_size):
        row = []
        for j in range(0, width, cell_size):
            box = (j, i, j + cell_size, i + cell_size)
            row.append(image.crop(box))
        grid.append(row)

    return grid


def load_images(image_path):
    if isinstance(image_path, str) and os.path.isfile(image_path):
        images = [cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)]
        # images = [Image.open(image_path).convert('RGB')]
    elif isinstance(image_path, str) and os.path.isdir(image_path):
        images = [cv2.cvtColor(cv2.imread(os.path.join(image_path, f)), cv2.COLOR_BGR2RGB) for f in sorted(os.listdir(image_path))]
        # images = [Image.open(os.path.join(image_path, f)).convert('RGB') for f in sorted(os.listdir(image_path))]
    elif isinstance(image_path, list) and isinstance(image_path[0], str):
        images = [cv2.cvtColor(cv2.imread(f), cv2.COLOR_BGR2RGB) for f in image_path]
        # images = [Image.open(f).convert('RGB') for f in image_path]
    elif isinstance(image_path, list) and isinstance(image_path[0], Image.Image):
        images = image_path
    elif isinstance(image_path, Image.Image):
        images = [image_path]
    else:
        print('image_path: ', image_path)
        raise ValueError(f"Unsupported image path type: {image_path}")

    return images


def process_pad_image(image, padding_value=(0, 0, 0)):
    image = expand2square(image, padding_value)

    return [image]


def find_closest_aspect_ratio(src_ratio, tgt_ratios, ori_size, tgt_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = ori_size[0] * ori_size[1]
    for ratio in tgt_ratios:
        tgt_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(src_ratio - tgt_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * tgt_size[0] * tgt_size[1] * ratio[0] * ratio[1]:
                best_ratio = ratio

    return best_ratio


def process_dynamic_image(image, image_size=384, use_thumbnail=True):
    # Grid Params:
    min_num = 1
    max_num = 12

    if isinstance(image_size, int):
        image_size = (image_size, image_size)

    ori_size = image.size
    aspect_ratio = ori_size[0] / ori_size[1]

    # calculate the existing image aspect ratio
    tgt_ratios = []
    for n in range(min_num, max_num + 1):
        tgt_ratios.extend([(i, j) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num])
    tgt_ratios = set(tgt_ratios)
    tgt_ratios = sorted(tgt_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    tgt_ratio = find_closest_aspect_ratio(aspect_ratio, tgt_ratios, ori_size, image_size)

    # resize the image to the target size
    tgt_width = image_size[0] * tgt_ratio[0]
    tgt_height = image_size[1] * tgt_ratio[1]
    resized_img = image.resize((tgt_width, tgt_height))

    # NOTE: internvl2 style split the image into one column grids
    # num_grids = tgt_ratio[0] * tgt_ratio[1]
    # grid_images = []
    # for i in range(num_grids):
    #     box = (
    #         (i % tgt_ratio[0]) * image_size[0],
    #         (i // tgt_ratio[0]) * image_size[1],
    #         (i % tgt_ratio[0] + 1) * image_size[0],
    #         (i // tgt_ratio[0] + 1) * image_size[1],
    #     )
    #     # crop out the grid image
    #     grid_images.append(resized_img.crop(box))
    # assert len(grid_images) == num_grids
    # grid_images = [grid_images]

    # NOTE: eager implementation
    # num_grids = tgt_ratio[0] * tgt_ratio[1]
    # sub_grid_images = []
    # tmp_grid_images = []
    # for i in range(num_grids):
    #     box = (
    #         (i % tgt_ratio[0]) * image_size[0],
    #         (i // tgt_ratio[0]) * image_size[1],
    #         (i % tgt_ratio[0] + 1) * image_size[0],
    #         (i // tgt_ratio[0] + 1) * image_size[1],
|
231 |
+
# )
|
232 |
+
# tmp_grid_images.append(resized_img.crop(box))
|
233 |
+
|
234 |
+
# if (i + 1) % tgt_ratio[0] == 0:
|
235 |
+
# sub_grid_images.append(tmp_grid_images)
|
236 |
+
# tmp_grid_images = []
|
237 |
+
|
238 |
+
image_grid = grid_divide(resized_img, image_size[0])
|
239 |
+
|
240 |
+
if use_thumbnail:
|
241 |
+
thumbnail_img = image.resize((image_size[0], image_size[1]))
|
242 |
+
image_grid = [[thumbnail_img]] + image_grid
|
243 |
+
|
244 |
+
return image_grid
|
245 |
+
|
246 |
+
|
247 |
+
def process_highres_image(image, image_size=384, use_thumbnail=True, padding_value=(0, 0, 0)):
|
248 |
+
# Grid Params:
|
249 |
+
grid_width = [1, 2, 3]
|
250 |
+
grid_width_real = [x * image_size for x in grid_width]
|
251 |
+
|
252 |
+
longest_side = max(image.size)
|
253 |
+
fit_grid_width_real = [x for x in grid_width_real if x >= longest_side]
|
254 |
+
if len(fit_grid_width_real) == 0:
|
255 |
+
select_size = max(grid_width_real)
|
256 |
+
else:
|
257 |
+
select_size = min(fit_grid_width_real)
|
258 |
+
|
259 |
+
image_padded = expand2square(image, padding_value)
|
260 |
+
image_padded = image_padded.resize((select_size, select_size))
|
261 |
+
image_grid = grid_divide(image_padded, image_size)
|
262 |
+
|
263 |
+
if use_thumbnail:
|
264 |
+
thumbnail_img = image.resize((image_size, image_size))
|
265 |
+
image_grid = [[thumbnail_img]] + image_grid
|
266 |
+
|
267 |
+
return image_grid
|
268 |
+
|
269 |
+
|
270 |
+
def select_best_resolution(original_size, possible_resolutions):
|
271 |
+
"""
|
272 |
+
Selects the best resolution from a list of possible resolutions based on the original size.
|
273 |
+
|
274 |
+
Args:
|
275 |
+
original_size (tuple): The original size of the image in the format (width, height).
|
276 |
+
possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
|
277 |
+
|
278 |
+
Returns:
|
279 |
+
tuple: The best fit resolution in the format (width, height).
|
280 |
+
"""
|
281 |
+
original_width, original_height = original_size
|
282 |
+
best_fit = None
|
283 |
+
max_effective_resolution = 0
|
284 |
+
min_wasted_resolution = float('inf')
|
285 |
+
|
286 |
+
for width, height in possible_resolutions:
|
287 |
+
scale = min(width / original_width, height / original_height)
|
288 |
+
downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
|
289 |
+
effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
|
290 |
+
wasted_resolution = (width * height) - effective_resolution
|
291 |
+
|
292 |
+
if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
|
293 |
+
max_effective_resolution = effective_resolution
|
294 |
+
min_wasted_resolution = wasted_resolution
|
295 |
+
best_fit = (width, height)
|
296 |
+
|
297 |
+
return best_fit
|
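# Illustrative usage (hedged sketch): for a 1000x500 input and 384-pixel grid cells,
# the (768, 384) canvas wins because the image can be downscaled by 0.768 on both axes
# (effective resolution 768*384) with no wasted canvas area, e.g.
#   >>> select_best_resolution((1000, 500), [(384, 384), (768, 384), (384, 768)])
#   (768, 384)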
298 |
+
|
299 |
+
|
300 |
+
def process_anyres_image(image, image_size=384, use_thumbnail=True, padding_value=(0, 0, 0)):
|
301 |
+
"""
|
302 |
+
Process an image with variable resolutions.
|
303 |
+
|
304 |
+
Args:
|
305 |
+
image (PIL.Image.Image): The input image to be processed.
|
306 |
+
image_size (int, optional): Size of each grid cell in pixels. Defaults to 384.
|
307 |
+
|
308 |
+
Returns:
|
309 |
+
list: A nested list of PIL.Image.Image patches, with the thumbnail (if enabled) as the first row.
|
310 |
+
"""
|
311 |
+
# Grid Params:
|
312 |
+
possible_grids = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3)]
|
313 |
+
possible_resolutions = [(x * image_size, y * image_size) for x, y in possible_grids]
|
314 |
+
|
315 |
+
best_resolution = select_best_resolution(image.size, possible_resolutions)
|
316 |
+
|
317 |
+
# resize and padding image
|
318 |
+
nw, nh = best_resolution
|
319 |
+
ow, oh = image.size
|
320 |
+
|
321 |
+
scale_factor = min(nw / ow, nh / oh)
|
322 |
+
new_size = (int(ow * scale_factor), int(oh * scale_factor))
|
323 |
+
|
324 |
+
image_padded = Image.new("RGB", (nw, nh), padding_value)
|
325 |
+
image_padded.paste(image.resize(new_size), ((nw - new_size[0]) // 2, (nh - new_size[1]) // 2))
|
326 |
+
|
327 |
+
image_grid = grid_divide(image_padded, image_size)
|
328 |
+
|
329 |
+
if use_thumbnail:
|
330 |
+
thumbnail_img = image.resize((image_size, image_size))
|
331 |
+
image_grid = [[thumbnail_img]] + image_grid
|
332 |
+
|
333 |
+
return image_grid
|
334 |
+
|
335 |
+
|
336 |
+
def process_adares_image(image, image_size=384, use_thumbnail=True):
|
337 |
+
# Grid Params:
|
338 |
+
min_num = 1
|
339 |
+
max_num = 12
|
340 |
+
|
341 |
+
if isinstance(image_size, int):
|
342 |
+
image_size = (image_size, image_size)
|
343 |
+
|
344 |
+
ori_size = image.size
|
345 |
+
aspect_ratio = ori_size[0] / ori_size[1]
|
346 |
+
|
347 |
+
# calculate the existing image aspect ratio
|
348 |
+
tgt_ratios = []
|
349 |
+
for n in range(min_num, max_num + 1):
|
350 |
+
tgt_ratios.extend([(i, j) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num])
|
351 |
+
tgt_ratios = set(tgt_ratios)
|
352 |
+
possible_resolutions = [(x * image_size[0], y * image_size[1]) for x, y in tgt_ratios]
|
353 |
+
|
354 |
+
# find the most possible resolution
|
355 |
+
best_resolution = select_best_resolution(ori_size, possible_resolutions)
|
356 |
+
|
357 |
+
# resize the image to the target size
|
358 |
+
resized_img = image.resize((best_resolution[0], best_resolution[1]))
|
359 |
+
|
360 |
+
image_grid = grid_divide(resized_img, image_size[0])
|
361 |
+
|
362 |
+
if use_thumbnail:
|
363 |
+
thumbnail_img = image.resize((image_size[0], image_size[1]))
|
364 |
+
image_grid = [[thumbnail_img]] + image_grid
|
365 |
+
|
366 |
+
return image_grid
|
367 |
+
|
368 |
+
|
369 |
+
def process_images(image_path, processor, aspect_ratio='pad', image_size=384, use_thumbnail=True):
|
370 |
+
images = load_images(image_path)
|
371 |
+
|
372 |
+
padding_value = tuple(int(x*255) for x in processor.image_mean)
|
373 |
+
|
374 |
+
image_grids = []
|
375 |
+
for image in images:
|
376 |
+
if aspect_ratio == 'pad':
|
377 |
+
image_grid = process_pad_image(image, padding_value=padding_value)
|
378 |
+
elif aspect_ratio == 'dynamic':
|
379 |
+
image_grid = process_dynamic_image(image, image_size=image_size, use_thumbnail=use_thumbnail)
|
380 |
+
elif aspect_ratio == 'highres':
|
381 |
+
image_grid = process_highres_image(image, image_size=image_size, use_thumbnail=use_thumbnail, padding_value=padding_value)
|
382 |
+
elif aspect_ratio == 'anyres':
|
383 |
+
image_grid = process_anyres_image(image, image_size=image_size, use_thumbnail=use_thumbnail, padding_value=padding_value)
|
384 |
+
elif aspect_ratio == 'adares':
|
385 |
+
image_grid = process_adares_image(image, image_size=image_size, use_thumbnail=use_thumbnail)
|
386 |
+
else:
|
387 |
+
image_grid = [image]
|
388 |
+
|
389 |
+
image_grid = [processor.preprocess(image_row, return_tensors='pt', num_images=len(images)) for image_row in image_grid]
|
390 |
+
image_grids.append(image_grid)
|
391 |
+
|
392 |
+
return image_grids
|
393 |
+
|
394 |
+
|
395 |
+
def frame_sample(duration, mode='uniform', num_frames=None, vid_fps=None, fps=None):
|
396 |
+
if mode == 'uniform':
|
397 |
+
assert num_frames is not None, "Number of frames must be provided for uniform sampling."
|
398 |
+
if duration <= num_frames:
|
399 |
+
return np.arange(duration).astype(int)
|
400 |
+
# NOTE: v1 version
|
401 |
+
# Calculate the size of each segment from which a frame will be extracted
|
402 |
+
# if duration <= num_frames:
|
403 |
+
# return np.arange(duration).astype(int)
|
404 |
+
# seg_size = float(duration - 1) / num_frames
|
405 |
+
|
406 |
+
# frame_ids = []
|
407 |
+
# for i in range(num_frames):
|
408 |
+
# # Calculate the start and end indices of each segment
|
409 |
+
# start = seg_size * i
|
410 |
+
# end = seg_size * (i + 1)
|
411 |
+
# # Append the middle index of the segment to the list
|
412 |
+
# frame_ids.append((start + end) / 2)
|
413 |
+
|
414 |
+
# return np.round(np.array(frame_ids) + 1e-6).astype(int)
|
415 |
+
# NOTE: v0 version
|
416 |
+
return np.linspace(0, duration-1, num_frames, dtype=int)
|
417 |
+
elif mode == 'fps':
|
418 |
+
assert vid_fps is not None, "FPS must be provided for FPS sampling."
|
419 |
+
fps = fps if fps is not None else NUM_FRAMES_PER_SECOND
|
420 |
+
segment_len = min(vid_fps // fps, duration)
|
421 |
+
return np.arange(segment_len // 2, duration, segment_len, dtype=int)
|
422 |
+
else:
|
423 |
+
raise ValueError(f'Unsupported frame sampling mode: {mode}')
|
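# Illustrative usage (hedged sketch): uniform sampling of 8 frames from a 100-frame
# clip picks evenly spaced indices via np.linspace, e.g.
#   >>> frame_sample(100, mode='uniform', num_frames=8)
#   array([ 0, 14, 28, 42, 56, 70, 84, 99])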
424 |
+
|
425 |
+
|
426 |
+
def load_video_from_ids(video_path, s=None, e=None, fps=None, max_frames=None, temporal_factor=1, frame_ids=None):
|
427 |
+
if s is not None and e is not None:
|
428 |
+
s = s if s >= 0. else 0.
|
429 |
+
e = e if e >= 0. else 0.
|
430 |
+
if s > e:
|
431 |
+
s, e = e, s
|
432 |
+
elif s == e:
|
433 |
+
e = s + 1
|
434 |
+
|
435 |
+
# 1. Loading Video
|
436 |
+
if os.path.isdir(video_path):
|
437 |
+
frame_files = sorted(os.listdir(video_path))
|
438 |
+
|
439 |
+
vid_fps = 3
|
440 |
+
num_frames_of_video = len(frame_files)
|
441 |
+
elif video_path.endswith('.gif'):
|
442 |
+
gif_reader = imageio.get_reader(video_path)
|
443 |
+
|
444 |
+
vid_fps = 25
|
445 |
+
num_frames_of_video = len(gif_reader)
|
446 |
+
else:
|
447 |
+
vreader = VideoReader(video_path, ctx=cpu(0), num_threads=2)
|
448 |
+
# vreader = VideoReader(video_path, ctx=cpu(0), num_threads=1)
|
449 |
+
|
450 |
+
vid_fps = vreader.get_avg_fps()
|
451 |
+
num_frames_of_video = len(vreader)
|
452 |
+
|
453 |
+
# 2. Determine frame range & Calculate frame indices
|
454 |
+
f_start = 0 if s is None else max(int(s * vid_fps) - 1, 0)
|
455 |
+
f_end = num_frames_of_video - 1 if e is None else min(int(e * vid_fps) - 1, num_frames_of_video - 1)
|
456 |
+
frame_indices = list(range(f_start, f_end + 1))
|
457 |
+
|
458 |
+
duration = len(frame_indices)
|
459 |
+
# 3. Sampling frame indices
|
460 |
+
max_frames = max_frames if max_frames is not None else MAX_FRAMES
|
461 |
+
if fps is not None and duration / vid_fps < max_frames:
|
462 |
+
try:
|
463 |
+
sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='fps', vid_fps=vid_fps, fps=fps)]
|
464 |
+
except Exception:
|
465 |
+
print('FPS-based frame sampling failed; falling back to uniform sampling.')
|
466 |
+
sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='uniform', num_frames=max_frames)]
|
467 |
+
|
468 |
+
else:
|
469 |
+
sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='uniform', num_frames=max_frames)]
|
470 |
+
|
471 |
+
# 4. Acquire frame data
|
472 |
+
if os.path.isdir(video_path):
|
473 |
+
frames = [cv2.cvtColor(cv2.imread(os.path.join(video_path, frame_files[frame_idx])), cv2.COLOR_BGR2RGB) for frame_idx in sampled_frame_indices]
|
474 |
+
elif video_path.endswith('.gif'):
|
475 |
+
frames = [cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB) for idx, frame in enumerate(gif_reader) if idx in sampled_frame_indices]
|
476 |
+
else:
|
477 |
+
frames = vreader.get_batch(sampled_frame_indices).asnumpy()
|
478 |
+
|
479 |
+
# frames = frames.transpose(0, 3, 1, 2)
|
480 |
+
timestamps = [x / vid_fps for x in sampled_frame_indices]
|
481 |
+
|
482 |
+
if temporal_factor > 1:
|
483 |
+
pad_length = temporal_factor - len(frames) % temporal_factor
|
484 |
+
frames = np.concatenate([frames, frames[-1:].repeat(pad_length, axis=0)])
|
485 |
+
[timestamps.append(timestamps[-1] + 1 / fps) for _ in range(pad_length)]
|
486 |
+
|
487 |
+
# NOTE: pad the video with black frames
|
488 |
+
# while num_frames is not None and len(video_data) < num_frames:
|
489 |
+
# video_data.append(Image.fromarray(np.zeros((*video_data[-1].size, 3), dtype=np.uint8)))
|
490 |
+
|
491 |
+
additional_frames = []
|
492 |
+
if frame_ids is not None:
|
493 |
+
if os.path.isdir(video_path):
|
494 |
+
additional_frames = [cv2.cvtColor(cv2.imread(os.path.join(video_path, frame_files[frame_idx])), cv2.COLOR_BGR2RGB) for frame_idx in frame_ids]
|
495 |
+
elif video_path.endswith('.gif'):
|
496 |
+
additional_frames = [cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB) for idx, frame in enumerate(gif_reader) if idx in frame_ids]
|
497 |
+
else:
|
498 |
+
additional_frames = vreader.get_batch(frame_ids).asnumpy()
|
499 |
+
|
500 |
+
return frames, timestamps, additional_frames
|
501 |
+
|
502 |
+
|
503 |
+
def load_video(
|
504 |
+
video_path: str,
|
505 |
+
start_time: Optional[float] = None,
|
506 |
+
end_time: Optional[float] = None,
|
507 |
+
fps: Optional[float] = None,
|
508 |
+
max_frames: Optional[float] = None,
|
509 |
+
size: Optional[int] = None,
|
510 |
+
size_divisible: int = 1,
|
511 |
+
precise_time: bool = False,
|
512 |
+
verbose: bool = False,
|
513 |
+
temporal_factor: int = 1,
|
514 |
+
frame_ids = None
|
515 |
+
):
|
516 |
+
"""
|
517 |
+
Load and process a video file and return the frames and the timestamps of each frame.
|
518 |
+
|
519 |
+
Args:
|
520 |
+
video_path (str): Path to the video file.
|
521 |
+
start_time (float, optional): Start time in seconds. Defaults to None.
|
522 |
+
end_time (float, optional): End time in seconds. Defaults to None.
|
523 |
+
fps (float, optional): Frames per second. Defaults to None.
|
524 |
+
max_frames (int, optional): Maximum number of frames to sample. Defaults to None.
|
525 |
+
size (int, optional): Size of the shortest side. Defaults to None.
|
526 |
+
size_divisible (int, optional): Size divisible by this number. Defaults to 1.
|
527 |
+
precise_time (bool, optional): Whether to use precise time. Defaults to False.
|
528 |
+
verbose (bool, optional): Print ffmpeg output. Defaults to False.
|
529 |
+
|
530 |
+
Returns:
|
531 |
+
frames (List[PIL.Image]): List of frames.
|
532 |
+
timestamps (List[float]): List of timestamps.
|
533 |
+
"""
|
534 |
+
if start_time is not None and end_time is not None and end_time - start_time < 1:
|
535 |
+
return load_video_from_ids(video_path, start_time, end_time, fps=fps, max_frames=max_frames, frame_ids=frame_ids)
|
536 |
+
if os.path.isdir(video_path):
|
537 |
+
return load_video_from_ids(video_path, start_time, end_time, fps=fps, max_frames=max_frames, frame_ids=frame_ids)
|
538 |
+
if video_path.endswith('.gif'):
|
539 |
+
return load_video_from_ids(video_path, start_time, end_time, fps=fps, max_frames=max_frames, frame_ids=frame_ids)
|
540 |
+
probe = ffmpeg.probe(video_path)
|
541 |
+
duration = float(probe['format']['duration'])
|
542 |
+
video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
|
543 |
+
w, h = int(video_stream['width']), int(video_stream['height'])
|
544 |
+
|
545 |
+
kwargs, input_kwargs, output_kwargs = {}, {}, {}
|
546 |
+
do_trim = start_time is not None or end_time is not None
|
547 |
+
if start_time is not None:
|
548 |
+
new_start_time = max(float(video_stream['start_time']), start_time)
|
549 |
+
duration -= new_start_time - start_time
|
550 |
+
start_time = new_start_time
|
551 |
+
else:
|
552 |
+
start_time = float(video_stream['start_time'])
|
553 |
+
if end_time is not None:
|
554 |
+
duration = min(duration, end_time - start_time)
|
555 |
+
else:
|
556 |
+
duration = duration
|
557 |
+
if do_trim:
|
558 |
+
kwargs = {'ss': start_time, 't': duration}
|
559 |
+
if precise_time:
|
560 |
+
output_kwargs.update(kwargs)
|
561 |
+
else:
|
562 |
+
input_kwargs.update(kwargs)
|
563 |
+
|
564 |
+
if size is not None:
|
565 |
+
scale_factor = size / min(w, h)
|
566 |
+
new_w, new_h = round(w * scale_factor), round(h * scale_factor)
|
567 |
+
else:
|
568 |
+
new_w, new_h = w, h
|
569 |
+
new_w = new_w // size_divisible * size_divisible
|
570 |
+
new_h = new_h // size_divisible * size_divisible
|
571 |
+
|
572 |
+
# NOTE: It may result in unexpected number of frames in ffmpeg
|
573 |
+
# if calculate the fps directly according to max_frames
|
574 |
+
# NOTE: the below lines may hurt the performance
|
575 |
+
# if max_frames is not None and (fps is None or duration * fps > 2 * max_frames):
|
576 |
+
# fps = max_frames / duration * 2
|
577 |
+
|
578 |
+
stream = ffmpeg.input(video_path, **input_kwargs)
|
579 |
+
if fps is not None:
|
580 |
+
stream = ffmpeg.filter(stream, "fps", fps=fps, round="down")
|
581 |
+
if new_w != w or new_h != h:
|
582 |
+
stream = ffmpeg.filter(stream, 'scale', new_w, new_h)
|
583 |
+
stream = ffmpeg.output(stream, "pipe:", format="rawvideo", pix_fmt="rgb24", **output_kwargs)
|
584 |
+
out, _ = ffmpeg.run(stream, capture_stdout=True, quiet=not verbose)
|
585 |
+
|
586 |
+
frames = np.frombuffer(out, np.uint8).reshape([-1, new_h, new_w, 3]).transpose([0, 3, 1, 2])
|
587 |
+
|
588 |
+
if fps is not None:
|
589 |
+
timestamps = np.arange(start_time, start_time + duration + 1 / fps, 1 / fps)[:len(frames)]
|
590 |
+
else:
|
591 |
+
timestamps = np.linspace(start_time, start_time + duration, len(frames))
|
592 |
+
|
593 |
+
max_frames = max_frames if max_frames is not None else MAX_FRAMES
|
594 |
+
if max_frames is not None and len(frames) > max_frames:
|
595 |
+
indices = np.linspace(0, len(frames) - 1, max_frames, dtype=int)
|
596 |
+
frames = frames[indices]
|
597 |
+
timestamps = [timestamps[i] for i in indices]
|
598 |
+
|
599 |
+
if temporal_factor > 1:
|
600 |
+
pad_length = temporal_factor - len(frames) % temporal_factor
|
601 |
+
frames = np.concatenate([frames, frames[-1:].repeat(pad_length, axis=0)])
|
602 |
+
[timestamps.append(timestamps[-1] + 1 / fps) for _ in range(pad_length)]
|
603 |
+
|
604 |
+
frames = [frame for frame in frames]
|
605 |
+
additional_frames = []
|
606 |
+
# print('frame_ids', frame_ids)
|
607 |
+
if frame_ids is not None:
|
608 |
+
vr = VideoReader(video_path, ctx=cpu(0))
|
609 |
+
additional_frames = vr.get_batch(frame_ids).asnumpy()
|
610 |
+
|
611 |
+
return frames, timestamps, additional_frames
|
612 |
+
|
613 |
+
|
614 |
+
def process_video(video_path, processor, s=None, e=None, aspect_ratio='pad', num_frames=None):
|
615 |
+
fps = 1 if num_frames is None else None
|
616 |
+
# FFmpeg
|
617 |
+
frames, timestamps, _ = load_video(video_path, s, e, fps=fps, max_frames=num_frames)
|
618 |
+
# Decord
|
619 |
+
# frames, timestamps = load_video_from_ids(video_path, s, e, fps=fps, max_frames=num_frames)
|
620 |
+
|
621 |
+
assert len(frames) == len(timestamps), "Number of frames and timestamps must match."
|
622 |
+
|
623 |
+
if aspect_ratio == 'pad':
|
624 |
+
frames = [expand2square(f, tuple(int(x*255) for x in processor.image_mean)) for f in frames]
|
625 |
+
|
626 |
+
if aspect_ratio == 'qwen2vl':
|
627 |
+
frames = [processor.preprocess(frame, return_tensors='pt', image_num=len(frames)) for frame in frames]
|
628 |
+
grid_frames = [frames]
|
629 |
+
else:
|
630 |
+
frames = processor.preprocess(frames, return_tensors='pt', image_num=len(frames))
|
631 |
+
grid_frames = [[frames]]
|
632 |
+
|
633 |
+
return grid_frames, timestamps
|
634 |
+
|
635 |
+
|
636 |
+
def tokenizer_multimodal_token(prompt, tokenizer, multimodal_token=DEFAULT_IMAGE_TOKEN, return_tensors=None):
|
637 |
+
"""Tokenize text and multimodal tag to input_ids.
|
638 |
+
|
639 |
+
Args:
|
640 |
+
prompt (str): Text prompt (w/ multimodal tag), e.g., '<video>\nDescribe the video.'
|
641 |
+
tokenizer (transformers.PreTrainedTokenizer): Tokenizer object.
|
642 |
+
multimodal_token (str): The multimodal tag string, e.g., '<image>' or '<video>'.
|
643 |
+
"""
|
644 |
+
multimodal_token_index = MODAL_INDEX_MAP.get(multimodal_token, None)
|
645 |
+
if multimodal_token_index is None:
|
646 |
+
input_ids = tokenizer(prompt, add_special_tokens=False).input_ids
|
647 |
+
else:
|
648 |
+
prompt_chunks = [tokenizer(chunk, add_special_tokens=False).input_ids for idx, chunk in enumerate(prompt.split(multimodal_token))]
|
649 |
+
|
650 |
+
input_ids = []
|
651 |
+
for i in range(1, 2 * len(prompt_chunks)):
|
652 |
+
if i % 2 == 1:
|
653 |
+
input_ids.extend(prompt_chunks[i // 2])
|
654 |
+
else:
|
655 |
+
input_ids.append(multimodal_token_index)
|
656 |
+
|
657 |
+
if return_tensors is not None:
|
658 |
+
if return_tensors == 'pt':
|
659 |
+
return torch.tensor(input_ids, dtype=torch.long)
|
660 |
+
raise ValueError(f'Unsupported tensor type: {return_tensors}')
|
661 |
+
return input_ids
|
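# Illustrative behaviour (hedged sketch): the prompt is split on the multimodal tag and
# the corresponding modal index is spliced between the text chunks, e.g. for
# '<video>\nDescribe the video.' the result is
#   [*ids("")] + [MODAL_INDEX_MAP['<video>']] + [*ids("\nDescribe the video.")]
# where ids() denotes tokenizer(..., add_special_tokens=False).input_ids.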
662 |
+
|
663 |
+
|
664 |
+
def get_model_name_from_path(model_path):
|
665 |
+
model_path = model_path.strip("/")
|
666 |
+
model_paths = model_path.split("/")
|
667 |
+
if model_paths[-1].startswith('checkpoint-'):
|
668 |
+
return model_paths[-2] + "_" + model_paths[-1]
|
669 |
+
else:
|
670 |
+
return model_paths[-1]
|
671 |
+
|
672 |
+
|
673 |
+
class KeywordsStoppingCriteria(StoppingCriteria):
|
674 |
+
def __init__(self, keywords, tokenizer, input_ids):
|
675 |
+
self.keywords = keywords
|
676 |
+
self.keyword_ids = []
|
677 |
+
self.max_keyword_len = 0
|
678 |
+
for keyword in keywords:
|
679 |
+
cur_keyword_ids = tokenizer(keyword).input_ids
|
680 |
+
if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
|
681 |
+
cur_keyword_ids = cur_keyword_ids[1:]
|
682 |
+
if len(cur_keyword_ids) > self.max_keyword_len:
|
683 |
+
self.max_keyword_len = len(cur_keyword_ids)
|
684 |
+
self.keyword_ids.append(torch.tensor(cur_keyword_ids))
|
685 |
+
self.tokenizer = tokenizer
|
686 |
+
self.start_len = input_ids.shape[1]
|
687 |
+
|
688 |
+
def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
|
689 |
+
offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
|
690 |
+
self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
|
691 |
+
for keyword_id in self.keyword_ids:
|
692 |
+
if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all():
|
693 |
+
return True
|
694 |
+
outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
|
695 |
+
for keyword in self.keywords:
|
696 |
+
if keyword in outputs:
|
697 |
+
return True
|
698 |
+
return False
|
699 |
+
|
700 |
+
def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
|
701 |
+
outputs = []
|
702 |
+
for i in range(output_ids.shape[0]):
|
703 |
+
outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
|
704 |
+
return all(outputs)
|
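A minimal usage sketch for the helpers in this file (assumptions: the module is importable as `videollama3.mm_utils`, and the video-decoding dependencies it relies on, ffmpeg-python, decord, imageio and OpenCV, are installed):

    from videollama3.mm_utils import load_video, chunk_list

    # Decode a demo clip at ~1 fps, capped at 64 frames; timestamps are in seconds.
    frames, timestamps, _ = load_video("demo/videos/1.mp4", fps=1, max_frames=64)

    # Group the sampled frames into encoder-sized batches of 8 frames each.
    batches = chunk_list(frames, chunk_size=8)

`process_images` and `process_video` additionally expect an image processor that exposes `image_mean` and a `preprocess(..., return_tensors='pt')` method, so they are easiest to drive through the model loading path in `videollama3/model/__init__.py`.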
videollama3/model/__init__.py
ADDED
@@ -0,0 +1,166 @@
1 |
+
# Adapted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
|
2 |
+
# Copyright 2023 Haotian Liu
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
|
16 |
+
|
17 |
+
import os
|
18 |
+
import warnings
|
19 |
+
import shutil
|
20 |
+
|
21 |
+
import torch
|
22 |
+
from transformers import PretrainedConfig, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
|
23 |
+
|
24 |
+
from .projector import load_mm_projector
|
25 |
+
from .videollama3_qwen2 import Videollama3Qwen2ForCausalLM, Videollama3Qwen2Config
|
26 |
+
|
27 |
+
|
28 |
+
VLLMs = {
|
29 |
+
"videollama3_qwen2": Videollama3Qwen2ForCausalLM,
|
30 |
+
}
|
31 |
+
|
32 |
+
VLLMConfigs = {
|
33 |
+
"videollama3_qwen2": Videollama3Qwen2Config,
|
34 |
+
}
|
35 |
+
|
36 |
+
|
37 |
+
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", **kwargs):
|
38 |
+
if 'token' in kwargs:
|
39 |
+
token = kwargs['token']
|
40 |
+
else:
|
41 |
+
token = None
|
42 |
+
|
43 |
+
# NOTE: auto device_map by default
|
44 |
+
# if want to put model into a single device, you can set device_map={"": "cuda:0"}
|
45 |
+
kwargs = {"device_map": device_map, **kwargs}
|
46 |
+
|
47 |
+
config = AutoConfig.from_pretrained(model_path)
|
48 |
+
config._attn_implementation = kwargs.pop('attn_implementation', "flash_attention_2") # default to flash_attention_2
|
49 |
+
|
50 |
+
torch_dtype = config.torch_dtype if hasattr(config, "torch_dtype") else kwargs.pop('torch_dtype', torch.float16)
|
51 |
+
|
52 |
+
if load_8bit:
|
53 |
+
kwargs['load_in_8bit'] = True
|
54 |
+
elif load_4bit:
|
55 |
+
# NOTE: High-version Transformers will report: """ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time."""
|
56 |
+
# kwargs['load_in_4bit'] = True
|
57 |
+
kwargs['quantization_config'] = BitsAndBytesConfig(
|
58 |
+
load_in_4bit=True,
|
59 |
+
bnb_4bit_compute_dtype=torch_dtype,
|
60 |
+
bnb_4bit_use_double_quant=True,
|
61 |
+
bnb_4bit_quant_type='nf4'
|
62 |
+
)
|
63 |
+
else:
|
64 |
+
kwargs['torch_dtype'] = torch_dtype
|
65 |
+
|
66 |
+
# determine the model type
|
67 |
+
model_type = config.model_type if hasattr(config, "model_type") else kwargs.pop('model_type', "videollama3_qwen2")
|
68 |
+
|
69 |
+
# determine whether this is an alignment (pretrain) or finetuned checkpoint
|
70 |
+
is_alignment = getattr(config, "tune_mm_mlp_adapter", False) or getattr(config, "is_alignment", False)
|
71 |
+
|
72 |
+
# NOTE: lora/qlora model loading
|
73 |
+
if 'lora' in model_name.lower() or 'qlora' in model_name.lower():
|
74 |
+
cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
|
75 |
+
# NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
|
76 |
+
# cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
|
77 |
+
model_base = model_base if model_base is not None else cfg_pretrained._name_or_path
|
78 |
+
|
79 |
+
# NOTE: remove qlora training quantization config
|
80 |
+
if hasattr(cfg_pretrained, 'quantization_config'):
|
81 |
+
del cfg_pretrained.quantization_config
|
82 |
+
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
|
83 |
+
print('Loading VideoLLaMA from base model...')
|
84 |
+
|
85 |
+
if 'qwen2' in model_base.lower():
|
86 |
+
model = Videollama3Qwen2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
|
87 |
+
else:
|
88 |
+
model = Videollama3Qwen2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
|
89 |
+
|
90 |
+
token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
|
91 |
+
if model.lm_head.weight.shape[0] != token_num:
|
92 |
+
model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
|
93 |
+
model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
|
94 |
+
|
95 |
+
print('Loading additional VideoLLaMA weights...')
|
96 |
+
if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
|
97 |
+
non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
|
98 |
+
else:
|
99 |
+
# this is probably from HF Hub
|
100 |
+
from huggingface_hub import hf_hub_download
|
101 |
+
def load_from_hf(repo_id, filename, subfolder=None):
|
102 |
+
cache_file = hf_hub_download(
|
103 |
+
repo_id=repo_id,
|
104 |
+
filename=filename,
|
105 |
+
subfolder=subfolder)
|
106 |
+
return torch.load(cache_file, map_location='cpu')
|
107 |
+
non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
|
108 |
+
non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
|
109 |
+
if any(k.startswith('model.model.') for k in non_lora_trainables):
|
110 |
+
non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
|
111 |
+
model.load_state_dict(non_lora_trainables, strict=False)
|
112 |
+
|
113 |
+
from peft import PeftModel
|
114 |
+
print('Loading LoRA weights...')
|
115 |
+
model = PeftModel.from_pretrained(model, model_path)
|
116 |
+
print('Merging LoRA weights...')
|
117 |
+
model = model.merge_and_unload()
|
118 |
+
print('Model is loaded...')
|
119 |
+
elif model_base is not None or '-base' in model_name.lower() or is_alignment:
|
120 |
+
# NOTE: Base/Pretrain model loading
|
121 |
+
print('Loading VideoLLaMA 2 from base model...')
|
122 |
+
cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
|
123 |
+
# NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
|
124 |
+
# cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
|
125 |
+
model_base = model_base if model_base is not None else cfg_pretrained._name_or_path
|
126 |
+
|
127 |
+
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
|
128 |
+
|
129 |
+
if model_type in ['videollama3', 'videollama3_qwen2']:
|
130 |
+
model = Videollama3Qwen2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
|
131 |
+
else:
|
132 |
+
model = Videollama3Qwen2ForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
|
133 |
+
|
134 |
+
# NOTE: loading vision-language projector
|
135 |
+
# * old codes for loading local mm_projector.bin
|
136 |
+
# mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
|
137 |
+
# mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
|
138 |
+
# model.load_state_dict(mm_projector_weights, strict=False)
|
139 |
+
# * new codes which supports loading mm_projector.bin both offline and online
|
140 |
+
mm_projector_weights = load_mm_projector(model_path, token=token)
|
141 |
+
model.load_state_dict(mm_projector_weights, strict=False)
|
142 |
+
elif 'videollama' in model_type:
|
143 |
+
# NOTE: SFT model loading
|
144 |
+
print(model_path)
|
145 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)
|
146 |
+
|
147 |
+
if model_type in ['videollama3_qwen2']:
|
148 |
+
model = Videollama3Qwen2ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, config=config, **kwargs)
|
149 |
+
else:
|
150 |
+
model = Videollama3Qwen2ForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, config=config, **kwargs)
|
151 |
+
else:
|
152 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, token=token)
|
153 |
+
model = AutoModelForCausalLM.from_pretrained(model_path, config=config, **kwargs)
|
154 |
+
|
155 |
+
processor = None
|
156 |
+
|
157 |
+
if "videollama" in model_type:
|
158 |
+
vision_encoder = model.get_vision_encoder()
|
159 |
+
processor = vision_encoder.image_processor
|
160 |
+
|
161 |
+
if hasattr(model.config, "max_sequence_length"):
|
162 |
+
context_len = model.config.max_sequence_length
|
163 |
+
else:
|
164 |
+
context_len = 2048
|
165 |
+
|
166 |
+
return tokenizer, model, processor, context_len
|
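A hedged sketch of the intended entry point (the checkpoint id below is a placeholder; substitute whichever VideoLLaMA3 weights you actually use):

    from videollama3.model import load_pretrained_model
    from videollama3.mm_utils import get_model_name_from_path

    model_path = "DAMO-NLP-SG/VideoLLaMA3-7B"  # placeholder checkpoint id
    tokenizer, model, processor, context_len = load_pretrained_model(
        model_path,
        model_base=None,
        model_name=get_model_name_from_path(model_path),
    )

With a full SFT checkpoint this takes the `'videollama' in model_type` branch, so the tokenizer, the loaded model, and the vision encoder's image processor are all returned together.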
videollama3/model/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (4.04 kB). View file
|
|
videollama3/model/__pycache__/encoder.cpython-310.pyc
ADDED
Binary file (10.5 kB). View file
|
|
videollama3/model/__pycache__/processor.cpython-310.pyc
ADDED
Binary file (12.2 kB). View file
|
|
videollama3/model/__pycache__/projector.cpython-310.pyc
ADDED
Binary file (5.11 kB). View file
|
|
videollama3/model/__pycache__/region_encoder.cpython-310.pyc
ADDED
Binary file (3.43 kB). View file
|
|
videollama3/model/__pycache__/videollama3_arch.cpython-310.pyc
ADDED
Binary file (9.74 kB). View file
|
|
videollama3/model/__pycache__/videollama3_qwen2.cpython-310.pyc
ADDED
Binary file (4.2 kB). View file
|
|
videollama3/model/damovl_encoder/__init__.py
ADDED
@@ -0,0 +1,3 @@
1 |
+
from .configuration_damovl_encoder import DAMOVLVisionConfig
|
2 |
+
from .image_processing import DAMOVLImageProcessor
|
3 |
+
from .modeling_damovl_encoder import DAMOVLVisionModel
|
videollama3/model/damovl_encoder/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (407 Bytes). View file
|
|
videollama3/model/damovl_encoder/__pycache__/configuration_damovl_encoder.cpython-310.pyc
ADDED
Binary file (1.96 kB). View file
|
|
videollama3/model/damovl_encoder/__pycache__/image_processing.cpython-310.pyc
ADDED
Binary file (16.7 kB). View file
|
|
videollama3/model/damovl_encoder/__pycache__/modeling_damovl_encoder.cpython-310.pyc
ADDED
Binary file (16.9 kB). View file
|
|
videollama3/model/damovl_encoder/configuration_damovl_encoder.py
ADDED
@@ -0,0 +1,71 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
"""Qwen2VL model configuration"""
|
16 |
+
|
17 |
+
import os
|
18 |
+
from typing import Union
|
19 |
+
|
20 |
+
from transformers.configuration_utils import PretrainedConfig
|
21 |
+
from transformers.utils import logging
|
22 |
+
|
23 |
+
|
24 |
+
logger = logging.get_logger(__name__)
|
25 |
+
|
26 |
+
|
27 |
+
class DAMOVLVisionConfig(PretrainedConfig):
|
28 |
+
model_type = "damovl"
|
29 |
+
|
30 |
+
def __init__(
|
31 |
+
self,
|
32 |
+
hidden_size=768,
|
33 |
+
intermediate_size=3072,
|
34 |
+
num_hidden_layers=12,
|
35 |
+
num_attention_heads=12,
|
36 |
+
num_channels=3,
|
37 |
+
patch_size=16,
|
38 |
+
hidden_act="gelu_pytorch_tanh",
|
39 |
+
layer_norm_eps=1e-6,
|
40 |
+
attention_dropout=0.0,
|
41 |
+
spatial_merge_size=1,
|
42 |
+
**kwargs,
|
43 |
+
):
|
44 |
+
super().__init__(**kwargs)
|
45 |
+
|
46 |
+
self.hidden_size = hidden_size
|
47 |
+
self.intermediate_size = intermediate_size
|
48 |
+
self.num_hidden_layers = num_hidden_layers
|
49 |
+
self.num_attention_heads = num_attention_heads
|
50 |
+
self.num_channels = num_channels
|
51 |
+
self.patch_size = patch_size
|
52 |
+
self.attention_dropout = attention_dropout
|
53 |
+
self.layer_norm_eps = layer_norm_eps
|
54 |
+
self.hidden_act = hidden_act
|
55 |
+
self.spatial_merge_size = spatial_merge_size
|
56 |
+
|
57 |
+
@classmethod
|
58 |
+
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
59 |
+
cls._set_token_in_kwargs(kwargs)
|
60 |
+
|
61 |
+
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
62 |
+
|
63 |
+
# config_dict = config_dict["vision_config"]
|
64 |
+
|
65 |
+
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
66 |
+
logger.warning(
|
67 |
+
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
68 |
+
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
69 |
+
)
|
70 |
+
|
71 |
+
return cls.from_dict(config_dict, **kwargs)
|
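A small, hedged sketch of how this config can be built directly (the override values are illustrative only):

    from videollama3.model.damovl_encoder import DAMOVLVisionConfig

    # Defaults mirror the constructor above; patch_size is overridden for illustration.
    vision_config = DAMOVLVisionConfig(patch_size=14, spatial_merge_size=1)
    print(vision_config.hidden_size, vision_config.num_hidden_layers)  # 768 12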
videollama3/model/damovl_encoder/image_processing.py
ADDED
@@ -0,0 +1,472 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
+
# and OPT implementations in this library. It has been modified from its
|
6 |
+
# original forms to accommodate minor architectural differences compared
|
7 |
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
+
#
|
9 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
+
# you may not use this file except in compliance with the License.
|
11 |
+
# You may obtain a copy of the License at
|
12 |
+
#
|
13 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14 |
+
#
|
15 |
+
# Unless required by applicable law or agreed to in writing, software
|
16 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
18 |
+
# See the License for the specific language governing permissions and
|
19 |
+
# limitations under the License.
|
20 |
+
"""Image processor class for Qwen2-VL."""
|
21 |
+
|
22 |
+
import math
|
23 |
+
from typing import Dict, List, Optional, Union
|
24 |
+
|
25 |
+
import numpy as np
|
26 |
+
|
27 |
+
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
|
28 |
+
from transformers.image_transforms import (
|
29 |
+
convert_to_rgb,
|
30 |
+
resize,
|
31 |
+
to_channel_dimension_format,
|
32 |
+
)
|
33 |
+
from transformers.image_utils import (
|
34 |
+
OPENAI_CLIP_MEAN,
|
35 |
+
OPENAI_CLIP_STD,
|
36 |
+
ChannelDimension,
|
37 |
+
ImageInput,
|
38 |
+
PILImageResampling,
|
39 |
+
VideoInput,
|
40 |
+
get_image_size,
|
41 |
+
infer_channel_dimension_format,
|
42 |
+
is_scaled_image,
|
43 |
+
is_valid_image,
|
44 |
+
make_list_of_images,
|
45 |
+
to_numpy_array,
|
46 |
+
valid_images,
|
47 |
+
validate_preprocess_arguments,
|
48 |
+
)
|
49 |
+
from transformers.utils import TensorType, is_vision_available, logging
|
50 |
+
|
51 |
+
|
52 |
+
logger = logging.get_logger(__name__)
|
53 |
+
|
54 |
+
|
55 |
+
if is_vision_available():
|
56 |
+
from PIL import Image
|
57 |
+
|
58 |
+
|
59 |
+
def make_batched_images(images) -> List[List[ImageInput]]:
|
60 |
+
"""
|
61 |
+
Accepts images in list or nested list format, and makes a list of images for preprocessing.
|
62 |
+
|
63 |
+
Args:
|
64 |
+
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
|
65 |
+
The input image.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
list: A list of images.
|
69 |
+
"""
|
70 |
+
if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
|
71 |
+
return [img for img_list in images for img in img_list]
|
72 |
+
|
73 |
+
elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
|
74 |
+
return images
|
75 |
+
|
76 |
+
elif is_valid_image(images):
|
77 |
+
return [images]
|
78 |
+
|
79 |
+
raise ValueError(f"Could not make batched images from {images}")
|
80 |
+
|
81 |
+
|
82 |
+
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
|
83 |
+
def make_batched_videos(videos) -> List[VideoInput]:
|
84 |
+
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
|
85 |
+
return videos
|
86 |
+
|
87 |
+
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
|
88 |
+
if isinstance(videos[0], Image.Image):
|
89 |
+
return [videos]
|
90 |
+
elif len(videos[0].shape) == 4:
|
91 |
+
return [list(video) for video in videos]
|
92 |
+
|
93 |
+
elif is_valid_image(videos) and len(videos.shape) == 4:
|
94 |
+
return [list(videos)]
|
95 |
+
|
96 |
+
raise ValueError(f"Could not make batched video from {videos}")
|
97 |
+
|
98 |
+
|
99 |
+
def smart_resize(
|
100 |
+
height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
|
101 |
+
):
|
102 |
+
"""Rescales the image so that the following conditions are met:
|
103 |
+
|
104 |
+
1. Both dimensions (height and width) are divisible by 'factor'.
|
105 |
+
|
106 |
+
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
|
107 |
+
|
108 |
+
3. The aspect ratio of the image is maintained as closely as possible.
|
109 |
+
|
110 |
+
"""
|
111 |
+
if height < factor or width < factor:
|
112 |
+
scale = factor / min(height, width)
|
113 |
+
width = round(scale * width)
|
114 |
+
height = round(scale * height)
|
115 |
+
elif max(height, width) / min(height, width) > 200:
|
116 |
+
raise ValueError(
|
117 |
+
f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
|
118 |
+
)
|
119 |
+
h_bar = round(height / factor) * factor
|
120 |
+
w_bar = round(width / factor) * factor
|
121 |
+
if h_bar * w_bar > max_pixels:
|
122 |
+
beta = math.sqrt((height * width) / max_pixels)
|
123 |
+
h_bar = math.floor(height / beta / factor) * factor
|
124 |
+
w_bar = math.floor(width / beta / factor) * factor
|
125 |
+
elif h_bar * w_bar < min_pixels:
|
126 |
+
beta = math.sqrt(min_pixels / (height * width))
|
127 |
+
h_bar = math.ceil(height * beta / factor) * factor
|
128 |
+
w_bar = math.ceil(width * beta / factor) * factor
|
129 |
+
return h_bar, w_bar
|
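# Illustrative check (hedged, hand-computed): with the defaults above, a 1080x1920
# frame first rounds to 1092x1932, exceeds max_pixels (1,003,520), and is then scaled
# down by beta ~= 1.437, giving a factor-28-aligned output, e.g.
#   >>> smart_resize(1080, 1920)
#   (728, 1316)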
130 |
+
|
131 |
+
|
132 |
+
class DAMOVLImageProcessor(BaseImageProcessor):
|
133 |
+
r"""
|
134 |
+
Constructs a DAMOVL image processor that dynamically resizes images based on the original image resolution.
|
135 |
+
|
136 |
+
Args:
|
137 |
+
do_resize (`bool`, *optional*, defaults to `True`):
|
138 |
+
Whether to resize the image's (height, width) dimensions.
|
139 |
+
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
140 |
+
Resampling filter to use when resizing the image.
|
141 |
+
do_rescale (`bool`, *optional*, defaults to `True`):
|
142 |
+
Whether to rescale the image by the specified scale `rescale_factor`.
|
143 |
+
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
144 |
+
Scale factor to use if rescaling the image.
|
145 |
+
do_normalize (`bool`, *optional*, defaults to `True`):
|
146 |
+
Whether to normalize the image.
|
147 |
+
image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
|
148 |
+
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
|
149 |
+
image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
|
150 |
+
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
|
151 |
+
do_convert_rgb (`bool`, *optional*, defaults to `True`):
|
152 |
+
Whether to convert the image to RGB.
|
153 |
+
min_pixels (`int`, *optional*, defaults to `56 * 56`):
|
154 |
+
The min pixels of the image to resize the image.
|
155 |
+
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
|
156 |
+
The max pixels of the image to resize the image.
|
157 |
+
patch_size (`int`, *optional*, defaults to 14):
|
158 |
+
The spatial patch size of the vision encoder.
|
159 |
+
temporal_patch_size (`int`, *optional*, defaults to 2):
|
160 |
+
The temporal patch size of the vision encoder.
|
161 |
+
merge_size (`int`, *optional*, defaults to 2):
|
162 |
+
The merge size of the vision encoder to llm encoder.
|
163 |
+
"""
|
164 |
+
|
165 |
+
model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
|
166 |
+
|
167 |
+
def __init__(
|
168 |
+
self,
|
169 |
+
do_resize: bool = True,
|
170 |
+
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
171 |
+
do_rescale: bool = True,
|
172 |
+
rescale_factor: Union[int, float] = 1 / 255,
|
173 |
+
do_normalize: bool = True,
|
174 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
175 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
176 |
+
do_convert_rgb: bool = True,
|
177 |
+
min_pixels: int = 56 * 56,
|
178 |
+
max_pixels: int = 14 * 14 * 9477,
|
179 |
+
patch_size: int = 14,
|
180 |
+
merge_size: int = 1,
|
181 |
+
**kwargs,
|
182 |
+
) -> None:
|
183 |
+
super().__init__(**kwargs)
|
184 |
+
self.do_resize = do_resize
|
185 |
+
self.resample = resample
|
186 |
+
self.do_rescale = do_rescale
|
187 |
+
self.rescale_factor = rescale_factor
|
188 |
+
self.do_normalize = do_normalize
|
189 |
+
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
|
190 |
+
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
|
191 |
+
self.min_pixels = min_pixels
|
192 |
+
self.max_pixels = max_pixels
|
193 |
+
self.patch_size = patch_size
|
194 |
+
self.merge_size = merge_size
|
195 |
+
self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
|
196 |
+
self.do_convert_rgb = do_convert_rgb
|
197 |
+
|
198 |
+
self.temporal_patch_size = 1
|
199 |
+
|
200 |
+
def _preprocess(
|
201 |
+
self,
|
202 |
+
images: Union[ImageInput, VideoInput],
|
203 |
+
do_resize: bool = None,
|
204 |
+
resample: PILImageResampling = None,
|
205 |
+
do_rescale: bool = None,
|
206 |
+
rescale_factor: float = None,
|
207 |
+
do_normalize: bool = None,
|
208 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
209 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
210 |
+
do_convert_rgb: bool = None,
|
211 |
+
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
212 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
213 |
+
num_images: Optional[int] = 1,
|
214 |
+
image_downsampling: Optional[int] = None,
|
215 |
+
):
|
216 |
+
"""
|
217 |
+
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
|
218 |
+
|
219 |
+
Args:
|
220 |
+
images (`ImageInput`):
|
221 |
+
Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
|
222 |
+
vision_info (`List[Dict]`, *optional*):
|
223 |
+
Optional list of dictionaries containing additional information about vision inputs.
|
224 |
+
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
225 |
+
Whether to resize the image.
|
226 |
+
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
|
227 |
+
Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
|
228 |
+
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
229 |
+
Whether to rescale the image.
|
230 |
+
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
231 |
+
Scale factor to use if rescaling the image.
|
232 |
+
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
233 |
+
Whether to normalize the image.
|
234 |
+
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
235 |
+
Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
236 |
+
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
237 |
+
Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
238 |
+
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
239 |
+
Whether to convert the image to RGB.
|
240 |
+
data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
|
241 |
+
The channel dimension format for the output image. Can be one of:
|
242 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
243 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
244 |
+
- Unset: Use the channel dimension format of the input image.
|
245 |
+
input_data_format (`ChannelDimension` or `str`, *optional*):
|
246 |
+
The channel dimension format for the input image. Can be one of:
|
247 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
248 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
249 |
+
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
250 |
+
"""
|
251 |
+
images = make_list_of_images(images)
|
252 |
+
|
253 |
+
if do_convert_rgb:
|
254 |
+
images = [convert_to_rgb(image) for image in images]
|
255 |
+
|
256 |
+
# All transformations expect numpy arrays.
|
257 |
+
images = [to_numpy_array(image) for image in images]
|
258 |
+
|
259 |
+
if is_scaled_image(images[0]) and do_rescale:
|
260 |
+
logger.warning_once(
|
261 |
+
"It looks like you are trying to rescale already rescaled images. If the input"
|
262 |
+
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
263 |
+
)
|
264 |
+
if input_data_format is None:
|
265 |
+
# We assume that all images have the same channel dimension format.
|
266 |
+
input_data_format = infer_channel_dimension_format(images[0])
|
267 |
+
|
268 |
+
height, width = get_image_size(images[0], channel_dim=input_data_format)
|
269 |
+
resized_height, resized_width = height, width
|
270 |
+
processed_images = []
|
271 |
+
for image in images:
|
272 |
+
if do_resize:
|
273 |
+
max_pixels = int(self.max_pixels / (self.merge_size / image_downsampling)**2)
|
274 |
+
resized_height, resized_width = smart_resize(
|
275 |
+
height,
|
276 |
+
width,
|
277 |
+
factor=self.patch_size * image_downsampling,
|
278 |
+
min_pixels=self.min_pixels,
|
279 |
+
max_pixels=int(max_pixels // num_images),
|
280 |
+
)
|
281 |
+
image = resize(
|
282 |
+
image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
|
283 |
+
)
|
284 |
+
|
285 |
+
if do_rescale:
|
286 |
+
image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
|
287 |
+
|
288 |
+
if do_normalize:
|
289 |
+
image = self.normalize(
|
290 |
+
image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
|
291 |
+
)
|
292 |
+
|
293 |
+
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
294 |
+
processed_images.append(image)
|
295 |
+
|
296 |
+
patches = np.array(processed_images)
|
297 |
+
if data_format == ChannelDimension.LAST:
|
298 |
+
patches = patches.transpose(0, 3, 1, 2)
|
299 |
+
|
300 |
+
channel = patches.shape[1]
|
301 |
+
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
|
302 |
+
patches = patches.reshape(
|
303 |
+
channel,
|
304 |
+
grid_h // image_downsampling,
|
305 |
+
image_downsampling,
|
306 |
+
self.patch_size,
|
307 |
+
grid_w // image_downsampling,
|
308 |
+
image_downsampling,
|
309 |
+
self.patch_size,
|
310 |
+
)
|
311 |
+
patches = patches.transpose(1, 4, 2, 5, 0, 3, 6)
|
312 |
+
flatten_patches = patches.reshape(
|
313 |
+
grid_h * grid_w, channel * self.patch_size * self.patch_size
|
314 |
+
)
|
315 |
+
# print('image_downsampling', image_downsampling)
|
316 |
+
# flatten_patches1 = flatten_patches.reshape(grid_h, grid_w, channel, -1)
|
317 |
+
# from matplotlib import pyplot as plt
|
318 |
+
# plt.imshow(flatten_patches1[:,:,:,0])
|
319 |
+
# plt.savefig('8.png')
|
320 |
+
|
321 |
+
return flatten_patches, (1, grid_h, grid_w)
|
322 |
+
|
323 |
+
def preprocess(
|
324 |
+
self,
|
325 |
+
images: ImageInput,
|
326 |
+
videos: VideoInput = None,
|
327 |
+
do_resize: bool = None,
|
328 |
+
size: Dict[str, int] = None,
|
329 |
+
resample: PILImageResampling = None,
|
330 |
+
do_rescale: bool = None,
|
331 |
+
rescale_factor: float = None,
|
332 |
+
do_normalize: bool = None,
|
333 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
334 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
335 |
+
do_convert_rgb: bool = None,
|
336 |
+
return_tensors: Optional[Union[str, TensorType]] = None,
|
337 |
+
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
338 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
339 |
+
num_images: Optional[int] = 1,
|
340 |
+
image_downsampling: Optional[int] = None,
|
341 |
+
):
|
342 |
+
"""
|
343 |
+
Args:
|
344 |
+
images (`ImageInput`):
|
345 |
+
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
|
346 |
+
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
|
347 |
+
videos (`VideoInput`):
|
348 |
+
Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
|
349 |
+
passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
|
350 |
+
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
351 |
+
Whether to resize the image.
|
352 |
+
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
|
353 |
+
Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
|
354 |
+
the longest edge resized to keep the input aspect ratio.
|
355 |
+
resample (`int`, *optional*, defaults to `self.resample`):
|
356 |
+
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
|
357 |
+
has an effect if `do_resize` is set to `True`.
|
358 |
+
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
359 |
+
Whether to rescale the image.
|
360 |
+
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
361 |
+
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
|
362 |
+
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
363 |
+
Whether to normalize the image.
|
364 |
+
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
365 |
+
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
|
366 |
+
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
367 |
+
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
|
368 |
+
`True`.
|
369 |
+
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
370 |
+
Whether to convert the image to RGB.
|
371 |
+
return_tensors (`str` or `TensorType`, *optional*):
|
372 |
+
The type of tensors to return. Can be one of:
|
373 |
+
- Unset: Return a list of `np.ndarray`.
|
374 |
+
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
|
375 |
+
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
|
376 |
+
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
|
377 |
+
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
|
378 |
+
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
|
379 |
+
The channel dimension format for the output image. Can be one of:
|
380 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
381 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
382 |
+
- Unset: Use the channel dimension format of the input image.
|
383 |
+
input_data_format (`ChannelDimension` or `str`, *optional*):
|
384 |
+
The channel dimension format for the input image. If unset, the channel dimension format is inferred
|
385 |
+
from the input image. Can be one of:
|
386 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
387 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
388 |
+
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
389 |
+
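num_images (`int`, *optional*, defaults to 1):
    Number of images in the current sample; the `max_pixels` budget is divided across them.
image_downsampling (`int`, *optional*, defaults to `self.merge_size`):
    Spatial merge (downsampling) factor used to pick the resize factor and the per-image pixel budget.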
|
390 |
+
"""
|
391 |
+
do_resize = do_resize if do_resize is not None else self.do_resize
|
392 |
+
size = size if size is not None else self.size
|
393 |
+
resample = resample if resample is not None else self.resample
|
394 |
+
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
395 |
+
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
396 |
+
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
397 |
+
image_mean = image_mean if image_mean is not None else self.image_mean
|
398 |
+
image_std = image_std if image_std is not None else self.image_std
|
399 |
+
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
|
400 |
+
image_downsampling = image_downsampling if image_downsampling is not None else self.merge_size
|
401 |
+
|
402 |
+
if images is not None:
|
403 |
+
images = make_batched_images(images)
|
404 |
+
if videos is not None:
|
405 |
+
videos = make_batched_videos(videos)
|
406 |
+
|
407 |
+
if images is not None and not valid_images(images):
|
408 |
+
raise ValueError(
|
409 |
+
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
|
410 |
+
"torch.Tensor, tf.Tensor or jax.ndarray."
|
411 |
+
)
|
412 |
+
|
413 |
+
validate_preprocess_arguments(
|
414 |
+
rescale_factor=rescale_factor,
|
415 |
+
do_normalize=do_normalize,
|
416 |
+
image_mean=image_mean,
|
417 |
+
image_std=image_std,
|
418 |
+
do_resize=do_resize,
|
419 |
+
size=size,
|
420 |
+
resample=resample,
|
421 |
+
)
|
422 |
+
|
423 |
+
if images is not None:
|
424 |
+
pixel_values, vision_grid_thws = [], []
|
425 |
+
for image in images:
|
426 |
+
patches, image_grid_thw = self._preprocess(
|
427 |
+
image,
|
428 |
+
do_resize=do_resize,
|
429 |
+
resample=resample,
|
430 |
+
do_rescale=do_rescale,
|
431 |
+
rescale_factor=rescale_factor,
|
432 |
+
do_normalize=do_normalize,
|
433 |
+
image_mean=image_mean,
|
434 |
+
image_std=image_std,
|
435 |
+
data_format=data_format,
|
436 |
+
do_convert_rgb=do_convert_rgb,
|
437 |
+
input_data_format=input_data_format,
|
438 |
+
num_images=num_images,
|
439 |
+
image_downsampling=image_downsampling,
|
440 |
+
)
|
441 |
+
pixel_values.extend(patches)
|
442 |
+
vision_grid_thws.append(image_grid_thw)
|
443 |
+
pixel_values = np.array(pixel_values)
|
444 |
+
vision_grid_thws = np.array(vision_grid_thws)
|
445 |
+
data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
|
446 |
+
|
447 |
+
assert videos is None, "Video inputs are not supported for now."
|
448 |
+
# NOTE: video inputs are not supported for now
|
449 |
+
# if videos is not None:
|
450 |
+
# pixel_values, vision_grid_thws = [], []
|
451 |
+
# for images in videos:
|
452 |
+
# patches, video_grid_thw = self._preprocess(
|
453 |
+
# images,
|
454 |
+
# do_resize=do_resize,
|
455 |
+
# resample=resample,
|
456 |
+
# do_rescale=do_rescale,
|
457 |
+
# rescale_factor=rescale_factor,
|
458 |
+
# do_normalize=do_normalize,
|
459 |
+
# image_mean=image_mean,
|
460 |
+
# image_std=image_std,
|
461 |
+
# data_format=data_format,
|
462 |
+
# do_convert_rgb=do_convert_rgb,
|
463 |
+
# input_data_format=input_data_format,
|
464 |
+
# image_num=image_num,
|
465 |
+
# )
|
466 |
+
# pixel_values.extend(patches)
|
467 |
+
# vision_grid_thws.append(video_grid_thw)
|
468 |
+
# pixel_values = np.array(pixel_values)
|
469 |
+
# vision_grid_thws = np.array(vision_grid_thws)
|
470 |
+
# data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}
|
471 |
+
|
472 |
+
return BatchFeature(data=data, tensor_type=return_tensors)
|
videollama3/model/damovl_encoder/modeling_damovl_encoder.py
ADDED
@@ -0,0 +1,542 @@
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
"""PyTorch Siglip model."""
|
16 |
+
|
17 |
+
import math
|
18 |
+
import warnings
|
19 |
+
from dataclasses import dataclass
|
20 |
+
from typing import Any, Optional, Tuple, Union
|
21 |
+
|
22 |
+
import numpy as np
|
23 |
+
import torch
|
24 |
+
import torch.utils.checkpoint
|
25 |
+
from torch import nn
|
26 |
+
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
27 |
+
from torch.nn.init import _calculate_fan_in_and_fan_out
|
28 |
+
import torch.nn.functional as F
|
29 |
+
|
30 |
+
from transformers.activations import ACT2FN
|
31 |
+
from transformers.modeling_utils import PreTrainedModel
|
32 |
+
from transformers.utils import (add_start_docstrings,
|
33 |
+
add_start_docstrings_to_model_forward,
|
34 |
+
is_flash_attn_2_available,
|
35 |
+
is_flash_attn_greater_or_equal_2_10, logging,
|
36 |
+
replace_return_docstrings)
|
37 |
+
from .configuration_damovl_encoder import DAMOVLVisionConfig
|
38 |
+
|
39 |
+
|
40 |
+
if is_flash_attn_2_available():
|
41 |
+
from flash_attn import flash_attn_varlen_func
|
42 |
+
from transformers.modeling_flash_attention_utils import \
|
43 |
+
_flash_attention_forward
|
44 |
+
else:
|
45 |
+
flash_attn_varlen_func = None
|
46 |
+
|
47 |
+
|
48 |
+
logger = logging.get_logger(__name__)
|
49 |
+
|
50 |
+
|
51 |
+
def _trunc_normal_(tensor, mean, std, a, b):
|
52 |
+
# Cut & paste from PyTorch official master until it's in a few official releases - RW
|
53 |
+
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
|
54 |
+
def norm_cdf(x):
|
55 |
+
# Computes standard normal cumulative distribution function
|
56 |
+
return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
|
57 |
+
|
58 |
+
if (mean < a - 2 * std) or (mean > b + 2 * std):
|
59 |
+
warnings.warn(
|
60 |
+
"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
|
61 |
+
"The distribution of values may be incorrect.",
|
62 |
+
stacklevel=2,
|
63 |
+
)
|
64 |
+
|
65 |
+
# Values are generated by using a truncated uniform distribution and
|
66 |
+
# then using the inverse CDF for the normal distribution.
|
67 |
+
# Get upper and lower cdf values
|
68 |
+
l = norm_cdf((a - mean) / std)
|
69 |
+
u = norm_cdf((b - mean) / std)
|
70 |
+
|
71 |
+
# Uniformly fill tensor with values from [l, u], then translate to
|
72 |
+
# [2l-1, 2u-1].
|
73 |
+
tensor.uniform_(2 * l - 1, 2 * u - 1)
|
74 |
+
|
75 |
+
# Use inverse cdf transform for normal distribution to get truncated
|
76 |
+
# standard normal
|
77 |
+
tensor.erfinv_()
|
78 |
+
|
79 |
+
# Transform to proper mean, std
|
80 |
+
tensor.mul_(std * math.sqrt(2.0))
|
81 |
+
tensor.add_(mean)
|
82 |
+
|
83 |
+
# Clamp to ensure it's in the proper range
|
84 |
+
tensor.clamp_(min=a, max=b)
|
85 |
+
|
86 |
+
|
87 |
+
def trunc_normal_tf_(
|
88 |
+
tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
|
89 |
+
) -> torch.Tensor:
|
90 |
+
"""Fills the input Tensor with values drawn from a truncated
|
91 |
+
normal distribution. The values are effectively drawn from the
|
92 |
+
normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
|
93 |
+
with values outside :math:`[a, b]` redrawn until they are within
|
94 |
+
the bounds. The method used for generating the random values works
|
95 |
+
best when :math:`a \\leq \text{mean} \\leq b`.
|
96 |
+
|
97 |
+
NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
|
98 |
+
bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
|
99 |
+
and the result is subsequently scaled and shifted by the mean and std args.
|
100 |
+
|
101 |
+
Args:
|
102 |
+
tensor: an n-dimensional `torch.Tensor`
|
103 |
+
mean: the mean of the normal distribution
|
104 |
+
std: the standard deviation of the normal distribution
|
105 |
+
a: the minimum cutoff value
|
106 |
+
b: the maximum cutoff value
|
107 |
+
"""
|
108 |
+
with torch.no_grad():
|
109 |
+
_trunc_normal_(tensor, 0, 1.0, a, b)
|
110 |
+
tensor.mul_(std).add_(mean)
|
111 |
+
|
112 |
+
|
113 |
+
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
|
114 |
+
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
|
115 |
+
if mode == "fan_in":
|
116 |
+
denom = fan_in
|
117 |
+
elif mode == "fan_out":
|
118 |
+
denom = fan_out
|
119 |
+
elif mode == "fan_avg":
|
120 |
+
denom = (fan_in + fan_out) / 2
|
121 |
+
|
122 |
+
variance = scale / denom
|
123 |
+
|
124 |
+
if distribution == "truncated_normal":
|
125 |
+
# constant is stddev of standard normal truncated to (-2, 2)
|
126 |
+
trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
|
127 |
+
elif distribution == "normal":
|
128 |
+
with torch.no_grad():
|
129 |
+
tensor.normal_(std=math.sqrt(variance))
|
130 |
+
elif distribution == "uniform":
|
131 |
+
bound = math.sqrt(3 * variance)
|
132 |
+
with torch.no_grad():
|
133 |
+
tensor.uniform_(-bound, bound)
|
134 |
+
else:
|
135 |
+
raise ValueError(f"invalid distribution {distribution}")
|
136 |
+
|
137 |
+
|
138 |
+
def lecun_normal_(tensor):
|
139 |
+
variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
|
140 |
+
|
141 |
+
|
142 |
+
def default_flax_embed_init(tensor):
|
143 |
+
variance_scaling_(tensor, mode="fan_in", distribution="normal")
|
144 |
+
|
145 |
+
|
146 |
+
# Copied from transformers.models.llama.modeling_llama.rotate_half
|
147 |
+
def rotate_half(x):
|
148 |
+
"""Rotates half the hidden dims of the input."""
|
149 |
+
x1 = x[..., : x.shape[-1] // 2]
|
150 |
+
x2 = x[..., x.shape[-1] // 2 :]
|
151 |
+
return torch.cat((-x2, x1), dim=-1)
|
152 |
+
|
153 |
+
|
154 |
+
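# Apply 2D rotary position embeddings to vision tokens; the computation runs in float32
# and is cast back to the original dtype to avoid precision loss.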
def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
|
155 |
+
orig_dtype = tensor.dtype
|
156 |
+
tensor = tensor.float()
|
157 |
+
cos = freqs.cos()
|
158 |
+
sin = freqs.sin()
|
159 |
+
cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
|
160 |
+
sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
|
161 |
+
output = (tensor * cos) + (rotate_half(tensor) * sin)
|
162 |
+
output = output.to(orig_dtype)
|
163 |
+
return output
|
164 |
+
|
165 |
+
|
166 |
+
class VisionRotaryEmbedding(nn.Module):
|
167 |
+
def __init__(self, dim: int, theta: float = 10000.0) -> None:
|
168 |
+
super().__init__()
|
169 |
+
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
|
170 |
+
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
171 |
+
|
172 |
+
def forward(self, seqlen: int) -> torch.Tensor:
|
173 |
+
seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
|
174 |
+
freqs = torch.outer(seq, self.inv_freq)
|
175 |
+
return freqs
|
176 |
+
|
177 |
+
|
178 |
+
class DAMOVLVisionEmbeddings(nn.Module):
|
179 |
+
def __init__(self, config: DAMOVLVisionConfig):
|
180 |
+
super().__init__()
|
181 |
+
self.config = config
|
182 |
+
self.embed_dim = config.hidden_size
|
183 |
+
self.patch_size = config.patch_size
|
184 |
+
|
185 |
+
self.patch_embedding = nn.Conv2d(
|
186 |
+
in_channels=config.num_channels,
|
187 |
+
out_channels=self.embed_dim,
|
188 |
+
kernel_size=self.patch_size,
|
189 |
+
stride=self.patch_size,
|
190 |
+
padding="valid",
|
191 |
+
)
|
192 |
+
|
193 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
194 |
+
hidden_states = hidden_states.view(
|
195 |
+
-1, self.config.num_channels, self.patch_size, self.patch_size
|
196 |
+
)
|
197 |
+
patch_embeds = self.patch_embedding(hidden_states) # shape = [*, width, grid, grid]
|
198 |
+
# embeddings = patch_embeds.flatten(2).transpose(1, 2)
|
199 |
+
embeddings = patch_embeds.view(-1, self.embed_dim)
|
200 |
+
|
201 |
+
return embeddings
|
202 |
+
|
203 |
+
|
204 |
+
class VisionAttention(nn.Module):
|
205 |
+
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
206 |
+
|
207 |
+
# Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
|
208 |
+
def __init__(self, config):
|
209 |
+
super().__init__()
|
210 |
+
self.config = config
|
211 |
+
self.embed_dim = config.hidden_size
|
212 |
+
self.num_heads = config.num_attention_heads
|
213 |
+
self.head_dim = self.embed_dim // self.num_heads
|
214 |
+
if self.head_dim * self.num_heads != self.embed_dim:
|
215 |
+
raise ValueError(
|
216 |
+
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
|
217 |
+
f" {self.num_heads})."
|
218 |
+
)
|
219 |
+
self.scale = self.head_dim**-0.5
|
220 |
+
self.dropout = config.attention_dropout
|
221 |
+
|
222 |
+
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
|
223 |
+
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
|
224 |
+
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
|
225 |
+
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
|
226 |
+
|
227 |
+
def forward(
|
228 |
+
self,
|
229 |
+
hidden_states: torch.Tensor,
|
230 |
+
cu_seqlens: torch.Tensor,
|
231 |
+
rotary_pos_emb: torch.Tensor = None,
|
232 |
+
) -> torch.Tensor:
|
233 |
+
"""Input shape: Time x Channel"""
|
234 |
+
|
235 |
+
q_len, _ = hidden_states.size()
|
236 |
+
|
237 |
+
query_states = self.q_proj(hidden_states)
|
238 |
+
key_states = self.k_proj(hidden_states)
|
239 |
+
value_states = self.v_proj(hidden_states)
|
240 |
+
|
241 |
+
query_states = query_states.view(q_len, self.num_heads, self.head_dim)
|
242 |
+
key_states = key_states.view(q_len, self.num_heads, self.head_dim)
|
243 |
+
value_states = value_states.view(q_len, self.num_heads, self.head_dim)
|
244 |
+
|
245 |
+
query_states = apply_rotary_pos_emb_vision(query_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
|
246 |
+
key_states = apply_rotary_pos_emb_vision(key_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
|
247 |
+
|
248 |
+
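# Build a block-diagonal mask so tokens only attend to tokens from the same image/frame,
# with the segment boundaries given by cu_seqlens.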
attention_mask = torch.zeros([1, q_len, q_len], device=query_states.device, dtype=torch.bool)
|
249 |
+
for i in range(1, len(cu_seqlens)):
|
250 |
+
attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
|
251 |
+
|
252 |
+
query_states = query_states.transpose(0, 1)
|
253 |
+
key_states = key_states.transpose(0, 1)
|
254 |
+
value_states = value_states.transpose(0, 1)
|
255 |
+
|
256 |
+
attn_weights = torch.matmul(query_states, key_states.transpose(1, 2)) / math.sqrt(self.head_dim)
|
257 |
+
attn_weights = attn_weights.masked_fill(~attention_mask, torch.finfo(attn_weights.dtype).min)  # mask out tokens from other images
|
258 |
+
|
259 |
+
# upcast attention to fp32
|
260 |
+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
|
261 |
+
attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
|
262 |
+
attn_output = torch.matmul(attn_weights, value_states)
|
263 |
+
|
264 |
+
attn_output = attn_output.transpose(0, 1)
|
265 |
+
attn_output = attn_output.reshape(q_len, -1)
|
266 |
+
attn_output = self.out_proj(attn_output)
|
267 |
+
|
268 |
+
return attn_output
|
269 |
+
|
270 |
+
|
271 |
+
class VisionFlashAttention2(VisionAttention):
|
272 |
+
def __init__(self, *args, **kwargs):
|
273 |
+
super().__init__(*args, **kwargs)
|
274 |
+
|
275 |
+
# Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward
|
276 |
+
def forward(
|
277 |
+
self,
|
278 |
+
hidden_states: torch.Tensor,
|
279 |
+
cu_seqlens: torch.Tensor,
|
280 |
+
rotary_pos_emb: torch.Tensor = None,
|
281 |
+
) -> torch.Tensor:
|
282 |
+
q_len, _ = hidden_states.size()
|
283 |
+
|
284 |
+
query_states = self.q_proj(hidden_states)
|
285 |
+
key_states = self.k_proj(hidden_states)
|
286 |
+
value_states = self.v_proj(hidden_states)
|
287 |
+
|
288 |
+
# Flash attention requires the input to have the shape
|
289 |
+
# batch_size x seq_length x head_dim x hidden_dim
|
290 |
+
# therefore we just need to keep the original shape
|
291 |
+
query_states = query_states.view(q_len, self.num_heads, self.head_dim)
|
292 |
+
key_states = key_states.view(q_len, self.num_heads, self.head_dim)
|
293 |
+
value_states = value_states.view(q_len, self.num_heads, self.head_dim)
|
294 |
+
query_states = apply_rotary_pos_emb_vision(query_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
|
295 |
+
key_states = apply_rotary_pos_emb_vision(key_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
|
296 |
+
|
297 |
+
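# flash_attn_varlen_func consumes packed (total_tokens, num_heads, head_dim) tensors;
# cu_seqlens marks the per-image segment boundaries and max_seqlen is the longest segment.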
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
|
298 |
+
attn_output = flash_attn_varlen_func(query_states, key_states, value_states, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
|
299 |
+
q_len, -1
|
300 |
+
)
|
301 |
+
attn_output = self.out_proj(attn_output)
|
302 |
+
|
303 |
+
return attn_output
|
304 |
+
|
305 |
+
|
306 |
+
class VisionSdpaAttention(VisionAttention):
|
307 |
+
def forward(
|
308 |
+
self,
|
309 |
+
hidden_states: torch.Tensor,
|
310 |
+
cu_seqlens: torch.Tensor,
|
311 |
+
rotary_pos_emb: torch.Tensor = None,
output_attentions: bool = False,  # only used to trigger the eager fallback below
|
312 |
+
) -> torch.Tensor:
|
313 |
+
if output_attentions:
|
314 |
+
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
|
315 |
+
logger.warning_once(
|
316 |
+
"DAMOVLVisionModel is using VisionSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
|
317 |
+
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
318 |
+
)
|
319 |
+
return super().forward(
|
320 |
+
hidden_states=hidden_states,
|
321 |
+
cu_seqlens=cu_seqlens,
|
322 |
+
rotary_pos_emb=rotary_pos_emb,
|
323 |
+
)
|
324 |
+
|
325 |
+
seq_length = hidden_states.shape[0]
|
326 |
+
query_states = self.q_proj(hidden_states)
|
327 |
+
key_states = self.k_proj(hidden_states)
|
328 |
+
value_states = self.v_proj(hidden_states)
|
329 |
+
|
330 |
+
query_states = query_states.view(seq_length, self.num_heads, self.head_dim)
|
331 |
+
key_states = key_states.view(seq_length, self.num_heads, self.head_dim)
|
332 |
+
value_states = value_states.view(seq_length, self.num_heads, self.head_dim)
|
333 |
+
|
334 |
+
query_states = apply_rotary_pos_emb_vision(query_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
|
335 |
+
key_states = apply_rotary_pos_emb_vision(key_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
|
336 |
+
|
337 |
+
attention_mask = torch.zeros([1, seq_length, seq_length], device=query_states.device, dtype=torch.bool)
|
338 |
+
for i in range(1, len(cu_seqlens)):
|
339 |
+
attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
|
340 |
+
|
341 |
+
query_states = query_states.transpose(0, 1)
|
342 |
+
key_states = key_states.transpose(0, 1)
|
343 |
+
value_states = value_states.transpose(0, 1)
|
344 |
+
attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attention_mask, dropout_p=0.0)
|
345 |
+
attn_output = attn_output.transpose(0, 1)
|
346 |
+
attn_output = attn_output.reshape(seq_length, -1)
|
347 |
+
attn_output = self.out_proj(attn_output)
|
348 |
+
return attn_output
|
349 |
+
|
350 |
+
|
351 |
+
DAMOVL_VISION_ATTENTION_CLASSES = {
|
352 |
+
"eager": VisionAttention,
|
353 |
+
"flash_attention_2": VisionFlashAttention2,
|
354 |
+
"sdpa": VisionSdpaAttention,
|
355 |
+
}
|
356 |
+
|
357 |
+
|
358 |
+
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->DAMOVL
|
359 |
+
class DAMOVLVisionMLP(nn.Module):
|
360 |
+
def __init__(self, config):
|
361 |
+
super().__init__()
|
362 |
+
self.config = config
|
363 |
+
self.activation_fn = ACT2FN[config.hidden_act]
|
364 |
+
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
|
365 |
+
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
|
366 |
+
|
367 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
368 |
+
hidden_states = self.fc1(hidden_states)
|
369 |
+
hidden_states = self.activation_fn(hidden_states)
|
370 |
+
hidden_states = self.fc2(hidden_states)
|
371 |
+
return hidden_states
|
372 |
+
|
373 |
+
|
374 |
+
class DAMOVLVisionEncoderLayer(nn.Module):
|
375 |
+
def __init__(self, config: DAMOVLVisionConfig):
|
376 |
+
super().__init__()
|
377 |
+
self.embed_dim = config.hidden_size
|
378 |
+
self.self_attn = DAMOVL_VISION_ATTENTION_CLASSES[config._attn_implementation](config=config)
|
379 |
+
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
380 |
+
self.mlp = DAMOVLVisionMLP(config)
|
381 |
+
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
382 |
+
|
383 |
+
# Ignore copy
|
384 |
+
def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
|
385 |
+
hidden_states = hidden_states + self.self_attn(
|
386 |
+
self.layer_norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
|
387 |
+
)
|
388 |
+
hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))
|
389 |
+
return hidden_states
|
390 |
+
|
391 |
+
|
392 |
+
class DAMOVLPreTrainedModel(PreTrainedModel):
|
393 |
+
"""
|
394 |
+
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
395 |
+
models.
|
396 |
+
"""
|
397 |
+
|
398 |
+
config_class = DAMOVLVisionConfig
|
399 |
+
base_model_prefix = "damovl"
|
400 |
+
supports_gradient_checkpointing = True
|
401 |
+
_no_split_modules = [
|
402 |
+
"DAMOVLVisionEncoderLayer",
|
403 |
+
"DAMOVLVisionEmbeddings",
|
404 |
+
]
|
405 |
+
_supports_flash_attn_2 = True
|
406 |
+
_supports_sdpa = True
|
407 |
+
|
408 |
+
def _init_weights(self, module):
|
409 |
+
"""Initialize the weights"""
|
410 |
+
if isinstance(module, nn.Embedding):
|
411 |
+
default_flax_embed_init(module.weight)
|
412 |
+
elif isinstance(module, VisionAttention):
|
413 |
+
nn.init.xavier_uniform_(module.q_proj.weight)
|
414 |
+
nn.init.xavier_uniform_(module.k_proj.weight)
|
415 |
+
nn.init.xavier_uniform_(module.v_proj.weight)
|
416 |
+
nn.init.xavier_uniform_(module.out_proj.weight)
|
417 |
+
nn.init.zeros_(module.q_proj.bias)
|
418 |
+
nn.init.zeros_(module.k_proj.bias)
|
419 |
+
nn.init.zeros_(module.v_proj.bias)
|
420 |
+
nn.init.zeros_(module.out_proj.bias)
|
421 |
+
elif isinstance(module, DAMOVLVisionMLP):
|
422 |
+
nn.init.xavier_uniform_(module.fc1.weight)
|
423 |
+
nn.init.xavier_uniform_(module.fc2.weight)
|
424 |
+
nn.init.normal_(module.fc1.bias, std=1e-6)
|
425 |
+
nn.init.normal_(module.fc2.bias, std=1e-6)
|
426 |
+
elif isinstance(module, (nn.Linear, nn.Conv2d)):
|
427 |
+
lecun_normal_(module.weight)
|
428 |
+
if module.bias is not None:
|
429 |
+
nn.init.zeros_(module.bias)
|
430 |
+
elif isinstance(module, nn.LayerNorm):
|
431 |
+
module.bias.data.zero_()
|
432 |
+
module.weight.data.fill_(1.0)
|
433 |
+
|
434 |
+
|
435 |
+
class DAMOVLVisionEncoder(nn.Module):
|
436 |
+
def __init__(self, config: DAMOVLVisionConfig):
|
437 |
+
super().__init__()
|
438 |
+
self.config = config
|
439 |
+
head_dim = config.hidden_size // config.num_attention_heads
|
440 |
+
self.spatial_merge_size = config.spatial_merge_size
|
441 |
+
self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
|
442 |
+
self.layers = nn.ModuleList([DAMOVLVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
|
443 |
+
self.gradient_checkpointing = False
|
444 |
+
|
445 |
+
def rot_pos_emb(self, grid_thw, strides):
|
446 |
+
pos_ids = []
|
447 |
+
for (t, h, w), stride in zip(grid_thw, strides):
|
448 |
+
hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
|
449 |
+
hpos_ids = hpos_ids.reshape(
|
450 |
+
h // stride,
|
451 |
+
stride,
|
452 |
+
w // stride,
|
453 |
+
stride,
|
454 |
+
)
|
455 |
+
hpos_ids = hpos_ids.permute(0, 2, 1, 3)
|
456 |
+
hpos_ids = hpos_ids.flatten()
|
457 |
+
|
458 |
+
wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
|
459 |
+
wpos_ids = wpos_ids.reshape(
|
460 |
+
h // stride,
|
461 |
+
stride,
|
462 |
+
w // stride,
|
463 |
+
stride,
|
464 |
+
)
|
465 |
+
wpos_ids = wpos_ids.permute(0, 2, 1, 3)
|
466 |
+
wpos_ids = wpos_ids.flatten()
|
467 |
+
pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
|
468 |
+
pos_ids = torch.cat(pos_ids, dim=0)
|
469 |
+
max_grid_size = grid_thw[:, 1:].max()
|
470 |
+
rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
|
471 |
+
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
|
472 |
+
return rotary_pos_emb
|
473 |
+
|
474 |
+
def forward(self, hidden_states, grid_thws, strides) -> torch.Tensor:
|
475 |
+
# BUG: The following code causes a DeepSpeed issue: `RuntimeError: disagreement between rank0 and rankx`
|
476 |
+
# rotary_pos_emb = []
|
477 |
+
# for thw in grid_thws:
|
478 |
+
# rotary_pos_emb.append(self.rot_pos_emb(thw).unsqueeze(0))
|
479 |
+
# rotary_pos_emb1 = torch.cat(rotary_pos_emb, dim=1).squeeze(0)
|
480 |
+
# grid_thws = torch.cat(grid_thws, dim = 0)
|
481 |
+
|
482 |
+
# new version of creating rotary position embedding
|
483 |
+
# grid_thws has shape [batch_flatten_image_num, 3]
|
484 |
+
# grid_thws = torch.cat(grid_thws, dim = 0) # is conducted in the `encoder.py`
|
485 |
+
rotary_pos_emb = self.rot_pos_emb(grid_thws, strides)
|
486 |
+
|
487 |
+
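# Cumulative token counts per frame (h * w repeated t times), used to delimit
# per-image segments for the attention masks / varlen attention kernels.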
cu_seqlens = torch.repeat_interleave(grid_thws[:, 1] * grid_thws[:, 2], grid_thws[:, 0]).cumsum(dim=0, dtype=torch.int32)
|
488 |
+
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
|
489 |
+
|
490 |
+
for blk in self.layers:
|
491 |
+
if self.gradient_checkpointing and self.training:
|
492 |
+
hidden_states = self._gradient_checkpointing_func(
|
493 |
+
blk.__call__,
|
494 |
+
hidden_states,
|
495 |
+
cu_seqlens,
|
496 |
+
rotary_pos_emb
|
497 |
+
)
|
498 |
+
else:
|
499 |
+
hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
|
500 |
+
return hidden_states
|
501 |
+
|
502 |
+
|
503 |
+
class DAMOVLVisionTransformer(nn.Module):
|
504 |
+
def __init__(self, config: DAMOVLVisionConfig):
|
505 |
+
super().__init__()
|
506 |
+
self.config = config
|
507 |
+
embed_dim = config.hidden_size
|
508 |
+
|
509 |
+
self.embeddings = DAMOVLVisionEmbeddings(config)
|
510 |
+
self.encoder = DAMOVLVisionEncoder(config)
|
511 |
+
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
512 |
+
|
513 |
+
def forward(self, hidden_states, grid_thws, strides) -> torch.Tensor:
|
514 |
+
|
515 |
+
# print(hidden_states)
|
516 |
+
|
517 |
+
# hidden_states = torch.cat(hidden_states, dim = 1)
|
518 |
+
|
519 |
+
hidden_states = self.embeddings(hidden_states)
|
520 |
+
hidden_states = self.encoder(hidden_states, grid_thws, strides)
|
521 |
+
hidden_states = self.post_layernorm(hidden_states)
|
522 |
+
|
523 |
+
return hidden_states
|
524 |
+
|
525 |
+
|
526 |
+
class DAMOVLVisionModel(DAMOVLPreTrainedModel):
|
527 |
+
config_class = DAMOVLVisionConfig
|
528 |
+
main_input_name = "hidden_states"
|
529 |
+
|
530 |
+
def __init__(self, config: DAMOVLVisionConfig):
|
531 |
+
super().__init__(config)
|
532 |
+
|
533 |
+
self.vision_model = DAMOVLVisionTransformer(config)
|
534 |
+
|
535 |
+
# Initialize weights and apply final processing
|
536 |
+
self.post_init()
|
537 |
+
|
538 |
+
def get_input_embeddings(self) -> nn.Module:
|
539 |
+
return self.vision_model.embeddings.patch_embedding
|
540 |
+
|
541 |
+
def forward(self, hidden_states, grid_thws, strides) -> torch.Tensor:
|
542 |
+
return self.vision_model(hidden_states=hidden_states, grid_thws=grid_thws, strides=strides)
|
videollama3/model/encoder.py
ADDED
@@ -0,0 +1,385 @@
|
1 |
+
import os
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
from transformers import (CLIPImageProcessor, CLIPVisionConfig,
|
6 |
+
CLIPVisionModel, SiglipImageProcessor,
|
7 |
+
SiglipVisionConfig, SiglipVisionModel)
|
8 |
+
|
9 |
+
from .qwen2vl_encoder import (Qwen2VisionTransformerPretrainedModel,
|
10 |
+
Qwen2VLImageProcessor, Qwen2VLVisionConfig)
|
11 |
+
|
12 |
+
from .damovl_encoder import (DAMOVLImageProcessor, DAMOVLVisionModel)
|
13 |
+
|
14 |
+
|
15 |
+
class CLIPVisionEncoder(nn.Module):
|
16 |
+
|
17 |
+
def __init__(self, vision_encoder, args, delay_load=False):
|
18 |
+
super().__init__()
|
19 |
+
|
20 |
+
self.is_loaded = False
|
21 |
+
|
22 |
+
self.vision_encoder_name = vision_encoder
|
23 |
+
self.select_layer = args.mm_vision_select_layer
|
24 |
+
self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
|
25 |
+
|
26 |
+
if not delay_load:
|
27 |
+
self.attn_implementation = getattr(args, 'mm_attn_implementation', 'flash_attention_2')
|
28 |
+
self.load_model()
|
29 |
+
else:
|
30 |
+
# uncertain whether flash-attention-2 is supported during inference phase.
|
31 |
+
self.attn_implementation = 'sdpa' # 'eager'
|
32 |
+
self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_encoder_name)
|
33 |
+
|
34 |
+
def load_model(self):
|
35 |
+
if self.is_loaded:
|
36 |
+
print('Vision tower is already loaded; skipping repeated `load_model` call.')
|
37 |
+
return
|
38 |
+
|
39 |
+
self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_encoder_name)
|
40 |
+
|
41 |
+
self.vision_encoder = CLIPVisionModel.from_pretrained(self.vision_encoder_name,
|
42 |
+
attn_implementation=self.attn_implementation)
|
43 |
+
|
44 |
+
self.is_loaded = True
|
45 |
+
|
46 |
+
def feature_select(self, image_forward_outs):
|
47 |
+
image_features = image_forward_outs.hidden_states[self.select_layer]
|
48 |
+
if self.select_feature == 'patch':
|
49 |
+
image_features = image_features[:, 1:]
|
50 |
+
elif self.select_feature == 'cls_patch':
|
51 |
+
image_features = image_features
|
52 |
+
else:
|
53 |
+
raise ValueError(f'Unexpected select feature: {self.select_feature}')
|
54 |
+
return image_features
|
55 |
+
|
56 |
+
def forward(self, images, **kwargs):
|
57 |
+
images = torch.cat(images)
|
58 |
+
if type(images) is list:
|
59 |
+
image_features = []
|
60 |
+
for image in images:
|
61 |
+
image_forward_out = self.vision_encoder(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
|
62 |
+
image_feature = self.feature_select(image_forward_out).to(image.dtype)
|
63 |
+
image_features.append(image_feature)
|
64 |
+
else:
|
65 |
+
image_forward_outs = self.vision_encoder(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
|
66 |
+
image_features = self.feature_select(image_forward_outs).to(images.dtype)
|
67 |
+
|
68 |
+
return image_features
|
69 |
+
|
70 |
+
@property
|
71 |
+
def dummy_feature(self):
|
72 |
+
return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
|
73 |
+
|
74 |
+
@property
|
75 |
+
def dtype(self):
|
76 |
+
return self.vision_encoder.dtype
|
77 |
+
|
78 |
+
@property
|
79 |
+
def device(self):
|
80 |
+
return self.vision_encoder.device
|
81 |
+
|
82 |
+
@property
|
83 |
+
def config(self):
|
84 |
+
if self.is_loaded:
|
85 |
+
return self.vision_encoder.config
|
86 |
+
else:
|
87 |
+
return self.cfg_only
|
88 |
+
|
89 |
+
@property
|
90 |
+
def hidden_size(self):
|
91 |
+
return self.config.hidden_size
|
92 |
+
|
93 |
+
@property
|
94 |
+
def num_patches(self):
|
95 |
+
return (self.config.image_size // self.config.patch_size) ** 2
|
96 |
+
|
97 |
+
@property
|
98 |
+
def num_patches_per_side(self):
|
99 |
+
return self.config.image_size // self.config.patch_size
|
100 |
+
|
101 |
+
@property
|
102 |
+
def image_size(self):
|
103 |
+
return self.config.image_size
|
104 |
+
|
105 |
+
|
106 |
+
class SiglipVisionEncoder(nn.Module):
|
107 |
+
|
108 |
+
def __init__(self, vision_encoder, args, delay_load=False):
|
109 |
+
super().__init__()
|
110 |
+
|
111 |
+
self.is_loaded = False
|
112 |
+
|
113 |
+
self.vision_encoder_name = vision_encoder
|
114 |
+
self.select_layer = args.mm_vision_select_layer
|
115 |
+
self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
|
116 |
+
|
117 |
+
if not delay_load:
|
118 |
+
self.attn_implementation = getattr(args, 'mm_attn_implementation', 'flash_attention_2')
|
119 |
+
self.load_model()
|
120 |
+
else:
|
121 |
+
# uncertain whether flash-attention-2 is supported during inference phase.
|
122 |
+
self.attn_implementation = 'sdpa' # 'eager'
|
123 |
+
self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_encoder_name)
|
124 |
+
|
125 |
+
def load_model(self):
|
126 |
+
if self.is_loaded:
|
127 |
+
print('Vision tower is already loaded; skipping repeated `load_model` call.')
|
128 |
+
return
|
129 |
+
|
130 |
+
self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_encoder_name)
|
131 |
+
|
132 |
+
self.vision_encoder = SiglipVisionModel.from_pretrained(self.vision_encoder_name,
|
133 |
+
attn_implementation=self.attn_implementation)
|
134 |
+
|
135 |
+
self.is_loaded = True
|
136 |
+
|
137 |
+
def feature_select(self, image_forward_outs):
|
138 |
+
image_features = image_forward_outs.hidden_states[self.select_layer]
|
139 |
+
if self.select_feature == 'patch':
|
140 |
+
image_features = image_features
|
141 |
+
else:
|
142 |
+
raise ValueError(f'Unexpected select feature: {self.select_feature}')
|
143 |
+
return image_features
|
144 |
+
|
145 |
+
def forward(self, images, **kwargs):
|
146 |
+
images = torch.cat(images)
|
147 |
+
if type(images) is list:
|
148 |
+
image_features = []
|
149 |
+
for image in images:
|
150 |
+
image_forward_out = self.vision_encoder(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
|
151 |
+
image_feature = self.feature_select(image_forward_out).to(image.dtype)
|
152 |
+
image_features.append(image_feature)
|
153 |
+
else:
|
154 |
+
image_forward_outs = self.vision_encoder(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
|
155 |
+
image_features = self.feature_select(image_forward_outs).to(images.dtype)
|
156 |
+
|
157 |
+
return image_features
|
158 |
+
|
159 |
+
@property
|
160 |
+
def dummy_feature(self):
|
161 |
+
return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
|
162 |
+
|
163 |
+
@property
|
164 |
+
def dtype(self):
|
165 |
+
return self.vision_encoder.dtype
|
166 |
+
|
167 |
+
@property
|
168 |
+
def device(self):
|
169 |
+
return self.vision_encoder.device
|
170 |
+
|
171 |
+
@property
|
172 |
+
def config(self):
|
173 |
+
if self.is_loaded:
|
174 |
+
return self.vision_encoder.config
|
175 |
+
else:
|
176 |
+
return self.cfg_only
|
177 |
+
|
178 |
+
@property
|
179 |
+
def hidden_size(self):
|
180 |
+
return self.config.hidden_size
|
181 |
+
|
182 |
+
@property
|
183 |
+
def num_patches(self):
|
184 |
+
return (self.config.image_size // self.config.patch_size) ** 2
|
185 |
+
|
186 |
+
@property
|
187 |
+
def num_patches_per_side(self):
|
188 |
+
return self.config.image_size // self.config.patch_size
|
189 |
+
|
190 |
+
@property
|
191 |
+
def image_size(self):
|
192 |
+
return self.config.image_size
|
193 |
+
|
194 |
+
|
195 |
+
class Qwen2VLVisionEncoder(nn.Module):
|
196 |
+
|
197 |
+
def __init__(self, vision_encoder, args, delay_load=False):
|
198 |
+
super().__init__()
|
199 |
+
|
200 |
+
self.is_loaded = False
|
201 |
+
|
202 |
+
self.vision_encoder_name = vision_encoder
|
203 |
+
self.select_layer = args.mm_vision_select_layer
|
204 |
+
|
205 |
+
if not delay_load:
|
206 |
+
self.attn_implementation = getattr(args, 'mm_attn_implementation', 'flash_attention_2')
|
207 |
+
self.load_model(args)
|
208 |
+
else:
|
209 |
+
# uncertain whether flash-attention-2 is supported during inference phase.
|
210 |
+
self.attn_implementation = 'sdpa' # 'eager'
|
211 |
+
self.cfg_only = Qwen2VLVisionConfig.from_pretrained(self.vision_encoder_name)
|
212 |
+
|
213 |
+
def load_model(self, args):
|
214 |
+
if self.is_loaded:
|
215 |
+
print('Vision tower is already loaded; skipping repeated `load_model` call.')
|
216 |
+
return
|
217 |
+
|
218 |
+
# merge_size is set to 1 by default, because STAGE1, STAGE1.5, STAGE2 are trained with merge_size=1
|
219 |
+
# for stage 3, the merge_size is set to 2 via arguments.
|
220 |
+
self.image_processor = Qwen2VLImageProcessor.from_pretrained(self.vision_encoder_name)
|
221 |
+
self.image_processor.merge_size = args.spatial_merge_size
|
222 |
+
# NOTE: The maximum number of vision tokens is 8192 by default.
|
223 |
+
mm_max_length = args.mm_max_length if hasattr(args, 'mm_max_length') else 9477 // (args.spatial_merge_size**2)
|
224 |
+
self.image_processor.max_pixels = mm_max_length * (args.spatial_merge_size**2 * self.image_processor.patch_size**2)
|
225 |
+
self.image_processor.size["max_pixels"] = self.image_processor.max_pixels
|
226 |
+
|
227 |
+
# merge_size is fixed to 1 for STAGE1, STAGE1.5, STAGE2, STAGE3 in encoder and can be modified in connector.
|
228 |
+
self.cfg_only = Qwen2VLVisionConfig.from_pretrained(self.vision_encoder_name)
|
229 |
+
self.cfg_only.spatial_merge_size = args.spatial_merge_size
|
230 |
+
|
231 |
+
self.vision_encoder = Qwen2VisionTransformerPretrainedModel.from_pretrained(
|
232 |
+
self.vision_encoder_name,
|
233 |
+
config=self.cfg_only,
|
234 |
+
torch_dtype=args.torch_dtype,
|
235 |
+
attn_implementation=self.attn_implementation)
|
236 |
+
|
237 |
+
self.is_loaded = True
|
238 |
+
|
239 |
+
def forward(self, images, grid_thws, strides, **kwargs):
|
240 |
+
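# Flatten the per-sample lists of images, grid sizes and strides into single batched
# tensors before running the vision transformer.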
images = [image for sub_images in images for image in sub_images]
|
241 |
+
grid_thws = [grid_thw for sub_grid_thws in grid_thws for grid_thw in sub_grid_thws]
|
242 |
+
strides = [stride for sub_strides in strides for stride in sub_strides]
|
243 |
+
|
244 |
+
images = torch.cat(images, dim=0)
|
245 |
+
grid_thws = torch.cat(grid_thws, dim=0)
|
246 |
+
|
247 |
+
image_features = self.vision_encoder(images, grid_thws, strides=strides)
|
248 |
+
|
249 |
+
return image_features
|
250 |
+
|
251 |
+
@property
|
252 |
+
def dummy_feature(self):
|
253 |
+
return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
|
254 |
+
|
255 |
+
@property
|
256 |
+
def dtype(self):
|
257 |
+
return self.vision_encoder.dtype
|
258 |
+
|
259 |
+
@property
|
260 |
+
def device(self):
|
261 |
+
return self.vision_encoder.device
|
262 |
+
|
263 |
+
@property
|
264 |
+
def config(self):
|
265 |
+
if self.is_loaded:
|
266 |
+
return self.vision_encoder.config
|
267 |
+
else:
|
268 |
+
return self.cfg_only
|
269 |
+
|
270 |
+
@property
|
271 |
+
def hidden_size(self):
|
272 |
+
return self.config.hidden_size
|
273 |
+
|
274 |
+
@property
|
275 |
+
def num_patches(self):
|
276 |
+
return -1
|
277 |
+
|
278 |
+
@property
|
279 |
+
def num_patches_per_side(self):
|
280 |
+
return -1
|
281 |
+
|
282 |
+
@property
|
283 |
+
def image_size(self):
|
284 |
+
return 14 * self.vision_encoder.config.spatial_merge_size
|
285 |
+
|
286 |
+
|
287 |
+
class DAMOVLVisionEncoder(nn.Module):
|
288 |
+
|
289 |
+
def __init__(self, vision_encoder, args, delay_load=False):
|
290 |
+
super().__init__()
|
291 |
+
|
292 |
+
self.is_loaded = False
|
293 |
+
|
294 |
+
self.vision_encoder_name = vision_encoder
|
295 |
+
self.args = args
|
296 |
+
|
297 |
+
if not delay_load:
|
298 |
+
self.attn_implementation = getattr(args, 'mm_attn_implementation', 'flash_attention_2')
|
299 |
+
self.load_model(self.args)
|
300 |
+
else:
|
301 |
+
# uncertain whether flash-attention-2 is supported during inference phase.
|
302 |
+
self.attn_implementation = 'sdpa' # 'eager'
|
303 |
+
self.cfg_only = DAMOVLVisionConfig.from_pretrained(self.vision_encoder_name)
|
304 |
+
|
305 |
+
def load_model(self, args):
|
306 |
+
if self.is_loaded:
|
307 |
+
print('Vision tower is already loaded; skipping repeated `load_model` call.')
|
308 |
+
return
|
309 |
+
|
310 |
+
# merge_size is set to 1 by default, because STAGE1, STAGE1.5, STAGE2 are trained with merge_size=1
|
311 |
+
# for stage 3, the merge_size is set to 2 via arguments.
|
312 |
+
self.image_processor = DAMOVLImageProcessor.from_pretrained(self.vision_encoder_name)
|
313 |
+
self.image_processor.merge_size = args.spatial_merge_size
|
314 |
+
# NOTE: The maximum number of vision tokens is 8192 by default.
|
315 |
+
mm_max_length = args.mm_max_length if hasattr(args, 'mm_max_length') else 9477 // (args.spatial_merge_size**2)
|
316 |
+
self.image_processor.max_pixels = mm_max_length * (args.spatial_merge_size**2 * self.image_processor.patch_size**2)
|
317 |
+
self.image_processor.size["max_pixels"] = self.image_processor.max_pixels
|
318 |
+
|
319 |
+
# merge_size is fixed to 1 for STAGE1, STAGE1.5, STAGE2, STAGE3 in encoder and can be modified in connector.
|
320 |
+
self.cfg_only = Qwen2VLVisionConfig.from_pretrained(self.vision_encoder_name)
|
321 |
+
self.cfg_only.spatial_merge_size = args.spatial_merge_size
|
322 |
+
|
323 |
+
self.vision_encoder = DAMOVLVisionModel.from_pretrained(
|
324 |
+
self.vision_encoder_name,
|
325 |
+
spatial_merge_size=args.spatial_merge_size,
|
326 |
+
torch_dtype=args.torch_dtype,
|
327 |
+
attn_implementation=self.attn_implementation)
|
328 |
+
|
329 |
+
self.is_loaded = True
|
330 |
+
|
331 |
+
def forward(self, images, grid_thws, strides, **kwargs):
|
332 |
+
images = [image for sub_images in images for image in sub_images]
|
333 |
+
grid_thws = [grid_thw for sub_grid_thws in grid_thws for grid_thw in sub_grid_thws]
|
334 |
+
strides = [stride for sub_strides in strides for stride in sub_strides]
|
335 |
+
|
336 |
+
images = torch.cat(images, dim=0)
|
337 |
+
grid_thws = torch.cat(grid_thws, dim=0)
|
338 |
+
|
339 |
+
image_features = self.vision_encoder(images, grid_thws, strides)
|
340 |
+
|
341 |
+
return image_features
|
342 |
+
|
343 |
+
@property
|
344 |
+
def dummy_feature(self):
|
345 |
+
return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
|
346 |
+
|
347 |
+
@property
|
348 |
+
def dtype(self):
|
349 |
+
return self.vision_encoder.dtype
|
350 |
+
|
351 |
+
@property
|
352 |
+
def device(self):
|
353 |
+
return self.vision_encoder.device
|
354 |
+
|
355 |
+
@property
|
356 |
+
def config(self):
|
357 |
+
if self.is_loaded:
|
358 |
+
return self.vision_encoder.config
|
359 |
+
else:
|
360 |
+
return self.cfg_only
|
361 |
+
|
362 |
+
@property
|
363 |
+
def hidden_size(self):
|
364 |
+
return self.config.hidden_size
|
365 |
+
|
366 |
+
@property
|
367 |
+
def num_patches(self):
|
368 |
+
return -1
|
369 |
+
|
370 |
+
@property
|
371 |
+
def num_patches_per_side(self):
|
372 |
+
return -1
|
373 |
+
|
374 |
+
@property
|
375 |
+
def image_size(self):
|
376 |
+
return 14 * self.vision_encoder.config.spatial_merge_size
|
377 |
+
|
378 |
+
|
379 |
+
def build_vision_encoder(vision_encoder_cfg, **kwargs):
|
380 |
+
|
381 |
+
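# Resolve the encoder name/path from the config; this builder always instantiates the
# DAMOVL vision encoder below.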
vision_encoder = getattr(vision_encoder_cfg, 'mm_vision_encoder', getattr(vision_encoder_cfg, 'vision_encoder', None))
|
382 |
+
|
383 |
+
vision_encoder = DAMOVLVisionEncoder(vision_encoder, args=vision_encoder_cfg, **kwargs)
|
384 |
+
|
385 |
+
return vision_encoder
|
videollama3/model/processor.py
ADDED
@@ -0,0 +1,366 @@
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
+
# and OPT implementations in this library. It has been modified from its
|
6 |
+
# original forms to accommodate minor architectural differences compared
|
7 |
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
+
#
|
9 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
+
# you may not use this file except in compliance with the License.
|
11 |
+
# You may obtain a copy of the License at
|
12 |
+
#
|
13 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14 |
+
#
|
15 |
+
# Unless required by applicable law or agreed to in writing, software
|
16 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
18 |
+
# See the License for the specific language governing permissions and
|
19 |
+
# limitations under the License.
|
20 |
+
"""
|
21 |
+
Processor class for VideoLLaMA3.
|
22 |
+
"""
|
23 |
+
import copy
|
24 |
+
import math
|
25 |
+
import warnings
|
26 |
+
from typing import List, Union, Dict, Optional
|
27 |
+
|
28 |
+
import torch
|
29 |
+
from transformers.feature_extraction_utils import BatchFeature
|
30 |
+
from transformers.image_utils import ImageInput, VideoInput
|
31 |
+
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
32 |
+
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
33 |
+
|
34 |
+
import sys
|
35 |
+
sys.path.append(".")
|
36 |
+
from videollama3.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX
|
37 |
+
|
38 |
+
|
39 |
+
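# Jinja chat template: wraps each message in <|im_start|>/<|im_end|> (or <|stream_start|>/<|stream_end|>)
# markers and expands image/video contents into image tokens, optionally prefixed with timestamps.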
DEFAULT_CHAT_TEMPLATE = """
|
40 |
+
{%- set identifier = 'im' %}
|
41 |
+
{% for message in messages %}
|
42 |
+
{% if message['role'] == 'stream' %}
|
43 |
+
{% set identifier = 'stream' %}
|
44 |
+
{% else %}
|
45 |
+
{% set identifier = 'im' %}
|
46 |
+
{% endif %}
|
47 |
+
{{- '<|' + identifier + '_start|>' + message['role'] + '\n' -}}
|
48 |
+
{% if message['content'] is string %}
|
49 |
+
{{- message['content'] + '<|' + identifier + '_end|>\n' -}}
|
50 |
+
{% else %}
|
51 |
+
{% for content in message['content'] %}
|
52 |
+
{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}
|
53 |
+
{% if 'time' in content %}
|
54 |
+
{{- 'Time ' + content['time'] | round(1) | string + 's: ' -}}
|
55 |
+
{% endif %}
|
56 |
+
"""
|
57 |
+
DEFAULT_CHAT_TEMPLATE += """
|
58 |
+
{{- '%s\n' -}}
|
59 |
+
""" % DEFAULT_IMAGE_TOKEN
|
60 |
+
DEFAULT_CHAT_TEMPLATE += """
|
61 |
+
{% elif content['type'] == 'video' or 'video' in content or 'video_url' in content %}
|
62 |
+
{% for i in range(content['num_frames']) %}
|
63 |
+
{% if 'time' in content %}
|
64 |
+
{{- 'Time ' + content['time'][i] | round(1) | string + 's:' -}}
|
65 |
+
{% endif %}
|
66 |
+
{% if i < content['num_frames'] - 1 %}
|
67 |
+
"""
|
68 |
+
DEFAULT_CHAT_TEMPLATE += """
|
69 |
+
{{- '%s,' -}}
|
70 |
+
""" % DEFAULT_IMAGE_TOKEN
|
71 |
+
DEFAULT_CHAT_TEMPLATE += """
|
72 |
+
{% else %}
|
73 |
+
"""
|
74 |
+
DEFAULT_CHAT_TEMPLATE += """
|
75 |
+
{{- '%s\n' -}}
|
76 |
+
""" % DEFAULT_IMAGE_TOKEN
|
77 |
+
DEFAULT_CHAT_TEMPLATE += """
|
78 |
+
{% endif %}
|
79 |
+
{% endfor %}
|
80 |
+
{% elif 'text' in content %}
|
81 |
+
{{- content['text'] -}}
|
82 |
+
{% endif %}
|
83 |
+
{% endfor %}
|
84 |
+
{{- '<|' + identifier + '_end|>\n' -}}
|
85 |
+
{% endif %}
|
86 |
+
{% endfor %}
|
87 |
+
{% if add_generation_prompt %}
|
88 |
+
{{- '<|im_start|>assistant\n' -}}
|
89 |
+
{% endif %}
|
90 |
+
"""
|
91 |
+
|
92 |
+
|
93 |
+
class Videollama3ProcessorKwargs(ProcessingKwargs, total=False):
|
94 |
+
_defaults = {
|
95 |
+
"text_kwargs": {
|
96 |
+
"padding": False,
|
97 |
+
},
|
98 |
+
}
|
99 |
+
|
100 |
+
|
101 |
+
class Videollama3Processor(ProcessorMixin):
|
102 |
+
r"""
|
103 |
+
Modified from Qwen2VLProcessor
|
104 |
+
Args:
|
105 |
+
image_processor ([`Qwen2VLImageProcessor`], *optional*):
|
106 |
+
The image processor is a required input.
|
107 |
+
tokenizer ([`Qwen2TokenizerFast`], *optional*):
|
108 |
+
The tokenizer is a required input.
|
109 |
+
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
|
110 |
+
in a chat into a tokenizable string.
|
111 |
+
"""
|
112 |
+
|
113 |
+
attributes = ["image_processor", "tokenizer"]
|
114 |
+
valid_kwargs = ["chat_template"]
|
115 |
+
image_processor_class = "Qwen2VLImageProcessor"
|
116 |
+
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
|
117 |
+
|
118 |
+
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
|
119 |
+
if chat_template is None:
|
120 |
+
chat_template = DEFAULT_CHAT_TEMPLATE
|
121 |
+
# super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
122 |
+
tokenizer.chat_template = chat_template
|
123 |
+
self.image_processor = image_processor
|
124 |
+
self.tokenizer = tokenizer
|
125 |
+
self.generation_prompt = self._infer_generation_prompt()
|
126 |
+
self.generation_prompt_ids = self.tokenizer.encode(self.generation_prompt, return_tensors="pt")
|
127 |
+
self.generation_prompt_length = len(self.generation_prompt_ids[0])
|
128 |
+
self.image_token_id = self.tokenizer.convert_tokens_to_ids(DEFAULT_IMAGE_TOKEN)
|
129 |
+
self.eos_token_id = self.tokenizer.eos_token_id
|
130 |
+
|
131 |
+
def get_generation_prompt(self):
|
132 |
+
return self.generation_prompt
|
133 |
+
|
134 |
+
def get_generation_prompt_ids(self):
|
135 |
+
return self.generation_prompt_ids
|
136 |
+
|
137 |
+
def _infer_generation_prompt(self):
|
138 |
+
pseudo_message = [{"role": "user", "content": ""}]
|
139 |
+
instruction = self.tokenizer.apply_chat_template(pseudo_message, tokenize=False, add_generation_prompt=True)
|
140 |
+
conversation = self.tokenizer.apply_chat_template(pseudo_message, tokenize=False, add_generation_prompt=False)
|
141 |
+
return instruction.replace(conversation, "")
|
142 |
+
|
143 |
+
def _process_text_with_label(
|
144 |
+
self,
|
145 |
+
text: List[Dict],
|
146 |
+
image_grid_thw: torch.Tensor = None,
|
147 |
+
image_downsampling: Optional[int] = None,
|
148 |
+
**kwargs,
|
149 |
+
):
|
150 |
+
assert kwargs.pop("return_tensors", "pt") == "pt", "Only PyTorch tensors are supported when return_labels=True."
|
151 |
+
assert isinstance(text[0], dict), "When return_labels=True, text must be a list of messages."
|
152 |
+
|
153 |
+
input_ids_list = []
|
154 |
+
targets_list = []
|
155 |
+
sample_types_list = []
|
156 |
+
image_idx = 0
|
157 |
+
|
158 |
+
for message_idx, message in enumerate(text):
|
159 |
+
# 1. set chat template and append image tokens
|
160 |
+
prompt = self.tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=False)
|
161 |
+
prompt_chunks = prompt.split(DEFAULT_IMAGE_TOKEN)
|
162 |
+
prompt = []
|
163 |
+
for chunk_idx in range(len(prompt_chunks) - 1):
|
164 |
+
prompt.append(prompt_chunks[chunk_idx])
|
165 |
+
thw = image_grid_thw[image_idx]
|
166 |
+
prompt.append(DEFAULT_IMAGE_TOKEN * (thw.prod() / image_downsampling**2).long())
|
167 |
+
image_idx += 1
|
168 |
+
prompt.append(prompt_chunks[-1])
|
169 |
+
prompt = "".join(prompt)
|
170 |
+
|
171 |
+
input_ids = self.tokenizer.encode(prompt, return_tensors="pt")[0]
|
172 |
+
input_ids_list.append(input_ids)
|
173 |
+
|
174 |
+
targets = torch.full_like(input_ids, IGNORE_INDEX)
|
175 |
+
sample_types = torch.full_like(input_ids, IGNORE_INDEX)
|
176 |
+
if message["role"] == "assistant":
|
177 |
+
targets[self.generation_prompt_length:-1] = input_ids[self.generation_prompt_length:-1].clone()
|
178 |
+
elif message["role"] == "stream":
|
179 |
+
diff = torch.diff((input_ids == self.image_token_id).float())
|
180 |
+
image_end_indices = torch.nonzero(diff < 0)[:, 0]
|
181 |
+
targets[image_end_indices + 1] = input_ids[image_end_indices + 1]
|
182 |
+
sample_types = targets.clone()
|
183 |
+
sample_types[torch.logical_and(sample_types > 0, sample_types != self.eos_token_id)] = 0
|
184 |
+
targets[-2] = input_ids[-2] # <|im_end|>
|
185 |
+
|
186 |
+
# if message_idx > 0 and text[message_idx - 1]["role"] == "stream":
|
187 |
+
# targets[0] = input_ids[0]
|
188 |
+
# # TODO: consider non-special tokens
|
189 |
+
# sample_types[0] = input_ids[0]
|
190 |
+
|
191 |
+
targets_list.append(targets)
|
192 |
+
sample_types_list.append(sample_types)
|
193 |
+
|
194 |
+
assert len(image_grid_thw) == image_idx, "Number of images does not match the number of image tokens in the text."
|
195 |
+
|
196 |
+
targets = torch.cat(targets_list)
|
197 |
+
sample_types = torch.cat(sample_types_list)
|
198 |
+
types, counts = torch.unique(sample_types[sample_types > -1], return_counts=True)
|
199 |
+
|
200 |
+
if len(types) > 0:
|
201 |
+
target_num_samples = counts.amin()
|
202 |
+
|
203 |
+
for type_id, type_count in zip(types, counts):
|
204 |
+
if type_count > target_num_samples:
|
205 |
+
indices = torch.nonzero(sample_types == type_id)[:, 0]
|
206 |
+
random_selector = torch.randperm(indices.size(0))[:-target_num_samples]
|
207 |
+
targets[indices[random_selector]] = IGNORE_INDEX
|
208 |
+
sample_types[indices[random_selector]] = -1
|
209 |
+
|
210 |
+
text_inputs = {
|
211 |
+
"input_ids": torch.cat(input_ids_list),
|
212 |
+
"labels": targets,
|
213 |
+
}
|
214 |
+
|
215 |
+
return text_inputs
|
216 |
+
|
217 |
+
def _process_text_without_label(
|
218 |
+
self,
|
219 |
+
text: Union[List[str], List[Dict]],
|
220 |
+
image_grid_thw: torch.Tensor = None,
|
221 |
+
image_downsampling: Optional[int] = None,
|
222 |
+
**kwargs,
|
223 |
+
):
|
224 |
+
if isinstance(text[0], dict):
|
225 |
+
warnings.warn("Input text is a list of messages. Automatically convert it to a string with 'apply_chat_template' with generation prompt.")
|
226 |
+
text = [self.tokenizer.apply_chat_template(text, tokenize=False, add_generation_prompt=True)]
|
227 |
+
|
228 |
+
image_idx = 0
|
229 |
+
for i in range(len(text)):
|
230 |
+
while DEFAULT_IMAGE_TOKEN in text[i]:
|
231 |
+
thw = image_grid_thw[image_idx]
|
232 |
+
text[i] = text[i].replace(DEFAULT_IMAGE_TOKEN, "<placeholder>" * (thw.prod() / image_downsampling**2).long(), 1)
|
233 |
+
image_idx += 1
|
234 |
+
text[i] = text[i].replace("<placeholder>", DEFAULT_IMAGE_TOKEN)
|
235 |
+
assert len(image_grid_thw) == image_idx, "Number of images does not match the number of image tokens in the text."
|
236 |
+
|
237 |
+
text_inputs = self.tokenizer(text, **kwargs)
|
238 |
+
return text_inputs
|
239 |
+
|
240 |
+
def _process_text(
|
241 |
+
self,
|
242 |
+
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], List[Dict]],
|
243 |
+
image_grid_thw: torch.Tensor = None,
|
244 |
+
image_downsampling: Optional[int] = None,
|
245 |
+
return_labels: bool = False,
|
246 |
+
**kwargs,
|
247 |
+
):
|
248 |
+
if not isinstance(text, (list, tuple)):
|
249 |
+
text = [text]
|
250 |
+
assert len(text), "At least one text must be provided."
|
251 |
+
|
252 |
+
if return_labels:
|
253 |
+
return self._process_text_with_label(text, image_grid_thw, image_downsampling, **kwargs)
|
254 |
+
return self._process_text_without_label(text, image_grid_thw, image_downsampling, **kwargs)
|
255 |
+
|
256 |
+
def _process_image(
|
257 |
+
self,
|
258 |
+
images: ImageInput = None,
|
259 |
+
image_downsampling: Optional[int] = None,
|
260 |
+
**kwargs,
|
261 |
+
):
|
262 |
+
if image_downsampling is None:
|
263 |
+
image_downsampling = self.image_processor.merge_size
|
264 |
+
|
265 |
+
image_inputs = {
|
266 |
+
"images": [],
|
267 |
+
"grid_thws": [],
|
268 |
+
"image_downsampling": image_downsampling
|
269 |
+
}
|
270 |
+
if images is not None and len(images) > 0:
|
271 |
+
num_images = kwargs.get('num_images', len(images))
|
272 |
+
if 'num_images' in kwargs:
|
273 |
+
kwargs.pop('num_images')
|
274 |
+
for image in images:
|
275 |
+
outputs = self.image_processor(images=image, num_images=num_images, image_downsampling=image_downsampling, **kwargs)
|
276 |
+
# images shapes like: [tensor([patches, 1176]), ...]
|
277 |
+
# grid_thws shapes like: tensor([num_images, 3])
|
278 |
+
|
279 |
+
# flatten_patches1 = outputs["pixel_values"].reshape(26, 46, 3, -1)
|
280 |
+
# from matplotlib import pyplot as plt
|
281 |
+
# plt.imshow(flatten_patches1[:,:,:,0])
|
282 |
+
# plt.savefig('9.png')
|
283 |
+
|
284 |
+
image_inputs["images"].append(outputs["pixel_values"]) #正常的
|
285 |
+
|
286 |
+
# flatten_patches1 = image_inputs["images"][0].reshape(26, 46, 3, -1)
|
287 |
+
# from matplotlib import pyplot as plt
|
288 |
+
# plt.imshow(flatten_patches1[:,:,:,0])
|
289 |
+
# plt.savefig('12.png')
|
290 |
+
image_inputs["grid_thws"].append(outputs["image_grid_thw"])
|
291 |
+
|
292 |
+
return image_inputs
|
293 |
+
|
294 |
+
|
295 |
+
|
296 |
+
def __call__(
|
297 |
+
self,
|
298 |
+
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], List[Dict]] = None,
|
299 |
+
images: ImageInput = None,
|
300 |
+
image_downsampling: Optional[int] = None,
|
301 |
+
return_labels: bool = False,
|
302 |
+
**kwargs: Unpack[Videollama3ProcessorKwargs],
|
303 |
+
) -> BatchFeature:
|
304 |
+
"""
|
305 |
+
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
306 |
+
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
|
307 |
+
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
|
308 |
+
Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
|
309 |
+
|
310 |
+
Args:
|
311 |
+
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
312 |
+
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
|
313 |
+
tensor. Both channels-first and channels-last formats are supported.
|
314 |
+
text (`str`, `List[str]`, `List[List[str]]`):
|
315 |
+
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
316 |
+
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
317 |
+
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
318 |
+
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
319 |
+
If set, will return tensors of a particular framework. Acceptable values are:
|
320 |
+
- `'tf'`: Return TensorFlow `tf.constant` objects.
|
321 |
+
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
322 |
+
- `'np'`: Return NumPy `np.ndarray` objects.
|
323 |
+
- `'jax'`: Return JAX `jnp.ndarray` objects.
|
324 |
+
|
325 |
+
Returns:
|
326 |
+
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
327 |
+
|
328 |
+
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
|
329 |
+
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
|
330 |
+
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
|
331 |
+
`None`).
|
332 |
+
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
|
333 |
+
- **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
|
334 |
+
"""
|
335 |
+
output_kwargs = self._merge_kwargs(
|
336 |
+
Videollama3ProcessorKwargs,
|
337 |
+
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
338 |
+
**kwargs,
|
339 |
+
)
|
340 |
+
output_kwargs["text_kwargs"].pop("padding")
|
341 |
+
output_kwargs["text_kwargs"].pop("padding_side")
|
342 |
+
|
343 |
+
image_inputs = self._process_image(images, image_downsampling, **output_kwargs["images_kwargs"])
|
344 |
+
text_inputs = self._process_text(text, image_inputs["grid_thws"], image_downsampling, return_labels, **output_kwargs["text_kwargs"])
|
345 |
+
|
346 |
+
return BatchFeature(data={**text_inputs, **image_inputs})
|
347 |
+
|
348 |
+
def batch_decode(self, *args, **kwargs):
|
349 |
+
"""
|
350 |
+
This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
|
351 |
+
refer to the docstring of this method for more information.
|
352 |
+
"""
|
353 |
+
return self.tokenizer.batch_decode(*args, **kwargs)
|
354 |
+
|
355 |
+
def decode(self, *args, **kwargs):
|
356 |
+
"""
|
357 |
+
This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
|
358 |
+
the docstring of this method for more information.
|
359 |
+
"""
|
360 |
+
return self.tokenizer.decode(*args, **kwargs)
|
361 |
+
|
362 |
+
@property
|
363 |
+
def model_input_names(self):
|
364 |
+
tokenizer_input_names = self.tokenizer.model_input_names
|
365 |
+
image_processor_input_names = self.image_processor.model_input_names
|
366 |
+
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
videollama3/model/projector.py
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 Alibaba DAMO Academy
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import math
|
16 |
+
import os
|
17 |
+
import re
|
18 |
+
|
19 |
+
import einops
|
20 |
+
import torch
|
21 |
+
import torch.nn as nn
|
22 |
+
import torch.nn.functional as F
|
23 |
+
from timm.models.layers import LayerNorm, LayerNorm2d
|
24 |
+
from timm.models.regnet import RegStage
|
25 |
+
from transformers import TRANSFORMERS_CACHE
|
26 |
+
|
27 |
+
|
28 |
+
def parse_snapshot_folder(repo_id, cache_dir=None, repo_type="model"):
|
29 |
+
revision = "main"
|
30 |
+
# 1. parse the downloaded cache folder
|
31 |
+
if cache_dir is None:
|
32 |
+
cache_dir = TRANSFORMERS_CACHE
|
33 |
+
else:
|
34 |
+
cache_dir = cache_dir
|
35 |
+
object_id = repo_id.replace("/", "--")
|
36 |
+
repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
|
37 |
+
# 2. resolve refs (for instance to convert main to the associated commit sha)
|
38 |
+
refs_dir = os.path.join(repo_cache, "refs")
|
39 |
+
if os.path.isdir(refs_dir):
|
40 |
+
revision_file = os.path.join(refs_dir, revision)
|
41 |
+
if os.path.isfile(revision_file):
|
42 |
+
with open(revision_file) as f:
|
43 |
+
revision = f.read()
|
44 |
+
# 3. acquire the snapshot folder
|
45 |
+
folder = os.path.join(repo_cache, "snapshots", revision)
|
46 |
+
|
47 |
+
return folder
|
48 |
+
|
49 |
+
|
50 |
+
def load_mm_projector(model_path, cache_dir=None, token=None):
|
51 |
+
if os.path.exists(os.path.join(model_path, 'mm_projector.bin')):
|
52 |
+
is_local = True
|
53 |
+
folder = model_path
|
54 |
+
else:
|
55 |
+
is_local = False
|
56 |
+
folder = parse_snapshot_folder(model_path, cache_dir=cache_dir, repo_type="model")
|
57 |
+
if not os.path.exists(os.path.join(folder, 'mm_projector.bin')):
|
58 |
+
# downloading from remote repo
|
59 |
+
from huggingface_hub import snapshot_download
|
60 |
+
snapshot_download(repo_id=model_path, cache_dir=cache_dir, token=token)
|
61 |
+
|
62 |
+
mm_projector_weights = torch.load(os.path.join(folder, 'mm_projector.bin'), map_location='cpu')
|
63 |
+
mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
|
64 |
+
return mm_projector_weights
|
65 |
+
|
66 |
+
|
67 |
+
class IdentityMap(nn.Module):
|
68 |
+
|
69 |
+
def __init__(self):
|
70 |
+
super().__init__()
|
71 |
+
|
72 |
+
def forward(self, x, *args, **kwargs):
|
73 |
+
return x
|
74 |
+
|
75 |
+
@property
|
76 |
+
def config(self):
|
77 |
+
return {"mm_projector_type": 'identity'}
|
78 |
+
|
79 |
+
|
80 |
+
def build_mlp(depth, hidden_size, output_hidden_size):
|
81 |
+
modules = [nn.Linear(hidden_size, output_hidden_size)]
|
82 |
+
for _ in range(1, depth):
|
83 |
+
modules.append(nn.GELU())
|
84 |
+
modules.append(nn.Linear(output_hidden_size, output_hidden_size))
|
85 |
+
return nn.Sequential(*modules)
|
86 |
+
|
87 |
+
|
88 |
+
class SimSpatialConv(nn.Module):
|
89 |
+
|
90 |
+
def __init__(self, config, downsample=(2, 2), padding=1, depth=1, mlp_depth=2):
|
91 |
+
super().__init__()
|
92 |
+
self.encoder_hidden_size = encoder_hidden_size = config.mm_hidden_size
|
93 |
+
self.output_hidden_size = output_hidden_size = config.hidden_size
|
94 |
+
self.downsample = downsample
|
95 |
+
self.padding = padding
|
96 |
+
self.sampler = nn.Sequential(
|
97 |
+
nn.Conv2d(
|
98 |
+
in_channels=self.encoder_hidden_size,
|
99 |
+
out_channels=4 * self.encoder_hidden_size,
|
100 |
+
kernel_size=self.downsample,
|
101 |
+
stride=self.downsample,
|
102 |
+
padding=self.padding,
|
103 |
+
bias=True
|
104 |
+
),
|
105 |
+
nn.SiLU(),
|
106 |
+
)
|
107 |
+
self.readout = build_mlp(mlp_depth, 4 * self.encoder_hidden_size, self.output_hidden_size)
|
108 |
+
|
109 |
+
def forward(self, x):
|
110 |
+
hw = int(x.size(1) ** 0.5)
|
111 |
+
x = einops.rearrange(x, "b (h w) d -> b d h w", h=hw, w=hw)
|
112 |
+
x = self.sampler(x)
|
113 |
+
x = einops.rearrange(x, "b d h w -> b (h w) d")
|
114 |
+
x = self.readout(x)
|
115 |
+
return x
|
116 |
+
|
117 |
+
def cal_proj_size(self, input_size):
|
118 |
+
if isinstance(input_size, int):
|
119 |
+
input_size = (input_size, input_size)
|
120 |
+
height = math.ceil((input_size[0] + self.padding) / self.downsample[0])
|
121 |
+
width = math.ceil((input_size[1] + self.padding) / self.downsample[1])
|
122 |
+
return height * width
|
123 |
+
|
124 |
+
|
125 |
+
class MlpGeluProjector(nn.Module):
|
126 |
+
def __init__(self, config, projector_type):
|
127 |
+
super().__init__()
|
128 |
+
|
129 |
+
mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
|
130 |
+
mlp_depth = int(mlp_gelu_match.group(1))
|
131 |
+
|
132 |
+
self.readout = build_mlp(mlp_depth, config.mm_hidden_size, config.hidden_size)
|
133 |
+
|
134 |
+
def forward(self, x):
|
135 |
+
x = self.readout(x)
|
136 |
+
return x
|
137 |
+
|
138 |
+
def cal_proj_size(self, input_size):
|
139 |
+
if isinstance(input_size, int):
|
140 |
+
input_size = (input_size, input_size)
|
141 |
+
height = input_size[0]
|
142 |
+
width = input_size[1]
|
143 |
+
return height * width
|
144 |
+
|
145 |
+
|
146 |
+
def build_vision_projector(config, delay_load=False, **kwargs):
|
147 |
+
# videollama3 projector only support image-wise operation now, i.e., prohibit the temporal aggregation
|
148 |
+
projector_type = getattr(config, 'mm_projector_type', 'linear')
|
149 |
+
|
150 |
+
if projector_type == "linear":
|
151 |
+
# NOTE: for both linear and mlp2x_gelu projector type, mean pooling is adopted to aggreate video features
|
152 |
+
return nn.Linear(config.mm_hidden_size, config.hidden_size)
|
153 |
+
elif projector_type == "simp_spatial_conv":
|
154 |
+
return SimSpatialConv(config)
|
155 |
+
elif projector_type.startswith("mlp"):
|
156 |
+
return MlpGeluProjector(config, projector_type)
|
157 |
+
if projector_type == 'identity':
|
158 |
+
return IdentityMap()
|
159 |
+
|
160 |
+
raise ValueError(f'Unknown projector type: {projector_type}')
|
videollama3/model/qwen2vl_encoder/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .configuration_qwen2vl_encoder import Qwen2VLVisionConfig
|
2 |
+
from .image_processing import Qwen2VLImageProcessor
|
3 |
+
from .modeling_qwen2vl_encoder import Qwen2VisionTransformerPretrainedModel
|
videollama3/model/qwen2vl_encoder/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (432 Bytes). View file
|
|
videollama3/model/qwen2vl_encoder/__pycache__/configuration_qwen2vl_encoder.cpython-310.pyc
ADDED
Binary file (1.92 kB). View file
|
|
videollama3/model/qwen2vl_encoder/__pycache__/image_processing.cpython-310.pyc
ADDED
Binary file (16.9 kB). View file
|
|
videollama3/model/qwen2vl_encoder/__pycache__/modeling_qwen2vl_encoder.cpython-310.pyc
ADDED
Binary file (12.7 kB). View file
|
|
videollama3/model/qwen2vl_encoder/configuration_qwen2vl_encoder.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
"""Qwen2VL model configuration"""
|
16 |
+
|
17 |
+
import os
|
18 |
+
from typing import Union
|
19 |
+
|
20 |
+
from transformers.configuration_utils import PretrainedConfig
|
21 |
+
from transformers.utils import logging
|
22 |
+
|
23 |
+
|
24 |
+
logger = logging.get_logger(__name__)
|
25 |
+
|
26 |
+
|
27 |
+
class Qwen2VLVisionConfig(PretrainedConfig):
|
28 |
+
model_type = "qwen2_vl"
|
29 |
+
|
30 |
+
def __init__(
|
31 |
+
self,
|
32 |
+
depth=32,
|
33 |
+
embed_dim=1280,
|
34 |
+
hidden_size=3584,
|
35 |
+
hidden_act="quick_gelu",
|
36 |
+
mlp_ratio=4,
|
37 |
+
num_heads=16,
|
38 |
+
in_channels=3,
|
39 |
+
patch_size=14,
|
40 |
+
spatial_merge_size=2,
|
41 |
+
temporal_patch_size=2,
|
42 |
+
**kwargs,
|
43 |
+
):
|
44 |
+
super().__init__(**kwargs)
|
45 |
+
|
46 |
+
self.depth = depth
|
47 |
+
self.embed_dim = embed_dim
|
48 |
+
self.hidden_size = hidden_size
|
49 |
+
self.hidden_act = hidden_act
|
50 |
+
self.mlp_ratio = mlp_ratio
|
51 |
+
self.num_heads = num_heads
|
52 |
+
self.in_channels = in_channels
|
53 |
+
self.patch_size = patch_size
|
54 |
+
self.spatial_merge_size = spatial_merge_size
|
55 |
+
self.temporal_patch_size = temporal_patch_size
|
56 |
+
|
57 |
+
@classmethod
|
58 |
+
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
|
59 |
+
cls._set_token_in_kwargs(kwargs)
|
60 |
+
|
61 |
+
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
62 |
+
|
63 |
+
# if config_dict.get("model_type") == "qwen2_vl":
|
64 |
+
# config_dict = config_dict["vision_config"]
|
65 |
+
|
66 |
+
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
|
67 |
+
logger.warning(
|
68 |
+
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
69 |
+
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
|
70 |
+
)
|
71 |
+
|
72 |
+
return cls.from_dict(config_dict, **kwargs)
|
videollama3/model/qwen2vl_encoder/image_processing.py
ADDED
@@ -0,0 +1,469 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
+
# and OPT implementations in this library. It has been modified from its
|
6 |
+
# original forms to accommodate minor architectural differences compared
|
7 |
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
+
#
|
9 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
+
# you may not use this file except in compliance with the License.
|
11 |
+
# You may obtain a copy of the License at
|
12 |
+
#
|
13 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14 |
+
#
|
15 |
+
# Unless required by applicable law or agreed to in writing, software
|
16 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
18 |
+
# See the License for the specific language governing permissions and
|
19 |
+
# limitations under the License.
|
20 |
+
"""Image processor class for Qwen2-VL."""
|
21 |
+
|
22 |
+
import math
|
23 |
+
from typing import Dict, List, Optional, Union
|
24 |
+
|
25 |
+
import numpy as np
|
26 |
+
|
27 |
+
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
|
28 |
+
from transformers.image_transforms import (
|
29 |
+
convert_to_rgb,
|
30 |
+
resize,
|
31 |
+
to_channel_dimension_format,
|
32 |
+
)
|
33 |
+
from transformers.image_utils import (
|
34 |
+
OPENAI_CLIP_MEAN,
|
35 |
+
OPENAI_CLIP_STD,
|
36 |
+
ChannelDimension,
|
37 |
+
ImageInput,
|
38 |
+
PILImageResampling,
|
39 |
+
VideoInput,
|
40 |
+
get_image_size,
|
41 |
+
infer_channel_dimension_format,
|
42 |
+
is_scaled_image,
|
43 |
+
is_valid_image,
|
44 |
+
make_list_of_images,
|
45 |
+
to_numpy_array,
|
46 |
+
valid_images,
|
47 |
+
validate_preprocess_arguments,
|
48 |
+
)
|
49 |
+
from transformers.utils import TensorType, is_vision_available, logging
|
50 |
+
|
51 |
+
|
52 |
+
logger = logging.get_logger(__name__)
|
53 |
+
|
54 |
+
|
55 |
+
if is_vision_available():
|
56 |
+
from PIL import Image
|
57 |
+
|
58 |
+
|
59 |
+
def make_batched_images(images) -> List[List[ImageInput]]:
|
60 |
+
"""
|
61 |
+
Accepts images in list or nested list format, and makes a list of images for preprocessing.
|
62 |
+
|
63 |
+
Args:
|
64 |
+
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
|
65 |
+
The input image.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
list: A list of images.
|
69 |
+
"""
|
70 |
+
if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
|
71 |
+
return [img for img_list in images for img in img_list]
|
72 |
+
|
73 |
+
elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
|
74 |
+
return images
|
75 |
+
|
76 |
+
elif is_valid_image(images):
|
77 |
+
return [images]
|
78 |
+
|
79 |
+
raise ValueError(f"Could not make batched images from {images}")
|
80 |
+
|
81 |
+
|
82 |
+
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
|
83 |
+
def make_batched_videos(videos) -> List[VideoInput]:
|
84 |
+
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
|
85 |
+
return videos
|
86 |
+
|
87 |
+
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
|
88 |
+
if isinstance(videos[0], Image.Image):
|
89 |
+
return [videos]
|
90 |
+
elif len(videos[0].shape) == 4:
|
91 |
+
return [list(video) for video in videos]
|
92 |
+
|
93 |
+
elif is_valid_image(videos) and len(videos.shape) == 4:
|
94 |
+
return [list(videos)]
|
95 |
+
|
96 |
+
raise ValueError(f"Could not make batched video from {videos}")
|
97 |
+
|
98 |
+
|
99 |
+
def smart_resize(
|
100 |
+
height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
|
101 |
+
):
|
102 |
+
"""Rescales the image so that the following conditions are met:
|
103 |
+
|
104 |
+
1. Both dimensions (height and width) are divisible by 'factor'.
|
105 |
+
|
106 |
+
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
|
107 |
+
|
108 |
+
3. The aspect ratio of the image is maintained as closely as possible.
|
109 |
+
|
110 |
+
"""
|
111 |
+
if height < factor or width < factor:
|
112 |
+
scale = factor / min(height, width)
|
113 |
+
width = round(scale * width)
|
114 |
+
height = round(scale * height)
|
115 |
+
elif max(height, width) / min(height, width) > 200:
|
116 |
+
raise ValueError(
|
117 |
+
f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
|
118 |
+
)
|
119 |
+
h_bar = round(height / factor) * factor
|
120 |
+
w_bar = round(width / factor) * factor
|
121 |
+
if h_bar * w_bar > max_pixels:
|
122 |
+
beta = math.sqrt((height * width) / max_pixels)
|
123 |
+
h_bar = math.floor(height / beta / factor) * factor
|
124 |
+
w_bar = math.floor(width / beta / factor) * factor
|
125 |
+
elif h_bar * w_bar < min_pixels:
|
126 |
+
beta = math.sqrt(min_pixels / (height * width))
|
127 |
+
h_bar = math.ceil(height * beta / factor) * factor
|
128 |
+
w_bar = math.ceil(width * beta / factor) * factor
|
129 |
+
return h_bar, w_bar
|
130 |
+
|
131 |
+
|
132 |
+
class Qwen2VLImageProcessor(BaseImageProcessor):
|
133 |
+
r"""
|
134 |
+
Constructs a Qwen2-VL image processor that dynamically resizes images based on the original images.
|
135 |
+
|
136 |
+
Args:
|
137 |
+
do_resize (`bool`, *optional*, defaults to `True`):
|
138 |
+
Whether to resize the image's (height, width) dimensions.
|
139 |
+
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
140 |
+
Resampling filter to use when resizing the image.
|
141 |
+
do_rescale (`bool`, *optional*, defaults to `True`):
|
142 |
+
Whether to rescale the image by the specified scale `rescale_factor`.
|
143 |
+
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
144 |
+
Scale factor to use if rescaling the image.
|
145 |
+
do_normalize (`bool`, *optional*, defaults to `True`):
|
146 |
+
Whether to normalize the image.
|
147 |
+
image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
|
148 |
+
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
|
149 |
+
image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
|
150 |
+
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
|
151 |
+
do_convert_rgb (`bool`, *optional*, defaults to `True`):
|
152 |
+
Whether to convert the image to RGB.
|
153 |
+
min_pixels (`int`, *optional*, defaults to `56 * 56`):
|
154 |
+
The min pixels of the image to resize the image.
|
155 |
+
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
|
156 |
+
The max pixels of the image to resize the image.
|
157 |
+
patch_size (`int`, *optional*, defaults to 14):
|
158 |
+
The spacial patch size of the vision encoder.
|
159 |
+
temporal_patch_size (`int`, *optional*, defaults to 2):
|
160 |
+
The temporal patch size of the vision encoder.
|
161 |
+
merge_size (`int`, *optional*, defaults to 2):
|
162 |
+
The merge size of the vision encoder to llm encoder.
|
163 |
+
"""
|
164 |
+
|
165 |
+
model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
|
166 |
+
|
167 |
+
def __init__(
|
168 |
+
self,
|
169 |
+
do_resize: bool = True,
|
170 |
+
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
171 |
+
do_rescale: bool = True,
|
172 |
+
rescale_factor: Union[int, float] = 1 / 255,
|
173 |
+
do_normalize: bool = True,
|
174 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
175 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
176 |
+
do_convert_rgb: bool = True,
|
177 |
+
min_pixels: int = 56 * 56,
|
178 |
+
max_pixels: int = 28 * 28 * 1280,
|
179 |
+
patch_size: int = 14,
|
180 |
+
temporal_patch_size: int = 2,
|
181 |
+
merge_size: int = 2,
|
182 |
+
**kwargs,
|
183 |
+
) -> None:
|
184 |
+
super().__init__(**kwargs)
|
185 |
+
self.do_resize = do_resize
|
186 |
+
self.resample = resample
|
187 |
+
self.do_rescale = do_rescale
|
188 |
+
self.rescale_factor = rescale_factor
|
189 |
+
self.do_normalize = do_normalize
|
190 |
+
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
|
191 |
+
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
|
192 |
+
self.min_pixels = min_pixels
|
193 |
+
self.max_pixels = max_pixels
|
194 |
+
self.patch_size = patch_size
|
195 |
+
self.temporal_patch_size = temporal_patch_size
|
196 |
+
self.merge_size = merge_size
|
197 |
+
self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
|
198 |
+
self.do_convert_rgb = do_convert_rgb
|
199 |
+
|
200 |
+
def _preprocess(
|
201 |
+
self,
|
202 |
+
images: Union[ImageInput, VideoInput],
|
203 |
+
do_resize: bool = None,
|
204 |
+
resample: PILImageResampling = None,
|
205 |
+
do_rescale: bool = None,
|
206 |
+
rescale_factor: float = None,
|
207 |
+
do_normalize: bool = None,
|
208 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
209 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
210 |
+
do_convert_rgb: bool = None,
|
211 |
+
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
212 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
213 |
+
num_images: Optional[int] = 1,
|
214 |
+
image_downsampling: Optional[int] = None,
|
215 |
+
):
|
216 |
+
"""
|
217 |
+
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
|
218 |
+
|
219 |
+
Args:
|
220 |
+
images (`ImageInput`):
|
221 |
+
Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
|
222 |
+
vision_info (`List[Dict]`, *optional*):
|
223 |
+
Optional list of dictionaries containing additional information about vision inputs.
|
224 |
+
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
225 |
+
Whether to resize the image.
|
226 |
+
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
|
227 |
+
Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
|
228 |
+
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
229 |
+
Whether to rescale the image.
|
230 |
+
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
231 |
+
Scale factor to use if rescaling the image.
|
232 |
+
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
233 |
+
Whether to normalize the image.
|
234 |
+
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
235 |
+
Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
236 |
+
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
237 |
+
Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
238 |
+
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
239 |
+
Whether to convert the image to RGB.
|
240 |
+
data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
|
241 |
+
The channel dimension format for the output image. Can be one of:
|
242 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
243 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
244 |
+
- Unset: Use the channel dimension format of the input image.
|
245 |
+
input_data_format (`ChannelDimension` or `str`, *optional*):
|
246 |
+
The channel dimension format for the input image. Can be one of:
|
247 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
248 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
249 |
+
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
250 |
+
"""
|
251 |
+
images = make_list_of_images(images)
|
252 |
+
|
253 |
+
if do_convert_rgb:
|
254 |
+
images = [convert_to_rgb(image) for image in images]
|
255 |
+
|
256 |
+
# All transformations expect numpy arrays.
|
257 |
+
images = [to_numpy_array(image) for image in images]
|
258 |
+
|
259 |
+
if is_scaled_image(images[0]) and do_rescale:
|
260 |
+
logger.warning_once(
|
261 |
+
"It looks like you are trying to rescale already rescaled images. If the input"
|
262 |
+
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
263 |
+
)
|
264 |
+
if input_data_format is None:
|
265 |
+
# We assume that all images have the same channel dimension format.
|
266 |
+
input_data_format = infer_channel_dimension_format(images[0])
|
267 |
+
|
268 |
+
height, width = get_image_size(images[0], channel_dim=input_data_format)
|
269 |
+
resized_height, resized_width = height, width
|
270 |
+
processed_images = []
|
271 |
+
for image in images:
|
272 |
+
if do_resize:
|
273 |
+
max_pixels = int(self.max_pixels / (self.merge_size / image_downsampling)**2)
|
274 |
+
resized_height, resized_width = smart_resize(
|
275 |
+
height,
|
276 |
+
width,
|
277 |
+
factor=self.patch_size * image_downsampling,
|
278 |
+
min_pixels=self.min_pixels,
|
279 |
+
max_pixels=int(max_pixels // num_images),
|
280 |
+
)
|
281 |
+
image = resize(
|
282 |
+
image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
|
283 |
+
)
|
284 |
+
|
285 |
+
if do_rescale:
|
286 |
+
image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
|
287 |
+
|
288 |
+
if do_normalize:
|
289 |
+
image = self.normalize(
|
290 |
+
image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
|
291 |
+
)
|
292 |
+
|
293 |
+
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
294 |
+
processed_images.append(image)
|
295 |
+
|
296 |
+
patches = np.array(processed_images)
|
297 |
+
if data_format == ChannelDimension.LAST:
|
298 |
+
patches = patches.transpose(0, 3, 1, 2)
|
299 |
+
if patches.shape[0] == 1:
|
300 |
+
patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
|
301 |
+
channel = patches.shape[1]
|
302 |
+
grid_t = patches.shape[0] // self.temporal_patch_size
|
303 |
+
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
|
304 |
+
patches = patches.reshape(
|
305 |
+
grid_t,
|
306 |
+
self.temporal_patch_size,
|
307 |
+
channel,
|
308 |
+
grid_h // image_downsampling,
|
309 |
+
image_downsampling,
|
310 |
+
self.patch_size,
|
311 |
+
grid_w // image_downsampling,
|
312 |
+
image_downsampling,
|
313 |
+
self.patch_size,
|
314 |
+
)
|
315 |
+
patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
|
316 |
+
flatten_patches = patches.reshape(
|
317 |
+
grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
|
318 |
+
)
|
319 |
+
return flatten_patches, (grid_t, grid_h, grid_w)
|
320 |
+
|
321 |
+
def preprocess(
|
322 |
+
self,
|
323 |
+
images: ImageInput,
|
324 |
+
videos: VideoInput = None,
|
325 |
+
do_resize: bool = None,
|
326 |
+
size: Dict[str, int] = None,
|
327 |
+
resample: PILImageResampling = None,
|
328 |
+
do_rescale: bool = None,
|
329 |
+
rescale_factor: float = None,
|
330 |
+
do_normalize: bool = None,
|
331 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
332 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
333 |
+
do_convert_rgb: bool = None,
|
334 |
+
return_tensors: Optional[Union[str, TensorType]] = None,
|
335 |
+
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
336 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
337 |
+
num_images: Optional[int] = 1,
|
338 |
+
image_downsampling: Optional[int] = None,
|
339 |
+
):
|
340 |
+
"""
|
341 |
+
Args:
|
342 |
+
images (`ImageInput`):
|
343 |
+
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
|
344 |
+
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
|
345 |
+
videos (`VideoInput`):
|
346 |
+
Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
|
347 |
+
passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
|
348 |
+
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
349 |
+
Whether to resize the image.
|
350 |
+
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
|
351 |
+
Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
|
352 |
+
the longest edge resized to keep the input aspect ratio.
|
353 |
+
resample (`int`, *optional*, defaults to `self.resample`):
|
354 |
+
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
|
355 |
+
has an effect if `do_resize` is set to `True`.
|
356 |
+
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
357 |
+
Whether to rescale the image.
|
358 |
+
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
359 |
+
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
|
360 |
+
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
361 |
+
Whether to normalize the image.
|
362 |
+
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
363 |
+
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
|
364 |
+
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
365 |
+
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
|
366 |
+
`True`.
|
367 |
+
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
368 |
+
Whether to convert the image to RGB.
|
369 |
+
return_tensors (`str` or `TensorType`, *optional*):
|
370 |
+
The type of tensors to return. Can be one of:
|
371 |
+
- Unset: Return a list of `np.ndarray`.
|
372 |
+
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
|
373 |
+
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
|
374 |
+
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
|
375 |
+
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
|
376 |
+
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
|
377 |
+
The channel dimension format for the output image. Can be one of:
|
378 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
379 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
380 |
+
- Unset: Use the channel dimension format of the input image.
|
381 |
+
input_data_format (`ChannelDimension` or `str`, *optional*):
|
382 |
+
The channel dimension format for the input image. If unset, the channel dimension format is inferred
|
383 |
+
from the input image. Can be one of:
|
384 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
385 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
386 |
+
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
387 |
+
|
388 |
+
"""
|
389 |
+
do_resize = do_resize if do_resize is not None else self.do_resize
|
390 |
+
size = size if size is not None else self.size
|
391 |
+
resample = resample if resample is not None else self.resample
|
392 |
+
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
393 |
+
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
394 |
+
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
395 |
+
image_mean = image_mean if image_mean is not None else self.image_mean
|
396 |
+
image_std = image_std if image_std is not None else self.image_std
|
397 |
+
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
|
398 |
+
image_downsampling = image_downsampling if image_downsampling is not None else self.merge_size
|
399 |
+
|
400 |
+
if images is not None:
|
401 |
+
images = make_batched_images(images)
|
402 |
+
if videos is not None:
|
403 |
+
videos = make_batched_videos(videos)
|
404 |
+
|
405 |
+
if images is not None and not valid_images(images):
|
406 |
+
raise ValueError(
|
407 |
+
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
|
408 |
+
"torch.Tensor, tf.Tensor or jax.ndarray."
|
409 |
+
)
|
410 |
+
|
411 |
+
validate_preprocess_arguments(
|
412 |
+
rescale_factor=rescale_factor,
|
413 |
+
do_normalize=do_normalize,
|
414 |
+
image_mean=image_mean,
|
415 |
+
image_std=image_std,
|
416 |
+
do_resize=do_resize,
|
417 |
+
size=size,
|
418 |
+
resample=resample,
|
419 |
+
)
|
420 |
+
|
421 |
+
if images is not None:
|
422 |
+
pixel_values, vision_grid_thws = [], []
|
423 |
+
for image in images:
|
424 |
+
patches, image_grid_thw = self._preprocess(
|
425 |
+
image,
|
426 |
+
do_resize=do_resize,
|
427 |
+
resample=resample,
|
428 |
+
do_rescale=do_rescale,
|
429 |
+
rescale_factor=rescale_factor,
|
430 |
+
do_normalize=do_normalize,
|
431 |
+
image_mean=image_mean,
|
432 |
+
image_std=image_std,
|
433 |
+
data_format=data_format,
|
434 |
+
do_convert_rgb=do_convert_rgb,
|
435 |
+
input_data_format=input_data_format,
|
436 |
+
num_images=num_images,
|
437 |
+
image_downsampling=image_downsampling,
|
438 |
+
)
|
439 |
+
pixel_values.extend(patches)
|
440 |
+
vision_grid_thws.append(image_grid_thw)
|
441 |
+
pixel_values = np.array(pixel_values)
|
442 |
+
vision_grid_thws = np.array(vision_grid_thws)
|
443 |
+
data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
|
444 |
+
|
445 |
+
if videos is not None:
|
446 |
+
pixel_values, vision_grid_thws = [], []
|
447 |
+
for images in videos:
|
448 |
+
patches, video_grid_thw = self._preprocess(
|
449 |
+
images,
|
450 |
+
do_resize=do_resize,
|
451 |
+
resample=resample,
|
452 |
+
do_rescale=do_rescale,
|
453 |
+
rescale_factor=rescale_factor,
|
454 |
+
do_normalize=do_normalize,
|
455 |
+
image_mean=image_mean,
|
456 |
+
image_std=image_std,
|
457 |
+
data_format=data_format,
|
458 |
+
do_convert_rgb=do_convert_rgb,
|
459 |
+
input_data_format=input_data_format,
|
460 |
+
num_images=num_images,
|
461 |
+
image_downsampling=image_downsampling,
|
462 |
+
)
|
463 |
+
pixel_values.extend(patches)
|
464 |
+
vision_grid_thws.append(video_grid_thw)
|
465 |
+
pixel_values = np.array(pixel_values)
|
466 |
+
vision_grid_thws = np.array(vision_grid_thws)
|
467 |
+
data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}
|
468 |
+
|
469 |
+
return BatchFeature(data=data, tensor_type=return_tensors)
|
videollama3/model/qwen2vl_encoder/modeling_qwen2vl_encoder.py
ADDED
@@ -0,0 +1,367 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
+
# and OPT implementations in this library. It has been modified from its
|
6 |
+
# original forms to accommodate minor architectural differences compared
|
7 |
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
+
#
|
9 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
+
# you may not use this file except in compliance with the License.
|
11 |
+
# You may obtain a copy of the License at
|
12 |
+
#
|
13 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14 |
+
#
|
15 |
+
# Unless required by applicable law or agreed to in writing, software
|
16 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
18 |
+
# See the License for the specific language governing permissions and
|
19 |
+
# limitations under the License.
|
20 |
+
"""PyTorch Qwen2-VL model."""
|
21 |
+
|
22 |
+
import math
|
23 |
+
from dataclasses import dataclass
|
24 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
25 |
+
|
26 |
+
import torch
|
27 |
+
import torch.nn as nn
|
28 |
+
import torch.nn.functional as F
|
29 |
+
import torch.utils.checkpoint
|
30 |
+
from torch.nn import CrossEntropyLoss, LayerNorm
|
31 |
+
from transformers.activations import ACT2FN
|
32 |
+
from transformers.cache_utils import Cache, StaticCache
|
33 |
+
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
|
34 |
+
from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
|
35 |
+
from transformers.modeling_utils import PreTrainedModel
|
36 |
+
from transformers.utils import (add_start_docstrings,
|
37 |
+
add_start_docstrings_to_model_forward,
|
38 |
+
is_flash_attn_2_available,
|
39 |
+
is_flash_attn_greater_or_equal_2_10, logging,
|
40 |
+
replace_return_docstrings)
|
41 |
+
|
42 |
+
from .configuration_qwen2vl_encoder import Qwen2VLVisionConfig
|
43 |
+
|
44 |
+
if is_flash_attn_2_available():
|
45 |
+
from flash_attn import flash_attn_varlen_func
|
46 |
+
from transformers.modeling_flash_attention_utils import \
|
47 |
+
_flash_attention_forward
|
48 |
+
else:
|
49 |
+
flash_attn_varlen_func = None
|
50 |
+
|
51 |
+
logger = logging.get_logger(__name__)
|
52 |
+
|
53 |
+
|
54 |
+
# Copied from transformers.models.llama.modeling_llama.rotate_half
|
55 |
+
def rotate_half(x):
|
56 |
+
"""Rotates half the hidden dims of the input."""
|
57 |
+
x1 = x[..., : x.shape[-1] // 2]
|
58 |
+
x2 = x[..., x.shape[-1] // 2 :]
|
59 |
+
return torch.cat((-x2, x1), dim=-1)
|
60 |
+
|
61 |
+
|
62 |
+
def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
|
63 |
+
orig_dtype = tensor.dtype
|
64 |
+
tensor = tensor.float()
|
65 |
+
cos = freqs.cos()
|
66 |
+
sin = freqs.sin()
|
67 |
+
cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
|
68 |
+
sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
|
69 |
+
output = (tensor * cos) + (rotate_half(tensor) * sin)
|
70 |
+
output = output.to(orig_dtype)
|
71 |
+
return output
|
72 |
+
|
73 |
+
|
74 |
+
class VisionRotaryEmbedding(nn.Module):
|
75 |
+
def __init__(self, dim: int, theta: float = 10000.0) -> None:
|
76 |
+
super().__init__()
|
77 |
+
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
|
78 |
+
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
79 |
+
|
80 |
+
def forward(self, seqlen: int) -> torch.Tensor:
|
81 |
+
seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
|
82 |
+
freqs = torch.outer(seq, self.inv_freq)
|
83 |
+
return freqs
|
84 |
+
|
85 |
+
|
86 |
+
class PatchEmbed(nn.Module):
|
87 |
+
def __init__(
|
88 |
+
self,
|
89 |
+
patch_size: int = 14,
|
90 |
+
temporal_patch_size: int = 2,
|
91 |
+
in_channels: int = 3,
|
92 |
+
embed_dim: int = 1152,
|
93 |
+
) -> None:
|
94 |
+
super().__init__()
|
95 |
+
self.patch_size = patch_size
|
96 |
+
self.temporal_patch_size = temporal_patch_size
|
97 |
+
self.in_channels = in_channels
|
98 |
+
self.embed_dim = embed_dim
|
99 |
+
|
100 |
+
kernel_size = [temporal_patch_size, patch_size, patch_size]
|
101 |
+
self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
|
102 |
+
|
103 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
104 |
+
target_dtype = self.proj.weight.dtype
|
105 |
+
hidden_states = hidden_states.view(
|
106 |
+
-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
|
107 |
+
)
|
108 |
+
hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
|
109 |
+
return hidden_states
|
110 |
+
|
111 |
+
|
112 |
+
class PatchMerger(nn.Module):
|
113 |
+
def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
|
114 |
+
super().__init__()
|
115 |
+
self.hidden_size = context_dim * (spatial_merge_size**2)
|
116 |
+
self.ln_q = LayerNorm(context_dim, eps=1e-6)
|
117 |
+
self.mlp = nn.Sequential(
|
118 |
+
nn.Linear(self.hidden_size, self.hidden_size),
|
119 |
+
nn.GELU(),
|
120 |
+
nn.Linear(self.hidden_size, dim),
|
121 |
+
)
|
122 |
+
|
123 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
124 |
+
x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
|
125 |
+
return x
|
126 |
+
|
127 |
+
|
128 |
+
class VisionMlp(nn.Module):
    def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden_dim)
        self.act = ACT2FN[hidden_act]
        self.fc2 = nn.Linear(hidden_dim, dim)

    def forward(self, x) -> torch.Tensor:
        return self.fc2(self.act(self.fc1(x)))

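# Eager attention over the flattened patch sequence. cu_seqlens holds cumulative sequence
# boundaries, e.g. cu_seqlens = [0, 4, 10] means tokens 0-3 and tokens 4-9 belong to
# different images/frames; the additive mask built below is block-diagonal, so attention
# never crosses those boundaries.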
class VisionAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
    ) -> torch.Tensor:
        seq_length = hidden_states.shape[0]
        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        attention_mask = torch.full(
            [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
        )
        for i in range(1, len(cu_seqlens)):
            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0

        q = q.transpose(0, 1)
        k = k.transpose(0, 1)
        v = v.transpose(0, 1)
        attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
        attn_weights = attn_weights + attention_mask
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
        attn_output = torch.matmul(attn_weights, v)
        attn_output = attn_output.transpose(0, 1)
        attn_output = attn_output.reshape(seq_length, -1)
        attn_output = self.proj(attn_output)
        return attn_output

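# Same computation using FlashAttention-2's variable-length kernel: cu_seqlens is passed
# directly to flash_attn_varlen_func, so no (seq_length x seq_length) mask is materialized.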
class VisionFlashAttention2(nn.Module):
    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
    ) -> torch.Tensor:
        seq_length = hidden_states.shape[0]
        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
        attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
            seq_length, -1
        )
        attn_output = self.proj(attn_output)
        return attn_output

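# torch.nn.functional.scaled_dot_product_attention variant: the same block-diagonal
# structure is expressed as a boolean mask (True = may attend).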
class VisionSdpaAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
    ) -> torch.Tensor:
        seq_length = hidden_states.shape[0]
        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool)
        for i in range(1, len(cu_seqlens)):
            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
        q = q.transpose(0, 1)
        k = k.transpose(0, 1)
        v = v.transpose(0, 1)
        attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
        attn_output = attn_output.transpose(0, 1)
        attn_output = attn_output.reshape(seq_length, -1)
        attn_output = self.proj(attn_output)
        return attn_output

QWEN2_VL_VISION_ATTENTION_CLASSES = {
    "eager": VisionAttention,
    "flash_attention_2": VisionFlashAttention2,
    "sdpa": VisionSdpaAttention,
}

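# Pre-norm transformer block; the attention implementation is selected from the table
# above, e.g. (hypothetical sizes) QWEN2_VL_VISION_ATTENTION_CLASSES["sdpa"](1152, num_heads=16).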
class Qwen2VLVisionBlock(nn.Module):
    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
        super().__init__()
        self.norm1 = LayerNorm(config.embed_dim, eps=1e-6)
        self.norm2 = LayerNorm(config.embed_dim, eps=1e-6)
        mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)

        self.attn = QWEN2_VL_VISION_ATTENTION_CLASSES[attn_implementation](
            config.embed_dim, num_heads=config.num_heads
        )
        self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)

    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
        hidden_states = hidden_states + self.attn(
            self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
        )
        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
        return hidden_states

class Qwen2VLPreTrainedModel(PreTrainedModel):
    config_class = Qwen2VLVisionConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen2VLVisionBlock"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv3d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

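# The vision tower itself: PatchEmbed -> per-patch rotary position embeddings -> a stack
# of Qwen2VLVisionBlock layers, returning one embed_dim feature per input patch (the
# PatchMerger path is left commented out).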
class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
    config_class = Qwen2VLVisionConfig
    _no_split_modules = ["Qwen2VLVisionBlock"]

    def __init__(self, config) -> None:
        super().__init__(config)
        self.spatial_merge_size = config.spatial_merge_size
        self.gradient_checkpointing = False

        self.patch_embed = PatchEmbed(
            patch_size=config.patch_size,
            temporal_patch_size=config.temporal_patch_size,
            in_channels=config.in_channels,
            embed_dim=config.embed_dim,
        )

        head_dim = config.embed_dim // config.num_heads
        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)

        self.blocks = nn.ModuleList(
            [Qwen2VLVisionBlock(config, config._attn_implementation) for _ in range(config.depth)]
        )
        # if self.spatial_merge_size > 1:
        #     self.merger = PatchMerger(dim=config.hidden_size, context_dim=config.embed_dim)

    def get_dtype(self) -> torch.dtype:
        return self.blocks[0].mlp.fc2.weight.dtype

    def get_device(self) -> torch.device:
        return self.blocks[0].mlp.fc2.weight.device

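    # Builds one (h, w) rotary index per patch. Within each image the indices are regrouped
    # so that every stride x stride neighborhood is contiguous (patches that may later be
    # merged sit next to each other), repeated over the t temporal groups, then used to
    # gather rows from the VisionRotaryEmbedding table.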
    def rot_pos_emb(self, grid_thw, strides):
        pos_ids = []
        for (t, h, w), stride in zip(grid_thw, strides):
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(
                h // stride,
                stride,
                w // stride,
                stride,
            )
            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
            hpos_ids = hpos_ids.flatten()

            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(
                h // stride,
                stride,
                w // stride,
                stride,
            )
            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
            wpos_ids = wpos_ids.flatten()
            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
        pos_ids = torch.cat(pos_ids, dim=0)
        max_grid_size = grid_thw[:, 1:].max()
        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb

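    # forward() expects already-flattened patches of shape
    # (sum(t * h * w), in_channels * temporal_patch_size * patch_size * patch_size),
    # grid_thws of shape (num_images, 3), and one stride per image; cu_seqlens marks the
    # boundary of every image/frame group so attention never crosses samples.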
    def forward(self, hidden_states, grid_thws, strides) -> torch.Tensor:
        hidden_states = self.patch_embed(hidden_states)

        # BUG: this code causes a DeepSpeed issue: `RuntimeError: disagreement between rank0 and rankx`
        # rotary_pos_emb = []
        # for thw in grid_thws:
        #     rotary_pos_emb.append(self.rot_pos_emb(thw).unsqueeze(0))
        # rotary_pos_emb1 = torch.cat(rotary_pos_emb, dim=1).squeeze(0)
        # grid_thws = torch.cat(grid_thws, dim=0)

        # New way of creating the rotary position embedding:
        # grid_thws has shape [batch_flatten_image_num, 3];
        # grid_thws = torch.cat(grid_thws, dim=0) is already done in `encoder.py`.
        rotary_pos_emb = self.rot_pos_emb(grid_thws, strides)

        cu_seqlens = torch.repeat_interleave(grid_thws[:, 1] * grid_thws[:, 2], grid_thws[:, 0]).cumsum(dim=0, dtype=torch.int32)
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)

        for blk in self.blocks:
            if self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(
                    blk.__call__,
                    hidden_states,
                    cu_seqlens,
                    rotary_pos_emb
                )
            else:
                hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)

        # if self.spatial_merge_size > 1:
        #     hidden_states = self.merger(hidden_states)
        return hidden_states
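A quick sketch of the inputs this vision tower expects (illustrative only, not part of the diff; the sizes below assume a single 28x28 image, the PatchEmbed defaults patch_size=14, temporal_patch_size=2, in_channels=3, and no spatial merging):

import torch
import torch.nn.functional as F

# one image -> (t, h, w) = (1, 2, 2) patch grid, stride 1 (no merging)
grid_thws = torch.tensor([[1, 2, 2]])
strides = [1]

# flattened patches: (t*h*w, in_channels * temporal_patch_size * patch_size * patch_size)
pixel_patches = torch.randn(4, 3 * 2 * 14 * 14)

# reproduce the cu_seqlens computed inside forward(): one attention block per frame group
seq_lens = torch.repeat_interleave(grid_thws[:, 1] * grid_thws[:, 2], grid_thws[:, 0])
cu_seqlens = F.pad(seq_lens.cumsum(dim=0, dtype=torch.int32), (1, 0), value=0)
print(cu_seqlens)  # tensor([0, 4], dtype=torch.int32)

These tensors would then be passed as model(pixel_patches, grid_thws, strides), with the model built from this repo's Qwen2VLVisionConfig.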