travis-xia committed on Jun 17

Commit

e358a10

verified ·

1 Parent(s): 07aec7c

Upload folder using huggingface_hub

Browse files

Files changed (34) hide show

.ipynb_checkpoints/mm_utils-checkpoint.py +855 -0
.ipynb_checkpoints/modeling_qwen2_flash-checkpoint.py +0 -0
added_tokens.json +24 -0
config.json +231 -0
constants.py +12 -0
conversation.py +592 -0
generation_config.json +15 -0
merges.txt +0 -0
mm_projector_builder.py +165 -0
mm_utils.py +855 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +905 -0
modeling_qwen2_flash.py +0 -0
modeling_videochat_flash.py +729 -0
optimizer.pt +3 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
rng_state_2.pth +3 -0
rng_state_3.pth +3 -0
rng_state_4.pth +3 -0
rng_state_5.pth +3 -0
rng_state_6.pth +3 -0
rng_state_7.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +31 -0
tokenizer.json +0 -0
tokenizer_config.json +208 -0
trainer_state.json +2821 -0
training_args.bin +3 -0
vision_tower_builder.py +632 -0
vocab.json +0 -0

.ipynb_checkpoints/mm_utils-checkpoint.py ADDED Viewed

	@@ -0,0 +1,855 @@

+from PIL import Image
+from io import BytesIO
+import base64
+import math
+import ast
+import re
+import torch
+from transformers import StoppingCriteria
+from .constants import IMAGE_TOKEN_INDEX
+import random
+import os
+import io
+import av
+import cv2
+import imageio
+from decord import VideoReader
+import numpy as np
+from torchvision.transforms.functional import pil_to_tensor
+######################## load video ########################
def get_index(num_frames, num_segments):
    """Return ``num_segments`` frame offsets spread across ``num_frames`` frames.

    The span ``num_frames - 1`` is cut into ``num_segments`` equal steps; each
    offset is half a step (truncated) plus the rounded multiple of the step.

    Returns:
        np.ndarray: one offset per segment.
    """
    step = float(num_frames - 1) / num_segments
    first = int(step / 2)
    picks = []
    for seg in range(num_segments):
        picks.append(first + int(np.round(step * seg)))
    return np.array(picks)
def pts_to_secs(pts: int, time_base: float, start_pts: int) -> float:
    """
    Translate a presentation timestamp into seconds.

    The offset ``pts - start_pts`` is truncated to an integer and scaled by
    ``time_base``. An infinite ``pts`` is passed through unchanged.

    Returns:
        time_in_seconds (float): The corresponding time in seconds.
    https://github.com/facebookresearch/pytorchvideo/blob/main/pytorchvideo/data/utils.py#L54-L64
    """
    if pts != math.inf:
        elapsed_ticks = int(pts - start_pts)
        return elapsed_ticks * time_base
    return math.inf
def get_pyav_video_duration(video_reader):
    """Return the duration, in seconds, of the first video stream of ``video_reader``.

    The stream's ``duration``, ``time_base`` and ``start_time`` are handed to
    ``pts_to_secs`` and the result is coerced to ``float``.
    """
    stream = video_reader.streams.video[0]
    seconds = pts_to_secs(stream.duration, stream.time_base, stream.start_time)
    return float(seconds)
def get_frame_indices(num_frames, vlen, sample='middle', fix_start=None, input_fps=1, min_num_frames=1, max_num_frames=-1, local_num_frames=8):
    """Choose which frame indices to read from a video containing ``vlen`` frames.

    Args:
        num_frames: requested number of frames (recomputed for ``'dynamic_fps1'``).
        vlen: number of frames available.
        sample: sampling strategy:
            * ``'rand'``   - one random index from each of ``num_frames`` intervals;
            * ``'middle'`` - the midpoint of each interval (or ``interval start +
              fix_start`` when ``fix_start`` is given);
            * ``'dynamic_fps1'`` - derive ``num_frames`` from the duration as a
              multiple of ``local_num_frames``, then sample like ``'middle'``;
            * ``'fps<x>'`` - sequential sampling at ``x`` frames per second.
        fix_start: optional fixed offset inside each interval (``'middle'`` only).
        input_fps: frame rate of the source, used to turn ``vlen`` into seconds.
        min_num_frames: lower bound on the number of returned indices; it is
            lowered when the video has fewer frames than that.
        max_num_frames: upper bound, applied only when positive.
        local_num_frames: granularity used by ``'dynamic_fps1'``.

    Returns:
        list: the selected frame indices.

    Raises:
        ValueError: if ``sample`` is not a supported strategy.
    """
    if min_num_frames > vlen:
        if sample == 'dynamic_fps1':
            # Round the available length down to a multiple of local_num_frames.
            min_num_frames = (vlen // local_num_frames) * local_num_frames
        else:
            min_num_frames = vlen

    if sample == 'dynamic_fps1':
        duration = float(vlen) / input_fps
        num_segments = int(duration // local_num_frames)
        # At least one full group of local_num_frames is always requested.
        num_frames = local_num_frames * max(num_segments, 1)
        if max_num_frames > 0:
            num_frames = min(num_frames, max_num_frames)
        sample = "middle"  # NOTE: from here on behave like uniform 'middle' sampling

    num_frames = max(min_num_frames, num_frames)

    if sample in ["rand", "middle"]:  # uniform sampling
        acc_samples = min(num_frames, vlen)
        # Split the video into `acc_samples` intervals and sample from each one.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(lo, hi - 1) for lo, hi in zip(intervals[:-1], intervals[1:])]
        if sample == 'rand':
            try:
                # range() excludes its stop value, so the last frame of each
                # interval is never drawn (behavior kept from the original).
                frame_indices = [random.choice(range(lo, hi)) for lo, hi in ranges]
            except IndexError:
                # random.choice raises IndexError on an empty range, which happens
                # when an interval is a single frame wide; fall back to a sorted
                # random subset of all frames.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [lo + fix_start for lo, _ in ranges]
        elif sample == 'middle':
            frame_indices = [(lo + hi) // 2 for lo, hi in ranges]
        else:
            raise NotImplementedError
        if len(frame_indices) < num_frames:
            # Pad with the last sampled frame up to the requested count.
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[:len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif "fps" in sample:  # e.g. "fps0.5": sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        # Gap between sampled frames; also the clip length each frame represents.
        delta = 1 / output_fps
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError(f"Not support sample type: {sample}")
    return frame_indices
def read_frames_av(video_path, num_frames, sample='rand', client=None, fix_start=None, min_num_frames=1, max_num_frames=-1, clip=None, local_num_frames=8):
    """Decode a video with PyAV and return a sampled subset of its frames.

    Every frame of the first video stream is decoded into memory, then
    ``get_frame_indices`` decides which ones to keep.

    Args:
        video_path: local path, or an ``s3://`` path fetched through ``client.get``.
        num_frames, sample, fix_start, min_num_frames, max_num_frames,
        local_num_frames: forwarded to ``get_frame_indices``.
        client: object exposing ``get(path)``; only used for ``s3://`` paths.
        clip: must be ``None``; clips are not supported by this reader.

    Returns:
        tuple: ``(frames, frame_indices, fps, duration)`` where ``frames`` is the
        stacked array of the selected RGB frames.

    Raises:
        NotImplementedError: if ``clip`` is given.
    """
    if clip is not None:
        raise NotImplementedError("av don't support clip!!!")

    byteio = None
    reader = None
    try:
        if 's3://' in video_path:
            byteio = io.BytesIO(client.get(video_path))
            reader = av.open(byteio)
        else:
            reader = av.open(video_path)

        frames = [f.to_rgb().to_ndarray() for f in reader.decode(video=0)]
        vlen = len(frames)
        duration = get_pyav_video_duration(reader)
        fps = vlen / float(duration)
        frame_indices = get_frame_indices(
            num_frames, vlen, sample=sample, fix_start=fix_start,
            input_fps=fps, min_num_frames=min_num_frames, max_num_frames=max_num_frames, local_num_frames=local_num_frames
        )
        frames = np.stack([frames[idx] for idx in frame_indices])  # stacked along a new leading axis
    finally:
        # Release the container and the in-memory buffer even if decoding fails.
        if reader is not None:
            reader.close()
        if byteio is not None:
            byteio.close()
    return frames, frame_indices, float(fps), duration
def read_frames_gif(
        video_path, num_frames, sample='rand', fix_start=None,
        min_num_frames=1, max_num_frames=-1, client=None, clip=None, local_num_frames=8
    ):
    """Read a GIF with imageio and return a sampled subset of its frames.

    The GIF is treated as a 1 fps video, so its duration in seconds equals its
    frame count. Selected frames are converted with ``cv2.COLOR_RGBA2RGB``; if
    they do not all share one shape they are cropped to the smallest height and
    width among them.

    Args:
        video_path: local path, or an ``s3://`` path fetched through ``client.get``.
        num_frames, sample, fix_start, min_num_frames, max_num_frames,
        local_num_frames: forwarded to ``get_frame_indices``.
        client: object exposing ``get(path)``; only used for ``s3://`` paths.
        clip: must be ``None``; clips are not supported by this reader.

    Returns:
        tuple: ``(frames, frame_indices, fps, duration)``.

    Raises:
        NotImplementedError: if ``clip`` is given.
    """
    if clip is not None:
        raise NotImplementedError("Gif don't support clip!!!")

    byteio = None
    gif = None
    try:
        if 's3://' in video_path:
            byteio = io.BytesIO(client.get(video_path))
            gif = imageio.get_reader(byteio)
        else:
            gif = imageio.get_reader(video_path)

        vlen = len(gif)
        fps = 1.
        duration = vlen / fps
        frame_indices = get_frame_indices(
            num_frames, vlen, sample=sample, fix_start=fix_start,
            min_num_frames=min_num_frames,
            max_num_frames=max_num_frames, local_num_frames=local_num_frames,
            input_fps=fps
        )

        # Set membership is O(1); an index repeated in frame_indices still yields
        # a single frame, exactly as with the original list lookup.
        wanted = set(frame_indices)
        frames = []
        for index, frame in enumerate(gif):
            if index in wanted:
                frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
                frames.append(frame.astype(np.uint8))

        if len({frame.shape for frame in frames}) > 1:
            # Frames disagree on size: crop all of them to the common minimum.
            min_h = min(frame.shape[0] for frame in frames)
            min_w = min(frame.shape[1] for frame in frames)
            frames = [frame[:min_h, :min_w] for frame in frames]
        frames = np.stack(frames)
    finally:
        # Release the reader and the in-memory buffer even if reading fails.
        if gif is not None:
            gif.close()
        if byteio is not None:
            byteio.close()
    return frames, frame_indices, float(fps), duration  # for tgif
def read_frames_decord(
        video_path, num_frames, sample='rand', fix_start=None, min_num_frames=1,
        max_num_frames=-1, client=None, clip=None, local_num_frames=8
    ):
    """Read sampled frames from a video with decord.

    ``.avi`` paths are delegated to ``read_frames_av``. When ``clip`` is given as
    ``(start, end)`` in seconds, sampling is restricted to that window and the
    returned indices are offset to the window's first frame.

    Args:
        video_path: local path, or an ``s3://`` path fetched through ``client.get``.
        num_frames, sample, fix_start, min_num_frames, max_num_frames,
        local_num_frames: forwarded to ``get_frame_indices``.
        client: object exposing ``get(path)``; only used for ``s3://`` paths.
        clip: optional ``(start, end)`` pair in seconds.

    Returns:
        tuple: ``(frames, frame_indices, fps, duration)``; ``frames`` is the
        batch returned by decord converted with ``asnumpy()``.

    Raises:
        ValueError: if an ``s3://`` path yields no bytes.
    """
    if video_path.endswith('.avi'):
        return read_frames_av(video_path=video_path, num_frames=num_frames, sample=sample,
                    fix_start=fix_start, min_num_frames=min_num_frames, max_num_frames=max_num_frames,
                    client=client, clip=clip, local_num_frames=local_num_frames)

    byteio = None
    try:
        if 's3://' in video_path:
            video_bytes = client.get(video_path)
            if video_bytes is None or len(video_bytes) == 0:
                raise ValueError(f"Can't read byte from {video_path}!")
            byteio = io.BytesIO(video_bytes)
            video_reader = VideoReader(byteio, num_threads=1)
        else:
            video_reader = VideoReader(video_path, num_threads=1)

        vlen = len(video_reader)
        fps = video_reader.get_avg_fps()
        duration = vlen / float(fps)

        if clip:
            start, end = clip
            start = max(0, start)
            end = min(duration - 0.1, end)
            # From here on vlen/duration describe the clip, not the whole video.
            duration = end - start
            vlen = int(duration * fps)
            start_index = int(start * fps)

        frame_indices = get_frame_indices(
            num_frames, vlen, sample=sample, fix_start=fix_start,
            input_fps=fps, min_num_frames=min_num_frames, max_num_frames=max_num_frames, local_num_frames=local_num_frames
        )
        if clip:
            # Shift clip-relative indices back to absolute frame positions.
            frame_indices = [f + start_index for f in frame_indices]

        frames = video_reader.get_batch(frame_indices).asnumpy()
        # Workaround referenced by the original author:
        # https://github.com/dmlc/decord/issues/208
        video_reader.seek(0)
    finally:
        # Release the in-memory buffer even if reading fails.
        if byteio is not None:
            byteio.close()
    return frames, frame_indices, float(fps), duration
def read_frames_img(
        video_path, num_frames, sample='rand', fix_start=None, min_num_frames=1,
        max_num_frames=-1, client=None, clip=None, local_num_frames=8
    ):
    """Read sampled frames from a directory of ``*_<number>.jpg/.jpeg/.png`` images.

    Files are ordered by the number preceding the extension. The frame rate is
    taken to be 3.0 when the path contains ``tvqa`` (case-insensitive) and 1.0
    otherwise.

    Args:
        video_path: local directory, or an ``s3://`` prefix listed via
            ``client.list`` and read via ``client.get``.
        num_frames: requested frame count (recomputed for ``'dynamic_fps1'``).
        sample, fix_start, min_num_frames, max_num_frames, local_num_frames:
            same meaning as in ``get_frame_indices``.
        client: storage client; only used for ``s3://`` paths.
        clip: optional ``(start, end)`` pair in seconds; when given, frames are
            spread evenly between the two timestamps.

    Returns:
        tuple: ``(frames, frame_indices, fps, duration)``; ``frames`` is a uint8
        array of the decoded images converted from BGR to RGB.

    Raises:
        NotImplementedError: if a file name has an unsupported extension.
    """
    def extract_frame_number(filename):
        # Pull the trailing "_<digits>" out of the file name; -1 when absent.
        if not filename.endswith(('.jpg', '.jpeg', '.png')):
            raise NotImplementedError(f"Wrong filename: {filename}")
        match = re.search(r'_(\d+)\.(?:jpg|jpeg|png)$', filename)
        return int(match.group(1)) if match else -1

    def sort_frames(frame_paths):
        # Order paths by the numeric part of their base name.
        return sorted(frame_paths, key=lambda x: extract_frame_number(os.path.basename(x)))

    if "s3://" in video_path:
        img_list = sort_frames(client.list(video_path))
    else:
        img_list = sort_frames(list(os.listdir(video_path)))

    fps = 3.0 if 'tvqa' in video_path.lower() else 1.0

    if clip is not None:
        start = max(0, float(clip[0]))
        end = min(len(img_list) / fps, float(clip[1]))
        vlen = (end - start) * fps
    else:
        vlen = len(img_list)
    duration = vlen / fps

    if min_num_frames > vlen:
        if sample == 'dynamic_fps1':
            min_num_frames = (vlen // local_num_frames) * local_num_frames
        else:
            min_num_frames = vlen

    if sample == 'dynamic_fps1':
        num_segments = int(duration // local_num_frames)
        num_frames = local_num_frames * max(num_segments, 1)
        # Only cap when a positive limit is given; the default of -1 means
        # "no limit", matching get_frame_indices.
        if max_num_frames > 0:
            num_frames = min(num_frames, max_num_frames)
        num_frames = max(min_num_frames, num_frames)
    num_frames = int(num_frames)

    if clip is not None:
        def _get_index_by_time(start_sec, end_sec, num_segments=8, fps=1., max_frame=9999):
            # Evenly spaced indices between the two timestamps (inclusive).
            start_idx = max(1, round(start_sec * fps))
            end_idx = min(round(end_sec * fps), max_frame)
            # A single requested frame has no spacing; avoid dividing by zero.
            seg_size = float(end_idx - start_idx) / (num_segments - 1) if num_segments != 1 else 0.0
            offsets = np.array([start_idx + int(np.round(seg_size * idx)) for idx in range(num_segments)])
            return offsets
        frame_indices = _get_index_by_time(float(clip[0]), float(clip[1]), num_segments=num_frames, fps=fps, max_frame=len(img_list)-1)
    else:
        # NOTE(review): input_fps is left at its default of 1 here even when fps
        # is 3.0 - confirm whether that is intended.
        frame_indices = get_frame_indices(
            num_frames, vlen, sample=sample, fix_start=fix_start,
            min_num_frames=min_num_frames,
            max_num_frames=max_num_frames, local_num_frames=local_num_frames
        )

    imgs = []
    for idx in frame_indices:
        frame_fname = os.path.join(video_path, img_list[idx])
        if "s3://" in video_path:
            img_bytes = client.get(frame_fname)
        else:
            with open(frame_fname, 'rb') as f:
                img_bytes = f.read()
        img_np = np.frombuffer(img_bytes, np.uint8)
        img = cv2.imdecode(img_np, cv2.IMREAD_COLOR)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        imgs.append(img)
    frames = np.array(imgs, dtype=np.uint8)
    return frames, frame_indices, fps, duration
# Dispatch table: reader name -> frame-reading function.
# 'img' and 'frame' both map to the image-directory reader.
VIDEO_READER_FUNCS = dict(
    av=read_frames_av,
    decord=read_frames_decord,
    gif=read_frames_gif,
    img=read_frames_img,
    frame=read_frames_img,
)
def load_video(video_path, max_num_frames=512, media_dict=None):
    """Load a video and build the prompt sentence that describes the sampling.

    Args:
        video_path: a path string, or a one-element sequence holding it.
        max_num_frames: passed to the reader both as the requested frame count
            and as the upper bound.
        media_dict: options dict. ``'video_read_type'`` selects the entry of
            ``VIDEO_READER_FUNCS``; if ``'start'`` is present, ``'start'`` and
            ``'end'`` form the clip passed to the reader. Defaults to
            ``{'video_read_type': 'decord'}``.

    Returns:
        tuple: ``(frames, msg)`` where ``msg`` states the duration and the
        number of sampled frames.
    """
    if media_dict is None:
        media_dict = {'video_read_type': 'decord'}

    # isinstance also accepts str subclasses, which `type(x) != str` would
    # wrongly treat as a sequence of paths.
    if not isinstance(video_path, str):
        assert len(video_path) == 1, video_path
        video_path = video_path[0]

    if 'start' in media_dict:
        clip = [media_dict['start'], media_dict['end']]
    else:
        clip = None

    if 's3://' in video_path:
        # Imported lazily so the dependency is only needed for s3 paths.
        from petrel_client.client import Client
        client = Client(conf_path='~/petreloss.conf')
    else:
        client = None

    read_frames = VIDEO_READER_FUNCS[media_dict['video_read_type']]
    frames, frame_indices, fps, duration = read_frames(
        video_path=video_path, num_frames=max_num_frames, sample='dynamic_fps1',
        fix_start=None, min_num_frames=64, max_num_frames=max_num_frames,
        client=client, clip=clip, local_num_frames=8
    )

    msg = f"\nThe video lasts for {duration:.2f} seconds, and {len(frame_indices)} frames are uniformly sampled from it. "
    return frames, msg
+######################## load video ########################
def resize_and_center_crop(image, shortest_edge_length):
    """Resize so the shorter side equals ``shortest_edge_length``, then center-crop a square.

    Args:
        image: a Pillow image (its ``width``, ``height``, ``resize`` and ``crop``
            are used).
        shortest_edge_length: target length of the shorter side and of the
            square crop.

    Returns:
        The cropped image returned by ``Image.crop``.
    """
    # Scale while keeping the aspect ratio; the shorter side lands on the target.
    aspect_ratio = float(image.width) / float(image.height)
    if aspect_ratio > 1:
        new_width = int(shortest_edge_length * aspect_ratio)
        new_height = shortest_edge_length
    else:
        new_width = shortest_edge_length
        new_height = int(shortest_edge_length / aspect_ratio)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    resized_image = image.resize((new_width, new_height), Image.LANCZOS)

    # Box of side `shortest_edge_length` centered in the resized image.
    left = (new_width - shortest_edge_length) / 2
    top = (new_height - shortest_edge_length) / 2
    right = (new_width + shortest_edge_length) / 2
    bottom = (new_height + shortest_edge_length) / 2
    return resized_image.crop((left, top, right, bottom))
def auto_pad_images(image, grid_params):
    """Resize an image toward the best-matching grid resolution and pad it with black.

    Candidate resolutions are every ``(w, h)`` pair drawn from ``grid_params``.
    The candidates whose aspect ratio is closest to the input's are kept, and
    among them the one whose longer side is closest to the input's longer side
    becomes the target.

    Args:
        image: a Pillow image.
        grid_params: non-empty sequence of side lengths.

    Returns:
        A new RGB image of the target resolution with the resized input pasted
        at the center offset.
    """
    assert isinstance(image, Image.Image), "Input should be a Pillow Image"
    assert len(grid_params) > 0, "Grid parameters should not be empty"

    # Find the candidate aspect ratio closest to the input's.
    input_width, input_height = image.size
    input_aspect_ratio = input_width / input_height
    candidate_resolutions = [(w / h, w, h) for w in grid_params for h in grid_params]
    closest_aspect_ratio = min(candidate_resolutions, key=lambda x: abs(input_aspect_ratio - x[0]))

    # Among candidates sharing that ratio, pick the size nearest the input's.
    candidate_resolutions = [(x[1], x[2]) for x in candidate_resolutions if abs(x[0] - closest_aspect_ratio[0]) < 1e-3]
    target_resolution = min(candidate_resolutions, key=lambda res: abs(max(input_width, input_height) / max(res) - 1))

    # Keep the input's aspect ratio: fix one side, derive the other.
    resize_width, resize_height = target_resolution
    if input_width > input_height:
        resize_height = int(resize_width / input_aspect_ratio)
    else:
        resize_width = int(resize_height * input_aspect_ratio)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    resized_image = image.resize((resize_width, resize_height), Image.LANCZOS)

    # Pad to the target resolution, splitting the slack evenly on both sides.
    pad_width = target_resolution[0] - resize_width
    pad_height = target_resolution[1] - resize_height
    padded_image = Image.new("RGB", target_resolution, color=(0, 0, 0))
    padded_image.paste(resized_image, (pad_width // 2, pad_height // 2))
    return padded_image
def extract_patches(image, patch_size, overlap_ratio):
    """Cut square patches of side ``patch_size`` out of a Pillow image.

    Neighbouring patches are ``int(patch_size * (1 - overlap_ratio))`` pixels
    apart, and the patch grid is centered inside the image. Patches are
    returned row by row.
    """
    assert isinstance(image, Image.Image), "Input should be a Pillow Image"
    assert patch_size > 0, "Patch size should be greater than 0"
    assert 0 <= overlap_ratio < 1, "Overlap ratio should be between 0 and 1"

    img_w, img_h = image.size
    step = int(patch_size * (1 - overlap_ratio))

    rows = (img_h - patch_size) // step + 1
    cols = (img_w - patch_size) // step + 1

    # Offsets that center the grid of patches within the image.
    top0 = (img_h - (rows - 1) * step - patch_size) // 2
    left0 = (img_w - (cols - 1) * step - patch_size) // 2

    return [
        image.crop((left, top, left + patch_size, top + patch_size))
        for top in range(top0, top0 + rows * step, step)
        for left in range(left0, left0 + cols * step, step)
    ]
def process_highres_image_crop_split(image, data_args, processor=None):
    """Center-crop an image, split it into non-overlapping patches and preprocess each.

    The crop and split sizes come from ``data_args.image_crop_resolution`` and
    ``data_args.image_split_resolution``; ``data_args.image_processor`` is used
    when no ``processor`` is passed.

    Returns:
        torch.Tensor: the preprocessed patches stacked along dimension 0.
    """
    crop_side = data_args.image_crop_resolution
    patch_side = data_args.image_split_resolution
    if processor is None:
        processor = data_args.image_processor

    cropped = resize_and_center_crop(image, crop_side)
    tensors = []
    for patch in extract_patches(cropped, patch_size=patch_side, overlap_ratio=0):
        tensors.append(processor.preprocess(patch, return_tensors="pt")["pixel_values"][0])
    return torch.stack(tensors, dim=0)
def process_highres_image(image, processor, grid_pinpoints):
    """Build a stack of preprocessed views: one global resize plus high-res patches.

    Args:
        image: a Pillow image.
        processor: image processor exposing ``image_mean``, ``size["shortest_edge"]``
            and ``preprocess``.
        grid_pinpoints: comma-separated side lengths, e.g. ``"224,448"``.

    Returns:
        torch.Tensor: the global view followed by the patches, stacked on dim 0.
    """
    sides = [int(token) for token in grid_pinpoints.split(",")]
    # FIXME (kept from the original): the largest side is always selected; the
    # "smallest side that fits the image" choice was overridden unconditionally.
    select_size = max(sides)

    base_edge = processor.size["shortest_edge"]
    fill = tuple(int(channel * 255) for channel in processor.image_mean)
    squared = expand2square(image, fill)

    # FIXME (kept from the original): the global view is resized, not padded.
    global_view = image.resize((base_edge, base_edge))
    squared = squared.resize((select_size, select_size))

    views = [global_view]
    views.extend(extract_patches(squared, patch_size=base_edge, overlap_ratio=0))
    tensors = [processor.preprocess(view, return_tensors="pt")["pixel_values"][0] for view in views]
    return torch.stack(tensors, dim=0)
def select_best_resolution(original_size, possible_resolutions, max_resolutions, patch_size):
    """
    Pick, from a list of candidate resolutions, the one that best fits an image.

    The winner keeps the most image pixels after an aspect-preserving downscale;
    ties go to the candidate that wastes the least area.
    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): Candidate resolutions in the format [(width1, height1), (width2, height2), ...].
        max_resolutions: Optional area budget. When not None, a candidate other than a single
            patch is skipped if its area plus one patch area exceeds the budget.
        patch_size (int): Side length of one patch.
    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    src_w, src_h = original_size
    single_patch_area = patch_size * patch_size

    chosen = None
    top_effective = 0
    least_wasted = float("inf")

    for cand_w, cand_h in possible_resolutions:
        cand_area = cand_w * cand_h
        if max_resolutions is not None and cand_area != single_patch_area:
            # NOTE: one extra global patch has to be counted against the budget.
            if cand_area + single_patch_area > max_resolutions:
                continue

        # Size of the image once shrunk to fit the candidate, aspect ratio kept.
        ratio = min(cand_w / src_w, cand_h / src_h)
        fitted_w = int(src_w * ratio)
        fitted_h = int(src_h * ratio)

        effective = min(fitted_w * fitted_h, src_w * src_h)
        wasted = cand_area - effective

        is_better = effective > top_effective
        is_tie_with_less_waste = effective == top_effective and wasted < least_wasted
        if is_better or is_tie_with_less_waste:
            top_effective = effective
            least_wasted = wasted
            chosen = (cand_w, cand_h)

    assert chosen is not None, f"Can't find suitable fit in {possible_resolutions} at max:{max_resolutions}"
    return chosen
def resize_and_pad_image(image, target_resolution):
    """
    Fit an image inside a target resolution, keeping its aspect ratio, and pad with black.
    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The target resolution (width, height) of the image.
    Returns:
        PIL.Image.Image: The resized and padded image.
    """
    src_w, src_h = image.size
    dst_w, dst_h = target_resolution

    ratio_w = dst_w / src_w
    ratio_h = dst_h / src_h

    if ratio_w < ratio_h:
        # The width is the limiting side and is filled completely.
        fit_w = dst_w
        fit_h = min(math.ceil(src_h * ratio_w), dst_h)
    else:
        # The height is the limiting side and is filled completely.
        fit_h = dst_h
        fit_w = min(math.ceil(src_w * ratio_h), dst_w)

    shrunk = image.resize((fit_w, fit_h))

    # Black canvas of the target size with the resized image at the center offset.
    canvas = Image.new("RGB", (dst_w, dst_h), (0, 0, 0))
    offset = ((dst_w - fit_w) // 2, (dst_h - fit_h) // 2)
    canvas.paste(shrunk, offset)
    return canvas
def divide_to_patches(image, patch_size):
    """
    Tile an image into square patches, scanning rows top to bottom.
    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The size of each patch.
    Returns:
        list: A list of PIL.Image.Image objects representing the patches.
    """
    img_w, img_h = image.size
    return [
        image.crop((left, top, left + patch_size, top + patch_size))
        for top in range(0, img_h, patch_size)
        for left in range(0, img_w, patch_size)
    ]
def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size, max_resolutions=None):
    """
    Work out how many patches wide and high the any-resolution grid will be for an image.
    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (str): A string representation of a list of possible resolutions,
            a range such as "(1x1),...,(3x3)", or an already-built list.
        patch_size (int): The size of each image patch.
        max_resolutions: Optional area budget forwarded to ``select_best_resolution``.
    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
        # The first and last "(AxB)" groups give the inclusive grid range.
        found = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
        low = tuple(int(v) for v in found[0])
        high = tuple(int(v) for v in found[-1])
        # Enumerate every grid in the range, expressed in pixels.
        grid_pinpoints = [
            [cols * patch_size, rows * patch_size]
            for cols in range(low[0], high[0] + 1)
            for rows in range(low[1], high[1] + 1)
        ]

    candidates = grid_pinpoints if type(grid_pinpoints) is list else ast.literal_eval(grid_pinpoints)
    best_w, best_h = select_best_resolution(image_size, candidates, max_resolutions=max_resolutions, patch_size=patch_size)
    return best_w // patch_size, best_h // patch_size
def process_anyres_image(image, processor, grid_pinpoints):
    """
    Process an image with variable resolutions (padded variant) - disabled.

    This entry point always raises. The implementation that used to follow the
    ``raise`` was unreachable and called ``select_best_resolution`` without its
    required ``max_resolutions`` and ``patch_size`` arguments, so it has been
    removed. ``process_anyres_image_nopad`` is the implemented variant.
    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object.
        grid_pinpoints (str): A string representation of a list of possible resolutions.
    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError(
        "process_anyres_image is disabled; use process_anyres_image_nopad instead"
    )
def process_anyres_image_nopad(image, processor, grid_pinpoints):
    """
    Process an image at the best-fitting grid resolution, resizing without padding.
    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object.
        grid_pinpoints (str): A string representation of a list of possible resolutions,
            a range such as "(1x1),...,(3x3)", or an already-built list.
    Returns:
        torch.Tensor: A tensor containing the processed image patches, with the
            square global view first.
    """
    # processor.size may be index-able by position or keyed by "shortest_edge".
    try:
        patch_size = processor.size[0]
    except Exception:
        patch_size = processor.size["shortest_edge"]
    assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"

    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
        # The first and last "(AxB)" groups give the inclusive grid range.
        found = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
        low = tuple(int(v) for v in found[0])
        high = tuple(int(v) for v in found[-1])
        # Enumerate every grid in the range, expressed in pixels.
        grid_pinpoints = [
            [cols * patch_size, rows * patch_size]
            for cols in range(low[0], high[0] + 1)
            for rows in range(low[1], high[1] + 1)
        ]

    candidates = grid_pinpoints if type(grid_pinpoints) is list else ast.literal_eval(grid_pinpoints)
    # No area budget is applied to images for now.
    best_resolution = select_best_resolution(image.size, candidates, max_resolutions=None, patch_size=patch_size)

    # The image is stretched to the chosen resolution (no padding) and tiled.
    tiles = divide_to_patches(image.resize(best_resolution), patch_size)

    # FIXME (kept from the original): the global view is resized rather than
    # padded; left as is for consistency with earlier behaviour.
    if isinstance(processor.size, dict):
        shortest_edge = processor.size["shortest_edge"]
    else:
        shortest_edge = min(processor.size)
    global_view = image.resize((shortest_edge, shortest_edge))

    tensors = []
    for view in [global_view] + tiles:
        tensors.append(processor.preprocess(view, return_tensors="pt")["pixel_values"][0])
    return torch.stack(tensors, dim=0)
def load_image_from_base64(image):
    """Decode a base64-encoded image and open it with Pillow."""
    raw_bytes = base64.b64decode(image)
    buffer = BytesIO(raw_bytes)
    return Image.open(buffer)
def expand2square(pil_img, background_color):
    """Pad an image to a square of its longer side, centering it on ``background_color``.

    A square input is returned as is (the same object).
    """
    w, h = pil_img.size
    if w == h:
        return pil_img

    side = max(w, h)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    # Landscape images are centered vertically, portrait ones horizontally.
    offset = (0, (w - h) // 2) if w > h else ((h - w) // 2, 0)
    canvas.paste(pil_img, offset)
    return canvas
+def process_images(images, image_processor, model_cfg):
+    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
+    new_images = []
+    if image_aspect_ratio == "highres":
+        raise NotImplementedError
+        for image in images:
+            image = process_highres_image(image, image_processor, model_cfg.image_grid_pinpoints)
+            new_images.append(image)
+    elif "anyres" in image_aspect_ratio:
+        for image in images:
+            if "nopad" in image_aspect_ratio:
+                image = process_anyres_image_nopad(image, image_processor, model_cfg.image_grid_pinpoints)
+            else:
+                image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
+            new_images.append(image)
+    elif image_aspect_ratio == "crop_split":
+        raise NotImplementedError
+        for image in images:
+            image = process_highres_image_crop_split(image, model_cfg, image_processor)
+            new_images.append(image)
+    elif image_aspect_ratio == "pad":
+        for image in images:
+            image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
+            image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
+            new_images.append(image)
+    else:
+        return image_processor.preprocess(images, return_tensors="pt")["pixel_values"]
+    if all(x.shape == new_images[0].shape for x in new_images):
+        new_images = torch.stack(new_images, dim=0)
+    return new_images
+def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
+    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
+    def insert_separator(X, sep):
+        return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
+    input_ids = []
+    offset = 0
+    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+        offset = 1
+        input_ids.append(prompt_chunks[0][0])
+    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
+        input_ids.extend(x[offset:])
+    if return_tensors is not None:
+        if return_tensors == "pt":
+            return torch.tensor(input_ids, dtype=torch.long)
+        raise ValueError(f"Unsupported tensor type: {return_tensors}")
+    return input_ids
+def get_model_name_from_path(model_path):
+    model_path = model_path.strip("/")
+    model_paths = model_path.split("/")
+    if model_paths[-1].startswith("checkpoint-"):
+        return model_paths[-2] + "_" + model_paths[-1]
+    else:
+        return model_paths[-1]
+class KeywordsStoppingCriteria(StoppingCriteria):
+    def __init__(self, keywords, tokenizer, input_ids):
+        self.keywords = keywords
+        self.keyword_ids = []
+        for keyword in keywords:
+            cur_keyword_ids = tokenizer(keyword).input_ids
+            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
+                cur_keyword_ids = cur_keyword_ids[1:]
+            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
+        self.tokenizer = tokenizer
+        self.start_len = input_ids.shape[1]
+    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
+        offset = min(output_ids.shape[1] - self.start_len, 3)
+        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
+        for keyword_id in self.keyword_ids:
+            if output_ids[0, -keyword_id.shape[0] :] == keyword_id:
+                return True
+        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
+        for keyword in self.keywords:
+            if keyword in outputs:
+                return True
+        return False

.ipynb_checkpoints/modeling_qwen2_flash-checkpoint.py ADDED Viewed

The diff for this file is too large to render. See raw diff

added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

config.json ADDED Viewed

	@@ -0,0 +1,231 @@

+{
+  "_name_or_path": "/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/xiacheng-240108120111/vcflash0607_brandnew/videochat-flash_qwen2_5_all_compress_train_0",
+  "architectures": [
+    "VideoChatFlashQwenForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "modeling_videochat_flash.VideoChatFlashQwenConfig",
+    "AutoModel": "modeling_videochat_flash.VideoChatFlashQwenForCausalLM"
+  },
+  "bos_token_id": 151643,
+  "dual_chunk_attention_config": {
+    "chunk_size": 262144,
+    "local_size": 8192,
+    "original_max_position_embeddings": 262144
+  },
+  "eos_token_id": 151645,
+  "frame_aspect_ratio": "square",
+  "frame_grid_pinpoints": null,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "image_aspect_ratio": "anyres_nopad",
+  "image_crop_resolution": null,
+  "image_grid_pinpoints": [
+    [
+      224,
+      224
+    ],
+    [
+      224,
+      448
+    ],
+    [
+      224,
+      672
+    ],
+    [
+      224,
+      896
+    ],
+    [
+      224,
+      1120
+    ],
+    [
+      224,
+      1344
+    ],
+    [
+      448,
+      224
+    ],
+    [
+      448,
+      448
+    ],
+    [
+      448,
+      672
+    ],
+    [
+      448,
+      896
+    ],
+    [
+      448,
+      1120
+    ],
+    [
+      448,
+      1344
+    ],
+    [
+      672,
+      224
+    ],
+    [
+      672,
+      448
+    ],
+    [
+      672,
+      672
+    ],
+    [
+      672,
+      896
+    ],
+    [
+      672,
+      1120
+    ],
+    [
+      672,
+      1344
+    ],
+    [
+      896,
+      224
+    ],
+    [
+      896,
+      448
+    ],
+    [
+      896,
+      672
+    ],
+    [
+      896,
+      896
+    ],
+    [
+      896,
+      1120
+    ],
+    [
+      896,
+      1344
+    ],
+    [
+      1120,
+      224
+    ],
+    [
+      1120,
+      448
+    ],
+    [
+      1120,
+      672
+    ],
+    [
+      1120,
+      896
+    ],
+    [
+      1120,
+      1120
+    ],
+    [
+      1120,
+      1344
+    ],
+    [
+      1344,
+      224
+    ],
+    [
+      1344,
+      448
+    ],
+    [
+      1344,
+      672
+    ],
+    [
+      1344,
+      896
+    ],
+    [
+      1344,
+      1120
+    ],
+    [
+      1344,
+      1344
+    ]
+  ],
+  "image_split_resolution": null,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "llm_compress_layer_list": [
+    8,
+    16,
+    24
+  ],
+  "llm_compress_type": "attention",
+  "llm_image_token_ratio_list": [
+    1.0,
+    0.5,
+    0.25,
+    0.125
+  ],
+  "max_num_pixels": 14745600000,
+  "max_position_embeddings": 64000,
+  "max_window_layers": 28,
+  "min_slow_num_frames": 4,
+  "mm_close_init": false,
+  "mm_hidden_size": 1024,
+  "mm_llm_compress": false,
+  "mm_local_num_frames": 4,
+  "mm_newline_position": "nothing",
+  "mm_num_compress_latents": 128,
+  "mm_num_compress_query_type": "learnable",
+  "mm_patch_merge_type": "spatial_nopad",
+  "mm_pos_num_frames": 8,
+  "mm_projector_lr": null,
+  "mm_projector_type": "tome16_mlp_hd64",
+  "mm_resampler_type": null,
+  "mm_spatial_pool_mode": "bilinear",
+  "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+  "mm_use_im_patch_token": false,
+  "mm_use_im_start_end": false,
+  "mm_vision_select_feature": "patch",
+  "mm_vision_select_layer": -2,
+  "mm_vision_tower": "umt-large",
+  "mm_vision_tower_lr": 2e-06,
+  "model_type": "videochat_flash_qwen",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "pos_skipping_range": 4096,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 32768,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.39.2",
+  "use_cache": true,
+  "use_mm_proj": true,
+  "use_pos_skipping": false,
+  "use_sliding_window": false,
+  "vision_encode_type": "video_image",
+  "vision_tower_pretrained": null,
+  "vocab_size": 152064
+}

constants.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# Heartbeat settings; units are not stated in this file (presumably seconds — TODO confirm).
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+# Presumably the directory where logs are written; no consumer is visible here — verify.
+LOGDIR = "."
+# Model Constants
+# Presumably the label id excluded from the loss (same value as PyTorch's default ignore_index) — confirm.
+IGNORE_INDEX = -100
+# Placeholder id that mm_utils.tokenizer_image_token inserts for every "<image>" marker in a prompt.
+IMAGE_TOKEN_INDEX = -200
+# Marker string used in prompts for an image; mm_utils splits prompts on this same literal.
+DEFAULT_IMAGE_TOKEN = "<image>"
+# Additional special-token strings; their consumers are not visible in this file.
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"

conversation.py ADDED Viewed

	@@ -0,0 +1,592 @@

+import dataclasses
+from enum import auto, Enum
+from typing import List, Any, Dict, Union, Tuple
+import re
+import base64
+from io import BytesIO
+from PIL import Image
+from transformers import AutoTokenizer
+class SeparatorStyle(Enum):
+    """Different separator style."""
+    # "<role>: <message><sep>" with a single separator.
+    SINGLE = auto()
+    # Like SINGLE, but the separator alternates between sep and sep2 per turn.
+    TWO = auto()
+    # "<role><message><sep>"; the role strings carry their own markup.
+    MPT = auto()
+    # Messages only, no role text; separators alternate between sep and sep2.
+    PLAIN = auto()
+    # "<role>\n<message><sep>\n", with "<image>" prepended once per attached image.
+    CHATML = auto()
+    # "[INST] ... [/INST]" wrapping with an optional <<SYS>> block.
+    LLAMA_2 = auto()
+    # Delegates to the tokenizer's apply_chat_template.
+    LLAMA_3 = auto()
+    # NOTE(review): Conversation.get_prompt has no branch for QWEN (it raises ValueError);
+    # the qwen template in this file uses CHATML instead.
+    QWEN = auto()
+    # "<role><message><sep>" with strict user/model alternation asserted.
+    GEMMA = auto()
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    sep2: str = None
+    version: str = "Unknown"
+    tokenizer_id: str = ""
+    tokenizer: Any = None
+    # Stop criteria (the default one is EOS token)
+    stop_str: Union[str, List[str]] = None
+    # Stops generation if meeting any token in this list
+    stop_token_ids: List[int] = None
+    skip_next: bool = False
+    def get_prompt(self):
+        messages = self.messages
+        if len(messages) > 0 and type(messages[0][1]) is tuple:
+            messages = self.messages.copy()
+            init_role, init_msg = messages[0].copy()
+            init_msg = init_msg[0]
+            if "mmtag" in self.version:
+                init_msg = init_msg.replace("<image>", "").strip()
+                messages[0] = (init_role, init_msg)
+                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
+                messages.insert(1, (self.roles[1], "Received."))
+            elif not init_msg.startswith("<image>"):
+                init_msg = init_msg.replace("<image>", "").strip()
+                messages[0] = (init_role, "<image>\n" + init_msg)
+            else:
+                messages[0] = (init_role, init_msg)
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.CHATML:
+            ret = "" if self.system == "" else self.system + self.sep + "\n"
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, images, _ = message
+                        message = "<image>" * len(images) + message
+                    ret += role + "\n" + message + self.sep + "\n"
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA_3:
+            chat_template_messages = [{"role": "system", "content": self.system}]
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, images = message
+                        message = "<image>" * len(images) + message
+                    chat_template_messages.append({"role": role, "content": message})
+            # print(chat_template_messages)
+            return self.tokenizer.apply_chat_template(chat_template_messages, tokenize=False, add_generation_prompt=True)
+            # ret = "" if self.system == "" else self.system + self.sep + "\n"
+            # for role, message in messages:
+            #     if message:
+            #         if type(message) is tuple:
+            #             message, images = message
+            #             message = "<image>" * len(images) + message
+            #         ret += role + "\n" + message + self.sep + "\n"
+            #     else:
+            #         ret += role + "\n"
+            # return ret
+        elif self.sep_style == SeparatorStyle.MPT:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+        elif self.sep_style == SeparatorStyle.GEMMA:
+            ret = ""
+            for i, (role, message) in enumerate(messages):
+                assert role == self.roles[i % 2], "Conversation should alternate user/assistant/user/assistant/..."
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+        elif self.sep_style == SeparatorStyle.LLAMA_2:
+            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
+            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
+            ret = ""
+            for i, (role, message) in enumerate(messages):
+                if i == 0:
+                    assert message, "first message should not be none"
+                    assert role == self.roles[0], "first message should come from user"
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    if i == 0:
+                        message = wrap_sys(self.system) + message
+                    if i % 2 == 0:
+                        message = wrap_inst(message)
+                        ret += self.sep + message
+                    else:
+                        ret += " " + message + " " + self.sep2
+                else:
+                    ret += ""
+            ret = ret.lstrip(self.sep)
+        elif self.sep_style == SeparatorStyle.PLAIN:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += message + seps[i % 2]
+                else:
+                    ret += ""
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+        return ret
+    def append_message(self, role, message):
+        self.messages.append([role, message])
+    def process_image(self, image, image_process_mode, return_pil=False, image_format="PNG"):
+        if image_process_mode == "Pad":
+            def expand2square(pil_img, background_color=(122, 116, 104)):
+                width, height = pil_img.size
+                if width == height:
+                    return pil_img
+                elif width > height:
+                    result = Image.new(pil_img.mode, (width, width), background_color)
+                    result.paste(pil_img, (0, (width - height) // 2))
+                    return result
+                else:
+                    result = Image.new(pil_img.mode, (height, height), background_color)
+                    result.paste(pil_img, ((height - width) // 2, 0))
+                    return result
+            image = expand2square(image)
+        elif image_process_mode in ["Default", "Crop"]:
+            pass
+        elif image_process_mode == "Resize":
+            image = image.resize((336, 336))
+        else:
+            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+        if type(image) is not Image.Image:
+            image = Image.open(image).convert("RGB")
+        max_hw, min_hw = max(image.size), min(image.size)
+        aspect_ratio = max_hw / min_hw
+        max_len, min_len = 672, 448
+        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+        longest_edge = int(shortest_edge * aspect_ratio)
+        W, H = image.size
+        if H > W:
+            H, W = longest_edge, shortest_edge
+        else:
+            H, W = shortest_edge, longest_edge
+        image = image.resize((W, H))
+        if return_pil:
+            return image
+        else:
+            buffered = BytesIO()
+            image.save(buffered, format=image_format)
+            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+            return img_b64_str
+    def get_images(self, return_pil=False, return_path=False):
+        images = []
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    msg, image, image_process_mode = msg
+                    if type(image) != list:
+                        image = [image]
+                    for img in image:
+                        if not return_path and self.is_image_file(img):
+                            img = self.process_image(img, image_process_mode, return_pil=return_pil)
+                        else:
+                            images.append(img)
+        return images
+    def is_image_file(self, filename):
+        image_extensions = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"]
+        return any(filename.lower().endswith(ext) for ext in image_extensions)
+    def is_video_file(self, filename):
+        video_extensions = [".mp4", ".mov", ".avi", ".mkv", ".wmv", ".flv", ".mpeg", ".mpg"]
+        return any(filename.lower().endswith(ext) for ext in video_extensions)
+    def to_gradio_chatbot(self):
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    msg, image, image_process_mode = msg
+                    if type(image) != list:
+                        image = [image]
+                    if len(image) == 1:
+                        msg = "<image>\n" + msg.replace("<image>", "").strip()
+                    else:
+                        msg = re.sub(r"(<image>)\n(?=<image>)", r"\1 ", msg)
+                    img_str_list = []
+                    for img in image:
+                        if self.is_image_file(img):
+                            img_b64_str = self.process_image(img, "Default", return_pil=False, image_format="JPEG")
+                            img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" style="max-width: 256px; max-height: 256px; width: auto; height: auto; object-fit: contain;"/>'
+                            img_str_list.append(img_str)
+                        elif self.is_video_file(img):
+                            ret.append(((img,), None))
+                    msg = msg.strip()
+                    img_place_holder = ""
+                    for img_str in img_str_list:
+                        img_place_holder += f"{img_str}\n\n"
+                    if len(img_str_list) > 0:
+                        msg = f"{img_place_holder}\n\n{msg}"
+                    if len(msg) > 0:
+                        ret.append([msg, None])
+                else:
+                    ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+    def copy(self):
+        return Conversation(system=self.system, roles=self.roles, messages=[[x, y] for x, y in self.messages], offset=self.offset, sep_style=self.sep_style, sep=self.sep, sep2=self.sep2, version=self.version)
+    def dict(self):
+        if len(self.get_images()) > 0:
+            return {
+                "system": self.system,
+                "roles": self.roles,
+                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
+                "offset": self.offset,
+                "sep": self.sep,
+                "sep2": self.sep2,
+            }
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+        }
+# Vicuna v0 template: SINGLE style with "###" as separator, seeded with one example exchange.
+# offset=2 makes get_images / to_gradio_chatbot skip the two seed messages.
+conv_vicuna_v0 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=[
+        ["Human", "What are the key differences between renewable and non-renewable energy sources?"],
+        [
+            "Assistant",
+            "Renewable energy sources are those that can be replenished naturally in a relatively "
+            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+            "renewable and non-renewable energy sources:\n"
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
+        ],
+    ],
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# Vicuna v1 template: TWO style, " " after user turns and "</s>" after assistant turns.
+conv_vicuna_v1 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# Llama-2 chat template with the long safety system prompt; rendered with [INST] wrapping.
+conv_llama_2 = Conversation(
+    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+# Llama-2 style template with a vision-assistant system prompt.
+conv_llava_llama_2 = Conversation(
+    system="You are a helpful language and vision assistant. " "You are able to understand the visual content that the user provides, " "and assist the user with a variety of tasks using natural language.",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+# conv_llava_llama_3 = Conversation(
+#     system="You are a helpful language and vision assistant. " "You are able to understand the visual content that the user provides, " "and assist the user with a variety of tasks using natural language.",
+#     roles=("user", "assistant"),
+#     version="llama_v3",
+#     messages=[],
+#     offset=0,
+#     sep="<|eot_id|>",
+#     sep_style=SeparatorStyle.LLAMA_3,
+#     tokenizer_id="meta-llama/Meta-Llama-3-8B-Instruct",
+#     tokenizer=AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct"),
+#     stop_token_ids=[128009],
+# )
+# Mistral-instruct template: LLAMA_2 rendering with an empty system prompt and empty sep.
+conv_mistral_instruct = Conversation(
+    system="",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="",
+    sep2="</s>",
+)
+# Llama-2 style template with a short system prompt.
+conv_llava_llama_2_simple = Conversation(
+    system="Answer the questions about the visual content that the user provides.",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+# "mmtag" variant: get_prompt wraps the image marker as <Image><image></Image> for this version.
+# NOTE(review): the two adjacent literals join without a space ("...provides.The visual...");
+# left as is because the prompt text is runtime behaviour — confirm before changing.
+conv_llava_llama_2_mmtag = Conversation(
+    system="Answer the questions about the visual content that the user provides." "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2_mmtag",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+# MPT template: role strings already contain "<|im_start|>...\n", turns end with "<|im_end|>".
+conv_mpt = Conversation(
+    system="""<|im_start|>system
+A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# Qwen template: rendered with CHATML, which adds the "\n" after each role itself,
+# so the role strings carry no trailing newline.
+conv_qwen = Conversation(
+    system="""<|im_start|>system
+You are a helpful assistant.""",
+    roles=("<|im_start|>user", "<|im_start|>assistant"),
+    version="qwen",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.CHATML,
+    sep="<|im_end|>",
+)
+# InternLM-2 template: same markup as the qwen template, different version tag.
+conv_internlm_2 = Conversation(
+    system="""<|im_start|>system
+You are a helpful assistant.""",
+    roles=("<|im_start|>user", "<|im_start|>assistant"),
+    version="internlm_2",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.CHATML,
+    sep="<|im_end|>",
+)
+# Gemma template: no system prompt; turns are "<start_of_turn>role\n...<end_of_turn>\n".
+conv_gemma_instruct = Conversation(system="", roles=("<start_of_turn>user\n", "<start_of_turn>model\n"), version="gemma", messages=[], offset=0, sep_style=SeparatorStyle.GEMMA, sep="<end_of_turn>\n")
+# Plain template: no system prompt and no role text.
+# NOTE(review): sep2 is left at its default (None) although the PLAIN branch appends
+# seps[i % 2]; a non-empty message at an odd index would fail — verify against callers.
+conv_llava_plain = Conversation(
+    system="",
+    roles=("", ""),
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.PLAIN,
+    sep="\n",
+)
+# LLaVA v0 template: same system prompt and separator as conv_vicuna_v0, without seed messages.
+conv_llava_v0 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# LLaVA v0 "mmtag" variant.
+# NOTE(review): the last two literals join without a space ("...natural language.The visual...");
+# left as is because the prompt text is runtime behaviour — confirm before changing.
+conv_llava_v0_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+    "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("Human", "Assistant"),
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+    version="v0_mmtag",
+)
+# LLaVA v1 template: TWO style with " " and "</s>" separators.
+conv_llava_v1 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# LLaVA v1 "mmtag" variant; same missing-space caveat as conv_llava_v0_mmtag.
+conv_llava_v1_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+    "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("USER", "ASSISTANT"),
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+    version="v1_mmtag",
+)
+# MistralOrca template: MPT rendering with a step-by-step reasoning system prompt.
+conv_mistral_orca = Conversation(
+    system="""<|im_start|>system
+You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# Zephyr template: MPT rendering with "<|user|>" / "<|assistant|>" role markers and "</s>" separator.
+conv_mistral_zephyr = Conversation(
+    system="""<|system|>
+You are a helpful AI assistant.""",
+    roles=("<|user|>\n", "<|assistant|>\n"),
+    version="mpt",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="</s>",
+)
+# "Direct" template: MPT rendering with a minimal system prompt.
+conv_mistral_direct = Conversation(
+    system="""<|im_start|>system
+Answer the questions.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# Field-for-field identical to conv_mistral_direct; kept as a separate object and registry key.
+conv_chatml_direct = Conversation(
+    system="""<|im_start|>system
+Answer the questions.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# Template used when no explicit name is chosen (also printed by the __main__ block below).
+default_conversation = conv_vicuna_v0
+# Registry mapping template names to Conversation objects. Several names alias the same
+# object (e.g. "qwen_1_5" and "qwen_2" both map to conv_qwen); the values are shared
+# instances, and Conversation.copy() is available for obtaining an independent message list.
+conv_templates = {
+    "default": conv_vicuna_v0,
+    "v0": conv_vicuna_v0,
+    "v1": conv_vicuna_v1,
+    "vicuna_v1": conv_vicuna_v1,
+    "llama_2": conv_llama_2,
+    "mistral_instruct": conv_mistral_instruct,
+    "mistral_orca": conv_mistral_orca,
+    "mistral_zephyr": conv_mistral_zephyr,
+    "mistral_direct": conv_mistral_direct,
+    "plain": conv_llava_plain,
+    "v0_plain": conv_llava_plain,
+    "chatml_direct": conv_chatml_direct,
+    "llava_v0": conv_llava_v0,
+    "llava_v0_mmtag": conv_llava_v0_mmtag,
+    "llava_v1": conv_llava_v1,
+    "llava_v1_mmtag": conv_llava_v1_mmtag,
+    "llava_llama_2": conv_llava_llama_2,
+    # "llava_llama_3": conv_llava_llama_3,
+    "llava_llama_2_simple": conv_llava_llama_2_simple,
+    "llava_llama_2_mmtag": conv_llava_llama_2_mmtag,
+    "llava_mistral_instruct": conv_mistral_instruct,
+    "mpt": conv_mpt,
+    "qwen_1_5": conv_qwen,
+    "qwen_2": conv_qwen,
+    "internlm_2": conv_internlm_2,
+    "gemma_instruct": conv_gemma_instruct,
+}
+# Manual smoke check: print the default template's rendered prompt and its dataclass repr.
+if __name__ == "__main__":
+    print(default_conversation.get_prompt())
+    print(default_conversation)

generation_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "_attn_implementation": "flash_attention_2",
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.05,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.39.2"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

mm_projector_builder.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import torch
+import torch.nn as nn
+from typing import Callable, Tuple
+def bipartite_soft_matching(
+    metric: torch.Tensor,
+    r: int,
+) -> Tuple[Callable, Callable]:
+    """
+    Applies ToMe with a balanced matching set (50%, 50%).
+    Input size is [batch, tokens, channels].
+    r indicates the number of tokens to remove (max 50% of tokens).
+    Tokens at even positions form set A, tokens at odd positions form set B. Each A token is
+    matched to its most similar B token, and the r most similar A tokens are merged into
+    their matches.
+    Args:
+        metric (torch.Tensor): Features used to measure token similarity.
+        r (int): Number of tokens to remove; clamped to half of the token count.
+    Returns:
+        Tuple[Callable, Callable]: `merge(x, mode)` reduces the token count by r and
+        `unmerge(x)` expands a merged tensor back to the original token count.
+    Raises:
+        AssertionError: If the clamped r is not positive.
+    """
+    # No tokens are excluded from merging here.
+    protected = 0
+    t = metric.shape[1]
+    r = min(r, (t - protected) // 2)
+    assert r > 0, r
+    with torch.no_grad():
+        # Unit-normalise so that the dot product below is a cosine similarity.
+        metric = metric / metric.norm(dim=-1, keepdim=True)
+        a, b = metric[..., ::2, :], metric[..., 1::2, :]
+        scores = a @ b.transpose(-1, -2)
+        # For every A token: similarity to, and index of, its best-matching B token.
+        node_max, node_idx = scores.max(dim=-1)
+        # A-token indices ordered from most to least similar to their best match.
+        edge_idx = node_max.argsort(dim=-1, descending=True)[..., None]
+        unm_idx = edge_idx[..., r:, :]  # Unmerged Tokens
+        src_idx = edge_idx[..., :r, :]  # Merged Tokens
+        # B-token destination for each merged A token.
+        dst_idx = node_idx[..., None].gather(dim=-2, index=src_idx)
+    def merge(x: torch.Tensor, mode="mean") -> torch.Tensor:
+        # NOTE: `mode` is accepted but not used; scatter_add always sums the merged tokens.
+        src, dst = x[..., ::2, :], x[..., 1::2, :]
+        n, t1, c = src.shape
+        unm = src.gather(dim=-2, index=unm_idx.expand(n, t1 - r, c))
+        src = src.gather(dim=-2, index=src_idx.expand(n, r, c))
+        dst = dst.scatter_add(-2, dst_idx.expand(n, r, c), src) # , reduce=mode)
+        # Output layout: the unmerged A tokens first, then all B tokens.
+        return torch.cat([unm, dst], dim=1)
+    def unmerge(x: torch.Tensor) -> torch.Tensor:
+        unm_len = unm_idx.shape[1]
+        unm, dst = x[..., :unm_len, :], x[..., unm_len:, :]
+        n, _, c = unm.shape
+        # Each merged A token is restored as a copy of the B token it was merged into.
+        src = dst.gather(dim=-2, index=dst_idx.expand(n, r, c))
+        out = torch.zeros(n, metric.shape[1], c, device=x.device, dtype=x.dtype)
+        # B tokens go back to odd positions; A-set index i maps to even position 2 * i.
+        out[..., 1::2, :] = dst
+        out.scatter_(dim=-2, index=(2 * unm_idx).expand(n, unm_len, c), src=unm)
+        out.scatter_(dim=-2, index=(2 * src_idx).expand(n, r, c), src=src)
+        return out
+    return merge, unmerge
+def merge_wavg(
+    merge: Callable, x: torch.Tensor, size: torch.Tensor = None
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Applies the merge function by taking a weighted average based on token size.
+    Returns the merged tensor and the new token sizes.
+    """
+    if size is None:
+        size = torch.ones_like(x[..., 0, None])
+    x = merge(x * size, mode="sum")
+    size = merge(size, mode="sum")
+    x = x / size
+    return x, size
+class ToMe16_mlp_hd64(nn.Module):
+    """
+    Vision projector that first reduces the number of visual tokens with ToMe-style merging
+    and then maps them through a two-layer MLP (Linear -> GELU -> Linear).
+    """
+    def __init__(self, config, vision_cfg):
+        """
+        Args:
+            config: Provides `mm_hidden_size`, `hidden_size` and `mm_pos_num_frames`.
+            vision_cfg: Provides `image_size`, `patch_size` and `num_attention_heads`.
+        """
+        super().__init__()
+        self._config = config
+        self.mm_hidden_size = config.mm_hidden_size
+        # Number of patches along one side of the input grid.
+        self.hw = vision_cfg.image_size // vision_cfg.patch_size
+        self.num_attention_heads = vision_cfg.num_attention_heads
+        self.mlp = nn.Sequential(nn.Linear(config.mm_hidden_size, config.hidden_size),
+                    nn.GELU(),
+                    nn.Linear(config.hidden_size, config.hidden_size))
+        # The four attributes below are stored but not read anywhere else in this class.
+        self.max_pos_hw = self.hw
+        self.max_pos_num_frames = config.mm_pos_num_frames
+        self.num_image_patches_per_side = 8
+        self.num_frame_patches_per_side = 4
+    def merge_tokens(self, x, target_num_token):
+        r"""
+        Repeatedly merge tokens until exactly `target_num_token` remain.
+        Example:
+            x = torch.randn(10, 2560, c)
+            x = self.merge_tokens(x, target_num_token=1280)
+        Args:
+            x (torch.Tensor): Tokens of shape [batch, tokens, channels].
+            target_num_token (int): Number of tokens to keep; must be smaller than the input count.
+        Returns:
+            torch.Tensor: Merged tokens of shape [batch, target_num_token, channels].
+        Raises:
+            AssertionError: If the input does not have more tokens than `target_num_token`.
+        """
+        size = None
+        b, p, c = x.shape
+        tmp_p = p
+        r_merge_list = []
+        assert tmp_p > target_num_token, f"{tmp_p} should greater than {target_num_token}"
+        # Build the merge schedule: one round removes at most half of the current tokens,
+        # so keep halving until the remainder can be removed in a single round.
+        while tmp_p != target_num_token:
+            if tmp_p - target_num_token <= (tmp_p // 2):
+                r_merge_list.append(tmp_p - target_num_token)
+                break
+            else:
+                r_merge_list.append(tmp_p // 2)
+                tmp_p = tmp_p - (tmp_p // 2)
+        head = self.num_attention_heads
+        dim = c // head
+        for r in r_merge_list:
+            # Similarity metric: channel features averaged across heads.
+            metric = x.reshape(b, p, head, dim).mean(2) # [b, p, c//head]
+            merge, _ = bipartite_soft_matching(
+                metric,
+                r
+            )
+            x, size = merge_wavg(merge, x, size)
+            _, p, _ = x.shape
+        return x
+    def forward(self, x, compress=False, local_num_frames=-1): # single frame -> 64 tokens
+        """
+        Merge the visual tokens and project them with the MLP.
+        Args:
+            x (torch.Tensor): Tokens whose dim 1 must equal hw * hw; presumably
+                [frames, hw * hw, channels] (TODO confirm against the caller).
+            compress (bool): If True, frames are concatenated along the token axis and merged
+                down to 16 tokens per frame; otherwise each entry is merged down to 64 tokens.
+            local_num_frames (int): Frames grouped into one sequence when compressing;
+                -1 groups all of dim 0 into a single sequence.
+        Returns:
+            torch.Tensor: Output of the MLP applied to the merged tokens.
+        Raises:
+            AssertionError: If the token count is not hw * hw, or if `local_num_frames` is
+                neither -1 nor 1 while `compress` is not True.
+        """
+        height = width = self.hw
+        assert height * width == x.shape[1]
+        if local_num_frames != -1 and local_num_frames != 1:
+            assert compress is True
+        if compress:
+            if local_num_frames != -1:
+                num_frames = local_num_frames
+                x = x.reshape(x.shape[0] // local_num_frames, -1, x.shape[-1])
+            else:
+                num_frames = x.shape[0]
+                x = x.reshape(1, -1, x.shape[-1])
+            num_tome_tokens = 16 * num_frames
+        else:
+            num_tome_tokens = 64
+        x = self.merge_tokens(x, target_num_token=num_tome_tokens)
+        x = self.mlp(x)
+        return x
+    @property
+    def config(self):
+        # Returns a fixed dict naming this projector type (not the config object passed to __init__).
+        return {"mm_projector_type": "tome16_mlp_hd64"}
+def build_vision_projector(config, delay_load=False, **kwargs):
+    projector_type = getattr(config, "mm_projector_type", "linear")
+    if projector_type == 'tome16_mlp_hd64':
+        return ToMe16_mlp_hd64(config, kwargs["vision_cfg"])
+    raise ValueError(f"Unknown projector type: {projector_type}")

mm_utils.py ADDED Viewed

	@@ -0,0 +1,855 @@

+from PIL import Image
+from io import BytesIO
+import base64
+import math
+import ast
+import re
+import torch
+from transformers import StoppingCriteria
+from .constants import IMAGE_TOKEN_INDEX
+import random
+import os
+import io
+import av
+import cv2
+import imageio
+from decord import VideoReader
+import numpy as np
+from torchvision.transforms.functional import pil_to_tensor
+######################## load video ########################
+def get_index(num_frames, num_segments):
+    seg_size = float(num_frames - 1) / num_segments
+    start = int(seg_size / 2)
+    offsets = np.array([
+        start + int(np.round(seg_size * idx)) for idx in range(num_segments)
+    ])
+    return offsets
+def pts_to_secs(pts: int, time_base: float, start_pts: int) -> float:
+    """
+    Converts a present time with the given time base and start_pts offset to seconds.
+    Returns:
+        time_in_seconds (float): The corresponding time in seconds.
+    https://github.com/facebookresearch/pytorchvideo/blob/main/pytorchvideo/data/utils.py#L54-L64
+    """
+    if pts == math.inf:
+        return math.inf
+    return int(pts - start_pts) * time_base
+def get_pyav_video_duration(video_reader):
+    video_stream = video_reader.streams.video[0]
+    video_duration = pts_to_secs(
+        video_stream.duration,
+        video_stream.time_base,
+        video_stream.start_time
+    )
+    return float(video_duration)
+def get_frame_indices(num_frames, vlen, sample='middle', fix_start=None, input_fps=1, min_num_frames=1, max_num_frames=-1, local_num_frames=8):
+    if min_num_frames > vlen:
+        if sample == 'dynamic_fps1':
+            min_num_frames = (vlen // local_num_frames) * local_num_frames
+        else:
+            min_num_frames = vlen
+    if sample == 'dynamic_fps1':
+        duration = float(vlen) / input_fps
+        num_segments = int(duration // local_num_frames)
+        if num_segments == 0:
+            num_frames = local_num_frames
+        else:
+            num_frames = local_num_frames * num_segments
+        if max_num_frames > 0:
+            num_frames = min(num_frames, max_num_frames)
+        sample = "middle" # NOTE
+        # logger.info(f"? is OK (img), duation={duration} frames={num_frames}!!!!")
+    num_frames = max(min_num_frames, num_frames)
+    # print(f"\033[0;31m vlen={vlen}, input_fps={input_fps} num_frames={num_frames} \033[0m")
+    if sample in ["rand", "middle"]: # uniform sampling
+        acc_samples = min(num_frames, vlen)
+        # split the video into `acc_samples` intervals, and sample from each interval.
+        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
+        ranges = []
+        for idx, interv in enumerate(intervals[:-1]):
+            ranges.append((interv, intervals[idx + 1] - 1))
+        if sample == 'rand':
+            try:
+                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
+            except:
+                frame_indices = np.random.permutation(vlen)[:acc_samples]
+                frame_indices.sort()
+                frame_indices = list(frame_indices)
+        elif fix_start is not None:
+            frame_indices = [x[0] + fix_start for x in ranges]
+        elif sample == 'middle':
+            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
+        else:
+            raise NotImplementedError
+        if len(frame_indices) < num_frames:  # padded with last frame
+            padded_frame_indices = [frame_indices[-1]] * num_frames
+            padded_frame_indices[:len(frame_indices)] = frame_indices
+            frame_indices = padded_frame_indices
+    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
+        output_fps = float(sample[3:])
+        duration = float(vlen) / input_fps
+        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
+        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
+        frame_indices = np.around(frame_seconds * input_fps).astype(int)
+        frame_indices = [e for e in frame_indices if e < vlen]
+        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
+            frame_indices = frame_indices[:max_num_frames]
+            # frame_indices = np.linspace(0 + delta / 2, duration + delta / 2, endpoint=False, num=max_num_frames)
+    else:
+        raise ValueError(f"Not support sample type: {sample}")
+    return frame_indices
+def read_frames_av(video_path, num_frames, sample='rand', client=None, fix_start=None, min_num_frames=1, max_num_frames=-1, clip=None, local_num_frames=8):
+    if clip is not None:
+        raise NotImplementedError("av don't support clip!!!")
+    if 's3://' in video_path:
+        video_bytes = client.get(video_path)
+        byteio = io.BytesIO(video_bytes)
+        byteio.seek(0)
+        reader = av.open(byteio)
+    else:
+        byteio = None
+        reader = av.open(video_path)
+    frames = [f.to_rgb().to_ndarray() for f in reader.decode(video=0)]
+    vlen = len(frames)
+    duration = get_pyav_video_duration(reader)
+    fps = vlen / float(duration)
+    frame_indices = get_frame_indices(
+        num_frames, vlen, sample=sample, fix_start=fix_start,
+        input_fps=fps, min_num_frames=min_num_frames, max_num_frames=max_num_frames, local_num_frames=local_num_frames
+    )
+    frames = np.stack([frames[idx] for idx in frame_indices])  # (T, H, W, C), torch.uint8
+    # frames = frames.permute(0, 3, 1, 2)  # (T, C, H, W), torch.uint8
+    if byteio != None:
+        byteio.close()
+    reader.close()
+    return frames, frame_indices, float(fps), duration
+def read_frames_gif(
+        video_path, num_frames, sample='rand', fix_start=None,
+        min_num_frames=1, max_num_frames=-1, client=None, clip=None, local_num_frames=8
+    ):
+    if clip is not None:
+        raise NotImplementedError("Gif don't support clip!!!")
+    if 's3://' in video_path:
+        video_bytes = client.get(video_path)
+        byteio = io.BytesIO(video_bytes)
+        gif = imageio.get_reader(byteio)
+    else:
+        byteio = None
+        gif = imageio.get_reader(video_path)
+    vlen = len(gif)
+    fps = 1.
+    duration = vlen / fps
+    frame_indices = get_frame_indices(
+        num_frames, vlen, sample=sample, fix_start=fix_start,
+        min_num_frames=min_num_frames,
+        max_num_frames=max_num_frames, local_num_frames=local_num_frames,
+        input_fps=fps
+    )
+    frames = []
+    min_h = min_w = 100000
+    hw_set = set()
+    for index, frame in enumerate(gif):
+        # for index in frame_idxs:
+        if index in frame_indices:
+            frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
+            frame = frame.astype(np.uint8)
+            # # (H x W x C) to (C x H x W)
+            # frame = frame.permute(2, 0, 1)
+            frames.append(frame)
+            hw_set.add(frame.shape)
+            if frame.shape[0] < min_h:
+                min_h = frame.shape[0]
+            if frame.shape[1] < min_w:
+                min_w = frame.shape[1]
+    # print(hw_set, min_h, min_w)
+    if len(hw_set) > 1:
+        frames = [i[:min_h, :min_w] for i in frames]
+    frames = np.stack(frames)  # .float() / 255
+    if byteio != None:
+        byteio.close()
+    return frames, frame_indices, float(fps), duration # for tgif
+def read_frames_decord(
+        video_path, num_frames, sample='rand', fix_start=None, min_num_frames=1,
+        max_num_frames=-1, client=None, clip=None, local_num_frames=8
+    ):
+    if video_path.endswith('.avi'):
+        return read_frames_av(video_path=video_path, num_frames=num_frames, sample=sample,
+                    fix_start=fix_start, min_num_frames=min_num_frames, max_num_frames=max_num_frames,
+                    client=client, clip=clip, local_num_frames=local_num_frames)
+    if 's3://' in video_path:
+        video_bytes = client.get(video_path)
+        if video_bytes is None or len(video_bytes) == 0:
+            raise ValueError(f"Can't read byte from {video_path}!")
+        byteio = io.BytesIO(video_bytes)
+        video_reader = VideoReader(byteio, num_threads=1)
+    else:
+        byteio = None
+        video_reader = VideoReader(video_path, num_threads=1)
+    vlen = len(video_reader)
+    fps = video_reader.get_avg_fps()
+    duration = vlen / float(fps)
+    if clip:
+        start, end = clip
+        start = max(0, start)
+        end = min(duration - 0.1, end)
+        duration = end - start
+        vlen = int(duration * fps)
+        start_index = int(start * fps)
+    frame_indices = get_frame_indices(
+        num_frames, vlen, sample=sample, fix_start=fix_start,
+        input_fps=fps, min_num_frames=min_num_frames, max_num_frames=max_num_frames, local_num_frames=local_num_frames
+    )
+    if clip:
+        frame_indices = [f + start_index for f in frame_indices]
+    # print(fps, frame_indices)
+    frames = video_reader.get_batch(frame_indices).asnumpy()  # (T, H, W, C), torch.uint8
+    # https://github.com/dmlc/decord/issues/208
+    video_reader.seek(0)
+    if byteio != None:
+        byteio.close()
+    # frames = frames.permute(0, 3, 1, 2)  # (T, C, H, W), torch.uint8
+    return frames, frame_indices, float(fps), duration
+def read_frames_img(
+        video_path, num_frames, sample='rand', fix_start=None, min_num_frames=1,
+        max_num_frames=-1, client=None, clip=None, local_num_frames=8
+    ):
+    def extract_frame_number(filename):
+        # Extract the numeric part from the filename using regular expressions
+        if filename.endswith('.jpg'):
+            match = re.search(r'_(\d+).jpg$', filename)
+        elif filename.endswith('.jpeg'):
+            match = re.search(r'_(\d+).jpeg$', filename)
+        elif filename.endswith('.png'):
+            match = re.search(r'_(\d+).png$', filename)
+        else:
+            raise NotImplementedError(f"Wrong filename: {filename}")
+        return int(match.group(1)) if match else -1
+    def sort_frames(frame_paths):
+        # Extract filenames from each path and sort by their numeric part
+        return sorted(frame_paths, key=lambda x: extract_frame_number(os.path.basename(x)))
+    # img_list=[]
+    if "s3://" in video_path:
+        img_list = sort_frames(client.list(video_path))
+    else:
+        img_list = sort_frames(list(os.listdir(video_path)))
+    if 'tvqa' in video_path.lower():
+        fps = 3.0
+    else:
+        fps = 1.0
+    if clip is not None:
+        start = float(clip[0])
+        end = float(clip[1])
+        start = max(0, start)
+        end = min(len(img_list) / fps, end)
+        vlen = (end - start) * fps
+    else:
+        vlen = len(img_list)
+    duration = vlen / fps
+    if min_num_frames > vlen:
+        if sample == 'dynamic_fps1':
+            min_num_frames = (vlen // local_num_frames) * local_num_frames
+        else:
+            min_num_frames = vlen
+    if sample == 'dynamic_fps1':
+        num_segments = int(duration // local_num_frames)
+        if num_segments == 0:
+            num_frames = local_num_frames
+        else:
+            num_frames = local_num_frames * num_segments
+        num_frames = min(num_frames, max_num_frames)
+        num_frames = max(min_num_frames, num_frames)
+    num_frames = int(num_frames)
+    if clip is not None:
+        def _get_index_by_time(start_sec, end_sec, num_segments=8, fps=1., max_frame=9999):
+            start_idx = max(1, round(start_sec * fps))
+            end_idx = min(round(end_sec * fps), max_frame)
+            seg_size = float(end_idx - start_idx) / (num_segments - 1)
+            offsets = np.array([start_idx + int(np.round(seg_size * idx)) for idx in range(num_segments)])
+            return offsets
+        frame_indices = _get_index_by_time(float(clip[0]), float(clip[1]), num_segments=num_frames, fps=fps, max_frame=len(img_list)-1)
+    else:
+        frame_indices = get_frame_indices(
+            num_frames, vlen, sample=sample, fix_start=fix_start,
+            min_num_frames=min_num_frames,
+            max_num_frames=max_num_frames, local_num_frames=local_num_frames
+        )
+    imgs = []
+    for idx in frame_indices:
+        frame_fname = os.path.join(video_path, img_list[idx])
+        if "s3://" in video_path:
+            img_bytes = client.get(frame_fname)
+        else:
+            with open(frame_fname, 'rb') as f:
+                img_bytes = f.read()
+        img_np = np.frombuffer(img_bytes, np.uint8)
+        img = cv2.imdecode(img_np, cv2.IMREAD_COLOR)
+        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
+        imgs.append(img)
+    frames = np.array(imgs, dtype=np.uint8)
+    return frames, frame_indices, fps, duration
+# Dispatch table from a `video_read_type` name to the reader function that handles it.
+# 'img' and 'frame' are aliases for the same image-directory reader.
+VIDEO_READER_FUNCS = {
+    'av': read_frames_av,
+    'decord': read_frames_decord,
+    'gif': read_frames_gif,
+    'img': read_frames_img,
+    'frame': read_frames_img
+}
+def load_video(video_path, max_num_frames=512, media_dict=None): #, media_dict):
+    if media_dict is None:
+        media_dict = {'video_read_type': 'decord'}
+    if type(video_path) != str:
+        assert len(video_path) == 1, video_path
+        video_path = video_path[0]
+    if 'start' in media_dict:
+        clip = [media_dict['start'], media_dict['end']]
+    else:
+        clip = None
+    if 's3://' in video_path:
+        from petrel_client.client import Client
+        client = Client(conf_path='~/petreloss.conf')
+    else:
+        client = None
+    frames, frame_indices, fps, duration = VIDEO_READER_FUNCS[media_dict['video_read_type']](video_path=video_path, num_frames=max_num_frames, sample='dynamic_fps1', fix_start=None, min_num_frames=64, max_num_frames=max_num_frames, client=client, clip=clip, local_num_frames=8)
+    sec = [str(round(f / fps, 1)) for f in frame_indices]
+    msg = f"\nThe video lasts for {duration:.2f} seconds, and {len(sec)} frames are uniformly sampled from it. "
+    return frames, msg
+######################## load video ########################
+def resize_and_center_crop(image, shortest_edge_length):
+    # Calculate new dimensions and resize
+    aspect_ratio = float(image.width) / float(image.height)
+    if aspect_ratio > 1:
+        new_width = int(shortest_edge_length * aspect_ratio)
+        new_height = shortest_edge_length
+    else:
+        new_width = shortest_edge_length
+        new_height = int(shortest_edge_length / aspect_ratio)
+    resized_image = image.resize((new_width, new_height), Image.ANTIALIAS)
+    # Calculate the position and perform the center crop
+    left = (new_width - shortest_edge_length) / 2
+    top = (new_height - shortest_edge_length) / 2
+    right = (new_width + shortest_edge_length) / 2
+    bottom = (new_height + shortest_edge_length) / 2
+    cropped_image = resized_image.crop((left, top, right, bottom))
+    return cropped_image
+def auto_pad_images(image, grid_params):
+    assert isinstance(image, Image.Image), "Input should be a Pillow Image"
+    assert len(grid_params) > 0, "Grid parameters should not be empty"
+    # Step 1: Calculate and find the closest aspect ratio
+    input_width, input_height = image.size
+    input_aspect_ratio = input_width / input_height
+    candidate_resolutions = [(w / h, w, h) for w in grid_params for h in grid_params]
+    closest_aspect_ratio = min(candidate_resolutions, key=lambda x: abs(input_aspect_ratio - x[0]))
+    candidate_resolutions = [(x[1], x[2]) for x in candidate_resolutions if abs(x[0] - closest_aspect_ratio[0]) < 1e-3]
+    target_resolution = min(candidate_resolutions, key=lambda res: abs(max(input_width, input_height) / max(res) - 1))
+    resize_width, resize_height = target_resolution
+    if input_width > input_height:
+        resize_height = int(resize_width / input_aspect_ratio)
+    else:
+        resize_width = int(resize_height * input_aspect_ratio)
+    resized_image = image.resize((resize_width, resize_height), Image.ANTIALIAS)
+    # Step 5: Pad the resized image if necessary to match the target resolution
+    pad_width = target_resolution[0] - resize_width
+    pad_height = target_resolution[1] - resize_height
+    padded_image = Image.new("RGB", target_resolution, color=(0, 0, 0))
+    padded_image.paste(resized_image, (pad_width // 2, pad_height // 2))
+    return padded_image
+def extract_patches(image, patch_size, overlap_ratio):
+    assert isinstance(image, Image.Image), "Input should be a Pillow Image"
+    assert patch_size > 0, "Patch size should be greater than 0"
+    assert 0 <= overlap_ratio < 1, "Overlap ratio should be between 0 and 1"
+    W, H = image.size
+    patches = []
+    stride = int(patch_size * (1 - overlap_ratio))
+    num_patches_y = (H - patch_size) // stride + 1
+    num_patches_x = (W - patch_size) // stride + 1
+    y_start = (H - (num_patches_y - 1) * stride - patch_size) // 2
+    x_start = (W - (num_patches_x - 1) * stride - patch_size) // 2
+    for y in range(y_start, y_start + num_patches_y * stride, stride):
+        for x in range(x_start, x_start + num_patches_x * stride, stride):
+            patch = image.crop((x, y, x + patch_size, y + patch_size))
+            patches.append(patch)
+    return patches
+def process_highres_image_crop_split(image, data_args, processor=None):
+    crop_resolution = data_args.image_crop_resolution
+    split_resolution = data_args.image_split_resolution
+    if processor is None:
+        processor = data_args.image_processor
+    image_crop = resize_and_center_crop(image, crop_resolution)
+    image_patches = extract_patches(image_crop, patch_size=split_resolution, overlap_ratio=0)
+    image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
+    return torch.stack(image_patches, dim=0)
+def process_highres_image(image, processor, grid_pinpoints):
+    """
+    Build a stack of preprocessed views of an image: one resized global view followed by
+    the patches of a squared, enlarged copy.
+    Args:
+        image (PIL.Image.Image): The input image.
+        processor: Image processor providing `image_mean`, `size["shortest_edge"]` and `preprocess`.
+        grid_pinpoints (str): Comma-separated integer side lengths.
+    Returns:
+        torch.Tensor: The preprocessed views stacked along dim 0, global view first.
+    """
+    grid_params = [int(x) for x in grid_pinpoints.split(",")]
+    width_height = max(image.size)
+    fit_grid_params = [x for x in grid_params if x >= width_height]
+    # NOTE: the selection below is overwritten unconditionally a few lines further down.
+    if len(fit_grid_params) == 0:
+        select_size = max(grid_params)
+    else:
+        select_size = min(fit_grid_params)
+    # FIXME: always select the 448
+    select_size = max(grid_params)
+    # expand2square is defined outside this section; presumably it pads to a square using
+    # the given fill colour (here the processor mean scaled to 0-255) -- TODO confirm.
+    image_padded = expand2square(image, tuple(int(x * 255) for x in processor.image_mean))
+    # FIXME: this seems to be a bug that it always resizes instead of padding
+    image_original_resize = image.resize((processor.size["shortest_edge"], processor.size["shortest_edge"]))
+    image_padded = image_padded.resize((select_size, select_size))
+    image_patches = extract_patches(image_padded, patch_size=processor.size["shortest_edge"], overlap_ratio=0)
+    # The global view goes first, followed by the local patches.
+    image_patches = [image_original_resize] + image_patches
+    image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
+    return torch.stack(image_patches, dim=0)
+def select_best_resolution(original_size, possible_resolutions, max_resolutions, patch_size):
+    """
+    Selects the best resolution from a list of possible resolutions based on the original size.
+    Args:
+        original_size (tuple): The original size of the image in the format (width, height).
+        possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+    Returns:
+        tuple: The best fit resolution in the format (width, height).
+    """
+    original_width, original_height = original_size
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+    for width, height in possible_resolutions:
+        if max_resolutions != None and (width * height != patch_size * patch_size):
+            if (width * height+patch_size*patch_size) > max_resolutions: # NOTE 要算一个global
+                continue
+        # Calculate the downscaled size to keep the aspect ratio
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+        # Calculate effective and wasted resolutions
+        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+        wasted_resolution = (width * height) - effective_resolution
+        if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            best_fit = (width, height)
+    # print(f"original_size={original_size}, possible_resolutions={possible_resolutions}, max_resolutions={max_resolutions}, best_fit={best_fit}")
+    assert best_fit is not None, f"Can't find suitable fit in {possible_resolutions} at max:{max_resolutions}"
+    return best_fit
+def resize_and_pad_image(image, target_resolution):
+    """
+    Resize and pad an image to a target resolution while maintaining aspect ratio.
+    Args:
+        image (PIL.Image.Image): The input image.
+        target_resolution (tuple): The target resolution (width, height) of the image.
+    Returns:
+        PIL.Image.Image: The resized and padded image.
+    """
+    original_width, original_height = image.size
+    target_width, target_height = target_resolution
+    # Determine which dimension (width or height) to fill
+    scale_w = target_width / original_width
+    scale_h = target_height / original_height
+    if scale_w < scale_h:
+        # Width will be filled completely
+        new_width = target_width
+        new_height = min(math.ceil(original_height * scale_w), target_height)
+    else:
+        # Height will be filled completely
+        new_height = target_height
+        new_width = min(math.ceil(original_width * scale_h), target_width)
+    # Resize the image
+    resized_image = image.resize((new_width, new_height))
+    # Create a new image with the target size and paste the resized image onto it
+    new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
+    paste_x = (target_width - new_width) // 2
+    paste_y = (target_height - new_height) // 2
+    new_image.paste(resized_image, (paste_x, paste_y))
+    return new_image
+def divide_to_patches(image, patch_size):
+    """
+    Divides an image into patches of a specified size.
+    Args:
+        image (PIL.Image.Image): The input image.
+        patch_size (int): The size of each patch.
+    Returns:
+        list: A list of PIL.Image.Image objects representing the patches.
+    """
+    patches = []
+    width, height = image.size
+    for i in range(0, height, patch_size):
+        for j in range(0, width, patch_size):
+            box = (j, i, j + patch_size, i + patch_size)
+            patch = image.crop(box)
+            patches.append(patch)
+    return patches
+def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size, max_resolutions=None):
+    """
+    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+    Args:
+        image_size (tuple): The size of the input image in the format (width, height).
+        grid_pinpoints (str): A string representation of a list of possible resolutions.
+        patch_size (int): The size of each image patch.
+    Returns:
+        tuple: The shape of the image patch grid in the format (width, height).
+    """
+    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
+        # Use regex to extract the range from the input string
+        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+        range_start = tuple(map(int, matches[0]))
+        range_end = tuple(map(int, matches[-1]))
+        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
+        grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
+        # Multiply all elements by patch_size
+        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
+    if type(grid_pinpoints) is list:
+        possible_resolutions = grid_pinpoints
+    else:
+        possible_resolutions = ast.literal_eval(grid_pinpoints)
+    width, height = select_best_resolution(image_size, possible_resolutions, max_resolutions=max_resolutions, patch_size=patch_size)
+    # print("get width/patch size", width, patch_size, flush=True)
+    return width // patch_size, height // patch_size
+def process_anyres_image(image, processor, grid_pinpoints):
+    """
+    Process an image with variable resolutions.
+    NOTE: this function is currently disabled. It raises NotImplementedError unconditionally,
+    so everything after the raise is unreachable.
+    Args:
+        image (PIL.Image.Image): The input image to be processed.
+        processor: The image processor object.
+        grid_pinpoints (str): A string representation of a list of possible resolutions.
+    Returns:
+        torch.Tensor: A tensor containing the processed image patches.
+    Raises:
+        NotImplementedError: Always.
+    """
+    raise NotImplementedError
+    # --- Unreachable from here on; kept for reference. ---
+    # Convert grid_pinpoints from string to list
+    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+        try:
+            patch_size = processor.size[0]
+        except Exception as e:
+            patch_size = processor.size["shortest_edge"]
+        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
+        # Use regex to extract the range from the input string
+        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+        range_start = tuple(map(int, matches[0]))
+        range_end = tuple(map(int, matches[-1]))
+        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
+        grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
+        # Multiply all elements by patch_size
+        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
+    if type(grid_pinpoints) is list:
+        possible_resolutions = grid_pinpoints
+    else:
+        possible_resolutions = ast.literal_eval(grid_pinpoints)
+    # NOTE(review): select_best_resolution in this file takes four positional parameters;
+    # this two-argument call would fail if the function were re-enabled.
+    best_resolution = select_best_resolution(image.size, possible_resolutions)
+    image_padded = resize_and_pad_image(image, best_resolution)
+    patches = divide_to_patches(image_padded, processor.crop_size["height"])
+    # FIXME: this seems to be a bug that it resizes instead of pad.
+    # but to keep it consistent with previous, i will keep it as it is
+    # TODO: uncomment below to ablate with the padding
+    if isinstance(processor.size, dict):
+        shortest_edge = processor.size["shortest_edge"]
+    else:
+        shortest_edge = min(processor.size)
+    image_original_resize = image.resize((shortest_edge, shortest_edge))
+    # image_padded_square = expand2square(image, tuple(int(x*255) for x in processor.image_mean))
+    # image_original_resize = image_padded_square.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))
+    image_patches = [image_original_resize] + patches
+    image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
+    # print("image.size", image.size, "len(image_patches):",  len(image_patches), "patch_size:", image_patches[0].shape)
+    return torch.stack(image_patches, dim=0)
def process_anyres_image_nopad(image, processor, grid_pinpoints):
    """
    Process an image at a variable resolution by resizing (no padding).

    The image is resized to the resolution chosen by ``select_best_resolution``,
    cut up by ``divide_to_patches``, and every resulting piece (preceded by a
    square low-resolution copy of the whole image) is run through
    ``processor.preprocess``.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object; must expose ``size`` and ``preprocess``.
        grid_pinpoints (str | list): Candidate resolutions, given as a list, a
            string literal of a list, or a range string such as "(1x1),...,(3x3)".
    Returns:
        torch.Tensor: The preprocessed overview image followed by the patches,
        stacked along dimension 0.
    """
    # `processor.size` may be index-able by position or be a dict keyed by "shortest_edge".
    try:
        tile_edge = processor.size[0]
    except Exception:
        tile_edge = processor.size["shortest_edge"]
    assert tile_edge in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"

    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
        # Range form: the first and last "(AxB)" entries bound an inclusive grid range.
        found = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
        low_a, low_b = map(int, found[0])
        high_a, high_b = map(int, found[-1])
        # Enumerate every (a, b) in the range and scale it to pixels.
        grid_pinpoints = [
            [a * tile_edge, b * tile_edge]
            for a in range(low_a, high_a + 1)
            for b in range(low_b, high_b + 1)
        ]

    # Anything that is not already a list is expected to be a Python literal string.
    candidates = grid_pinpoints if type(grid_pinpoints) is list else ast.literal_eval(grid_pinpoints)

    # No resolution cap is applied to images at the moment (max_resolutions=None).
    target_resolution = select_best_resolution(image.size, candidates, max_resolutions=None, patch_size=tile_edge)
    tiles = divide_to_patches(image.resize(target_resolution), tile_edge)

    # NOTE: the overview is a plain square resize rather than a padded one; this
    # is kept on purpose for consistency with earlier behaviour.
    overview_edge = processor.size["shortest_edge"] if isinstance(processor.size, dict) else min(processor.size)
    overview = image.resize((overview_edge, overview_edge))

    tensors = []
    for view in [overview] + tiles:
        tensors.append(processor.preprocess(view, return_tensors="pt")["pixel_values"][0])
    return torch.stack(tensors, dim=0)
def load_image_from_base64(image):
    """Decode a base64-encoded image payload and open it with ``Image.open``."""
    raw_bytes = base64.b64decode(image)
    buffer = BytesIO(raw_bytes)
    return Image.open(buffer)
def expand2square(pil_img, background_color):
    """
    Pad an image to a square canvas, centring it along the shorter axis.

    Args:
        pil_img (PIL.Image.Image): The image to pad.
        background_color: Fill colour passed to ``Image.new`` for the canvas.
    Returns:
        The input object itself when it is already square, otherwise a new
        square image whose side equals the longer edge of the input.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    # One of the two offsets is always zero: only the shorter axis gets margins.
    canvas.paste(pil_img, ((side - width) // 2, (side - height) // 2))
    return canvas
def process_images(images, image_processor, model_cfg):
    """
    Preprocess images according to ``model_cfg.image_aspect_ratio``.

    Modes:
        * missing / ``None`` / unrecognised: the whole list is handed to
          ``image_processor.preprocess`` and its "pixel_values" are returned.
        * contains "anyres": each image goes through
          ``process_anyres_image_nopad`` when the mode also contains "nopad",
          otherwise through ``process_anyres_image``.
        * "pad": each image is squared with ``expand2square`` (background taken
          from ``image_processor.image_mean``) and then preprocessed.
        * "highres" / "crop_split": not supported.

    Args:
        images: Sequence of input images.
        image_processor: Processor exposing ``preprocess`` (and ``image_mean`` for "pad").
        model_cfg: Config object; ``image_aspect_ratio`` and, for anyres modes,
            ``image_grid_pinpoints`` are read from it.
    Returns:
        The processor's "pixel_values" for the default mode; otherwise a stacked
        tensor when all per-image results share one shape, else a list of
        per-image tensors (an empty list for empty input).
    Raises:
        NotImplementedError: For the "highres" and "crop_split" modes.
    """
    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)

    # A config without the attribute yields None; the substring test below would
    # raise TypeError on it, so route it straight to the default path.
    if image_aspect_ratio is None:
        return image_processor.preprocess(images, return_tensors="pt")["pixel_values"]

    if image_aspect_ratio == "highres" or image_aspect_ratio == "crop_split":
        raise NotImplementedError(f"image_aspect_ratio={image_aspect_ratio!r} is not supported")

    new_images = []
    if "anyres" in image_aspect_ratio:
        for image in images:
            if "nopad" in image_aspect_ratio:
                image = process_anyres_image_nopad(image, image_processor, model_cfg.image_grid_pinpoints)
            else:
                image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
            new_images.append(image)
    elif image_aspect_ratio == "pad":
        for image in images:
            background = tuple(int(x * 255) for x in image_processor.image_mean)
            image = expand2square(image, background)
            new_images.append(image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0])
    else:
        return image_processor.preprocess(images, return_tensors="pt")["pixel_values"]

    # Stack only when there is something to stack and every result has the same shape.
    if new_images and all(x.shape == new_images[0].shape for x in new_images):
        new_images = torch.stack(new_images, dim=0)
    return new_images
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    """
    Tokenize a prompt, replacing every "<image>" marker with ``image_token_index``.

    The prompt is split on "<image>", each piece is tokenized separately, and
    the pieces are re-joined with a single ``image_token_index`` between them.
    If the first piece starts with the tokenizer's BOS id, that BOS is kept once
    at the front and the first token of every piece is dropped.

    Args:
        prompt (str): Prompt text, possibly containing "<image>" markers.
        tokenizer: Callable returning an object with ``input_ids``; must expose ``bos_token_id``.
        image_token_index (int): Id inserted in place of each "<image>" marker.
        return_tensors (str | None): ``None`` for a Python list, "pt" for a ``torch.long`` tensor.
    Returns:
        list[int] | torch.Tensor: The assembled token ids.
    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor "pt".
    """
    chunks = [tokenizer(piece).input_ids for piece in prompt.split("<image>")]

    token_ids = []
    skip = 0
    if chunks and len(chunks[0]) > 0 and chunks[0][0] == tokenizer.bos_token_id:
        # Keep one BOS at the very start; the leading token of each chunk is skipped below.
        skip = 1
        token_ids.append(chunks[0][0])

    for position, chunk in enumerate(chunks):
        if position > 0:
            # Exactly one image placeholder between consecutive text chunks.
            token_ids.append(image_token_index)
        token_ids.extend(chunk[skip:])

    if return_tensors is None:
        return token_ids
    if return_tensors == "pt":
        return torch.tensor(token_ids, dtype=torch.long)
    raise ValueError(f"Unsupported tensor type: {return_tensors}")
def get_model_name_from_path(model_path):
    """
    Derive a model name from a filesystem-style path.

    Leading and trailing "/" are stripped and the path is split on "/". When the
    last component starts with "checkpoint-" and a parent component exists, the
    result is "<parent>_<last>"; otherwise it is the last component alone.

    Args:
        model_path (str): Path to a model directory or checkpoint directory.
    Returns:
        str: The derived model name.
    """
    parts = model_path.strip("/").split("/")
    # A bare "checkpoint-N" has no parent directory to prefix; indexing parts[-2]
    # would raise IndexError, so fall back to the last component alone.
    if len(parts) > 1 and parts[-1].startswith("checkpoint-"):
        return parts[-2] + "_" + parts[-1]
    return parts[-1]
class KeywordsStoppingCriteria(StoppingCriteria):
    """
    Stopping criterion that fires when any keyword shows up at the end of the output.

    A keyword is detected either by an exact token-id match against the tail of
    the sequence, or by finding its text in the decoded last few tokens.
    Only batch size 1 is supported.
    """

    def __init__(self, keywords, tokenizer, input_ids):
        """
        Args:
            keywords: Iterable of stop strings.
            tokenizer: Tokenizer used to encode the keywords and decode outputs.
            input_ids: Prompt ids; ``input_ids.shape[1]`` is recorded as the prompt length.
        """
        self.keywords = keywords
        self.keyword_ids = []
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            # Drop a leading BOS so the ids can be matched mid-sequence.
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when generation should stop for the single sequence in ``output_ids``."""
        assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
        # Decode at most the last 3 generated tokens for the text-level check.
        # NOTE(review): if no token has been generated yet, offset is 0 and the
        # slice `-0:` below covers the whole row, prompt included — confirm intended.
        offset = min(output_ids.shape[1] - self.start_len, 3)
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        for keyword_id in self.keyword_ids:
            # torch.equal yields a single bool and is False on a length mismatch;
            # `if a == b` on multi-element tensors would raise instead.
            if torch.equal(output_ids[0, -keyword_id.shape[0] :], keyword_id):
                return True
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:537d4d043801ae35a670de1e06426d44c4f5f76309421e29d044076e613e5227
+size 4976862092

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12d78ef55e9ed386a42a333a874072a6539ad0cc0c40bdd074fb9a6794e2fd4b
+size 4907184040

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b478af52e54e5afeeac088d1577d38380fcde890215d188f5b048c2d0a1fd588
+size 4995708404

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c8e978c21fa37f46e61ce180028db5b3ad3cb7baba61a24d18cc4a7a54bb330
+size 1274188592

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,905 @@

+{
+  "metadata": {
+    "total_size": 16153832560
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_b_proj_nope.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_b_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_nope_scale_factor": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj_pe.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj_pe.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.kv_a_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.softmax_temperature": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_b_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_b_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_b_proj_nope.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_b_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_nope_scale_factor": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj_pe.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj_pe.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.kv_a_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.softmax_temperature": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_b_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_b_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_b_proj_nope.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_b_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_nope_scale_factor": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj_pe.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj_pe.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.kv_a_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.softmax_temperature": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_b_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_b_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_b_proj_nope.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_b_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_nope_scale_factor": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj_pe.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj_pe.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.kv_a_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.softmax_temperature": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_b_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_b_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_b_proj_nope.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_b_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_nope_scale_factor": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj_pe.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj_pe.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.kv_a_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.softmax_temperature": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_b_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_b_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_b_proj_nope.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_b_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_nope_scale_factor": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj_pe.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj_pe.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.kv_a_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.softmax_temperature": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_b_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_b_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_b_proj_nope.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_b_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_nope_scale_factor": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj_pe.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj_pe.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.kv_a_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.softmax_temperature": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_b_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_b_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_b_proj_nope.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_b_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_nope_scale_factor": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj_pe.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj_pe.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.kv_a_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.softmax_temperature": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_b_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_b_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_b_proj_nope.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_b_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_nope_scale_factor": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj_pe.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj_pe.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.kv_a_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.softmax_temperature": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_b_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_b_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_b_proj_nope.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_b_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_nope_scale_factor": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj_pe.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj_pe.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.kv_a_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.softmax_temperature": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_b_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_b_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_b_proj_nope.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_b_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_nope_scale_factor": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj_pe.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj_pe.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.kv_a_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.softmax_temperature": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_b_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_b_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_b_proj_nope.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_b_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_nope_scale_factor": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj_pe.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj_pe.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.kv_a_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.softmax_temperature": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_b_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_b_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_b_proj_nope.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_b_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_nope_scale_factor": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj_pe.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj_pe.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.kv_a_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.softmax_temperature": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_b_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_b_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_b_proj_nope.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_b_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_nope_scale_factor": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj_pe.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj_pe.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.kv_a_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.softmax_temperature": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_b_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_b_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_b_proj_nope.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_b_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_nope_scale_factor": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj_pe.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj_pe.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.kv_a_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.softmax_temperature": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_b_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_b_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_b_proj_nope.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_b_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_nope_scale_factor": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj_pe.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj_pe.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.kv_a_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.softmax_temperature": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_b_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_b_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_b_proj_nope.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_b_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_nope_scale_factor": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj_pe.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj_pe.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.kv_a_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.softmax_temperature": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_b_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_b_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_b_proj_nope.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_b_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_nope_scale_factor": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj_pe.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj_pe.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.kv_a_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.softmax_temperature": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_b_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_b_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_b_proj_nope.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_b_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_nope_scale_factor": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj_pe.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj_pe.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.kv_a_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.softmax_temperature": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_b_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_b_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_b_proj_nope.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_b_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_nope_scale_factor": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj_pe.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj_pe.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.kv_a_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.softmax_temperature": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_b_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_b_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_b_proj_nope.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_b_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_nope_scale_factor": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj_pe.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj_pe.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.kv_a_proj_nope.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.softmax_temperature": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_b_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_b_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_b_proj_nope.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_b_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_nope_scale_factor": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj_pe.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj_pe.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.kv_a_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.softmax_temperature": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_b_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_b_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_b_proj_nope.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_b_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_nope_scale_factor": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj_pe.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj_pe.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.kv_a_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.softmax_temperature": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_b_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_b_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_b_proj_nope.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_b_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_nope_scale_factor": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj_pe.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj_pe.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.kv_a_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.softmax_temperature": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_b_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_b_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_b_proj_nope.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_b_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_nope_scale_factor": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj_pe.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj_pe.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.kv_a_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.softmax_temperature": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_b_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_b_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_b_proj_nope.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_b_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_nope_scale_factor": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj_pe.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj_pe.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.kv_a_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.softmax_temperature": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_b_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_b_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_b_proj_nope.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_b_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_nope_scale_factor": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj_pe.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj_pe.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.kv_a_proj_nope.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.softmax_temperature": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_b_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_b_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_b_proj_nope.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_b_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_nope_scale_factor": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj_pe.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj_pe.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.kv_a_proj_nope.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.softmax_temperature": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_b_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_b_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.mm_projector.mlp.0.bias": "model-00004-of-00004.safetensors",
+    "model.mm_projector.mlp.0.weight": "model-00004-of-00004.safetensors",
+    "model.mm_projector.mlp.2.bias": "model-00004-of-00004.safetensors",
+    "model.mm_projector.mlp.2.weight": "model-00004-of-00004.safetensors",
+    "model.norm.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.0.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.1.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.10.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.11.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.12.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.13.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.14.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.15.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.16.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.attn.proj.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.attn.proj.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.attn.qkv.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.norm2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.17.norm2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.attn.proj.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.attn.proj.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.attn.q_bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.attn.qkv.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.norm1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.norm1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.norm2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.18.norm2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.attn.proj.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.attn.proj.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.attn.q_bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.attn.qkv.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.norm1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.norm1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.norm2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.19.norm2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.2.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.attn.proj.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.attn.proj.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.attn.q_bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.attn.qkv.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.norm1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.norm1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.norm2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.20.norm2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.attn.proj.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.attn.proj.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.attn.q_bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.attn.qkv.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.norm1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.norm1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.norm2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.21.norm2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.attn.proj.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.attn.proj.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.attn.q_bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.attn.qkv.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.attn.v_bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.mlp.fc1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.mlp.fc1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.mlp.fc2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.mlp.fc2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.norm1.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.norm1.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.norm2.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.22.norm2.weight": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.3.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.4.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.5.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.6.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.7.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.8.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.attn.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.attn.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.attn.q_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.attn.qkv.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.attn.v_bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.mlp.fc1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.mlp.fc1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.mlp.fc2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.mlp.fc2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.norm1.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.norm1.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.norm2.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.blocks.9.norm2.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.patch_embed.proj.bias": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.patch_embed.proj.weight": "model-00003-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.vision_layernorm.bias": "model-00004-of-00004.safetensors",
+    "model.vision_tower.vision_tower.encoder.vision_layernorm.weight": "model-00004-of-00004.safetensors"
+  }
+}

modeling_qwen2_flash.py ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_videochat_flash.py ADDED Viewed

	@@ -0,0 +1,729 @@

+#    Copyright 2024
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+from abc import ABC, abstractmethod
+import re
+import torch
+import torch.nn as nn
+import random
+from typing import List, Optional, Tuple, Union, Dict
+from transformers import AutoConfig, AutoModelForCausalLM
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.generation.utils import GenerateOutput
+from transformers import Qwen2Config
+# from llava.model.language_model.vision_tower_builder import build_vision_tower
+# from llava.model.language_model.mm_projector_builder import build_vision_projector
+# from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_TOKEN
+# from llava.conversation import conv_templates, SeparatorStyle
+# from llava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria, get_anyres_image_grid_shape, load_video
+# # from .modeling_qwen2_flash import Qwen2Model_Flash, Qwen2ForCausalLM_Flash
+# # from .modeling_qwen2_flash_v7 import Qwen2Model_Flash, Qwen2ForCausalLM_Flash
+# from llava.model.modeling_qwen2_flash import Qwen2Model_Flash, Qwen2ForCausalLM_Flash
+from .vision_tower_builder import build_vision_tower
+from .mm_projector_builder import build_vision_projector
+from .constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_TOKEN
+from .conversation import conv_templates, SeparatorStyle
+from .mm_utils import tokenizer_image_token, KeywordsStoppingCriteria, get_anyres_image_grid_shape, load_video
+# from .modeling_qwen2_flash import Qwen2Model_Flash, Qwen2ForCausalLM_Flash
+# from .modeling_qwen2_flash_v7 import Qwen2Model_Flash, Qwen2ForCausalLM_Flash
+from .modeling_qwen2_flash import Qwen2Model_Flash, Qwen2ForCausalLM_Flash
+class LlavaMetaModel:
+    def __init__(self, config):
+        super(LlavaMetaModel, self).__init__(config)
+        if hasattr(config, "mm_vision_tower"):
+            delay_load = getattr(config, "delay_load", False)
+            self.vision_tower = build_vision_tower(config, delay_load=delay_load)
+            self.mm_projector = build_vision_projector(config, vision_cfg=self.vision_tower.config)
+            if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+                self.image_newline = nn.Parameter(torch.empty(config.hidden_size, dtype=self.dtype))
+            if "nopad" in getattr(config, "mm_patch_merge_type", "") and getattr(self.config, "mm_newline_position", "nothing") != "nothing":
+                self.frame_newline = nn.Parameter(torch.empty(config.hidden_size, dtype=self.dtype))
+    def get_vision_tower(self):
+        vision_tower = getattr(self, "vision_tower", None)
+        if type(vision_tower) is list:
+            vision_tower = vision_tower[0]
+        return vision_tower
+    def initialize_vision_modules(self, model_args, fsdp=None):
+        vision_tower = model_args.vision_tower
+        mm_vision_select_layer = model_args.mm_vision_select_layer
+        mm_vision_select_feature = model_args.mm_vision_select_feature
+        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+        mm_patch_merge_type = model_args.mm_patch_merge_type
+        self.config.mm_vision_tower = vision_tower
+        self.config.vision_tower_pretrained = getattr(model_args, "vision_tower_pretrained", "")
+        if self.get_vision_tower() is None:
+            vision_tower = build_vision_tower(model_args)
+            if fsdp is not None and len(fsdp) > 0:
+                self.vision_tower = [vision_tower]
+            else:
+                self.vision_tower = vision_tower
+        else:
+            if fsdp is not None and len(fsdp) > 0:
+                vision_tower = self.vision_tower[0]
+            else:
+                vision_tower = self.vision_tower
+            vision_tower.load_model()
+        self.config.use_mm_proj = True
+        self.config.mm_projector_type = getattr(model_args, "mm_projector_type", "linear")
+        self.config.mm_vision_select_layer = mm_vision_select_layer
+        self.config.mm_vision_select_feature = mm_vision_select_feature
+        self.config.mm_patch_merge_type = mm_patch_merge_type
+        if getattr(self, "mm_projector", None) is None:
+            self.mm_projector = build_vision_projector(self.config, vision_cfg=vision_tower.config)
+            if "unpad" in mm_patch_merge_type:
+                embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
+                self.image_newline = nn.Parameter(torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std)
+            if "nopad" in getattr(self.config, "mm_patch_merge_type", "") and getattr(self.config, "mm_newline_position", "nothing") != "nothing":
+                embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
+                self.frame_newline = nn.Parameter(torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std)
+        else:
+            # In case it is frozen by LoRA
+            for p in self.mm_projector.parameters():
+                p.requires_grad = True
+        if pretrain_mm_mlp_adapter is not None:
+            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location="cpu")
+            def get_w(weights, keyword):
+                return {k.split(keyword + ".")[1]: v for k, v in weights.items() if keyword in k}
+            if self.config.mm_projector_type =='lxh_qformer':
+                incompatible_keys = self.mm_projector.load_state_dict(get_w(mm_projector_weights, "mm_projector"), strict=False)
+            else:
+                incompatible_keys = self.mm_projector.load_state_dict(get_w(mm_projector_weights, "mm_projector"))
+            print(f"Loaded mm projector weights from {pretrain_mm_mlp_adapter}. Incompatible keys: {incompatible_keys}")
+class LlavaMetaForCausalLM(ABC):
+    """Abstract mixin holding the multimodal input preparation used by causal-LM wrappers.
+    Subclasses supply ``get_model``; the methods of this class read ``self.config`` and reach
+    the vision tower and projector through the object returned by ``get_model``.
+    """
+    @abstractmethod
+    def get_model(self):
+        """Return the underlying model object (implemented by subclasses)."""
+        pass
+    def get_vision_tower(self):
+        """Return the vision tower of the model returned by ``get_model``."""
+        return self.get_model().get_vision_tower()
+    def encode_video_image(self, images_list, video_idx_in_batch):
+        # video encoder编码后按图像的connector处理
+        bs = len(images_list)
+        concat_images = []
+        concat_videos = []
+        for idx, image in enumerate(images_list):
+            if idx in video_idx_in_batch:
+                concat_videos.append(image)
+            else:
+                concat_images.append(image)
+        # print(concat_videos[0].shape)
+        has_image = len(concat_images) > 0
+        has_video = len(concat_videos) > 0
+        mm_local_num_frames = getattr(self.config, "mm_local_num_frames", -1)
+        assert mm_local_num_frames != -1
+        if has_image:
+            image_split_sizes = [image.shape[0] for image in concat_images]
+            concat_images = torch.cat([image.unsqueeze(1) for image in concat_images], dim=0)
+            # print("input vit image.shape:", concat_images.shape)
+            images_features = self.get_model().get_vision_tower()(concat_images) # B_i, N, D
+            images_features = torch.split(images_features, image_split_sizes)
+        if has_video:
+            video_split_sizes = [video.shape[0] // mm_local_num_frames for video in concat_videos]
+            concat_videos = torch.cat([video.reshape(video.shape[0] // mm_local_num_frames, mm_local_num_frames, video.shape[1], video.shape[2], video.shape[3]) for video in concat_videos], dim=0)
+            # print("input vit video.shape:", concat_videos.shape)
+            videos_features = self.get_model().get_vision_tower()(concat_videos) # B_v, N, D
+            videos_features = [v.reshape(-1, v.shape[-2] // mm_local_num_frames, v.shape[-1]) for v in torch.split(videos_features, video_split_sizes)]
+        all_videos_or_images_features = []
+        img_idx = 0
+        vid_idx = 0
+        for idx in range(bs):
+            if idx in video_idx_in_batch:
+                feat = self.get_model().mm_projector(videos_features[vid_idx], compress=True, local_num_frames=getattr(self.config, "mm_local_num_frames", -1))
+                vid_idx += 1
+            else:
+                feat = self.get_model().mm_projector(images_features[img_idx], compress=False)
+                img_idx += 1
+            # print("video_idx_in_batch:", video_idx_in_batch)
+            all_videos_or_images_features.append(feat)
+        if has_video:
+            assert vid_idx == len(videos_features), f"vid: {vid_idx} != {len(videos_features)}"
+        if has_image:
+            assert img_idx == len(images_features), f"img: {img_idx} != {len(images_features)}"
+        return all_videos_or_images_features
+    def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None):
+        assert type(modalities) is list, modalities
+        mm_llm_compress = getattr(self.config, "mm_llm_compress", False)
+        # 初始化这些变量，无论 mm_llm_compress 是否为 True
+        first_image_token_position = []
+        text_prompt_lens = []
+        if mm_llm_compress:
+            self.model.llm_compress_type = getattr(self.config, "llm_compress_type", "attention")
+            self.model.llm_compress_layer_list = getattr(self.config, "llm_compress_layer_list", [8, 16, 24])
+            self.model.llm_image_token_ratio_list = getattr(self.config, "llm_image_token_ratio_list", [1.0, 0.5, 0.25, 0.125])
+        else:
+            self.model.llm_compress_type = "attention"
+            self.model.llm_compress_layer_list = []
+            self.model.llm_image_token_ratio_list = []
+        if images is None:
+            return input_ids, position_ids, attention_mask, past_key_values, None, labels
+        vision_tower = self.get_vision_tower()
+        # rank_print(modalities)
+        if vision_tower is None or images is None or input_ids.shape[1] == 1:
+            return input_ids, position_ids, attention_mask, past_key_values, None, labels
+        if type(images) is list or images.ndim == 5:
+            if type(images) is list:
+                images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
+            video_idx_in_batch = []
+            for _ in range(len(modalities)):
+                if modalities[_] == "video":
+                    video_idx_in_batch.append(_)
+            images_list = []
+            for image in images:
+                if image.ndim == 4:
+                    images_list.append(image)
+                else:
+                    images_list.append(image.unsqueeze(0))
+            vision_encode_type = getattr(self.config, "vision_encode_type", "image")
+            mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+            image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
+            frame_aspect_ratio = getattr(self.config, "frame_aspect_ratio", "square")
+            mm_newline_position = getattr(self.config, "mm_newline_position", "nothing")
+            if vision_encode_type == "video_image": # video backbone, process video with compress
+                image_features = self.encode_video_image(images_list, video_idx_in_batch=video_idx_in_batch)
+            else:
+                raise NotImplementedError(vision_encode_type)
+            if mm_patch_merge_type == "flat":
+                image_features = [x.flatten(0, 1) for x in image_features]
+            elif mm_patch_merge_type.startswith("spatial"):
+                new_image_features = []
+                for image_idx, image_feature in enumerate(image_features):
+                    if image_idx in video_idx_in_batch:  # video operations
+                        if "anyres" in frame_aspect_ratio:
+                            raise NotImplementedError
+                        else:
+                            frame_feature = image_feature
+                        if "pad" in mm_patch_merge_type:
+                            if mm_newline_position == 'one_token':
+                                frame_feature = frame_feature.flatten(0, 1)
+                                if "unpad" in mm_patch_merge_type:
+                                    frame_feature = torch.cat((frame_feature, self.model.image_newline[None].to(frame_feature.device)), dim=0)
+                                else:
+                                    frame_feature = torch.cat((frame_feature, self.model.frame_newline[None].to(frame_feature.device)), dim=0)
+                            elif mm_newline_position == 'nothing':
+                                frame_feature = frame_feature.flatten(0, 1)
+                            else:
+                                raise NotImplementedError("add pad please!!")
+                        else:
+                            frame_feature = frame_feature.flatten(0, 1)
+                        # print(f"final video frame_feature.shape: {frame_feature.shape}")
+                        image_feature = frame_feature
+                    elif image_feature.shape[0] > 1:  # multi patches and multi images operations
+                        base_image_feature = image_feature[0]
+                        image_feature = image_feature[1:]
+                        origin_size = image_feature.shape
+                        height = width = self.get_model().mm_projector.num_image_patches_per_side
+                        assert height * width == base_image_feature.shape[0], f"height:{height}, width: {width}, base_image_feature: {base_image_feature.shape}"
+                        if "anyres_max" in image_aspect_ratio:
+                            matched_anyres_max_num_patches = re.match(r"anyres_max_(\d+)", image_aspect_ratio)
+                            if matched_anyres_max_num_patches:
+                                max_num_patches = int(matched_anyres_max_num_patches.group(1))
+                        if "anyres" in image_aspect_ratio:
+                            if hasattr(self.get_vision_tower(), "image_size"):
+                                vision_tower_image_size = self.get_vision_tower().image_size
+                            else:
+                                raise ValueError("vision_tower_image_size is not found in the vision tower.")
+                            try:
+                                num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, vision_tower_image_size, max_resolutions=None)
+                            except Exception as e:
+                                print(f"Error: {e}")
+                                raise e
+                                # num_patch_width, num_patch_height = 2, 2
+                            image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
+                        else:
+                            raise NotImplementedError(image_aspect_ratio)
+                            image_feature = image_feature.view(2, 2, height, width, -1)
+                        if "maxpool2x2" in mm_patch_merge_type:
+                            raise NotImplementedError
+                        elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
+                            raise NotImplementedError
+                        elif "unpad" in mm_patch_merge_type:
+                            raise NotImplementedError
+                        else:
+                            image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
+                            image_feature = image_feature.flatten(0, 3)
+                        if "nobase" in mm_patch_merge_type:
+                            pass
+                        else:
+                            try:
+                                image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+                            except Exception as e:
+                                raise ValueError(f"{num_patch_width} {num_patch_height} now: base_image_feature: {base_image_feature.shape}, {image_feature.shape}, image_sizes[image_idx]: {image_sizes[image_idx]}, origin_size: {origin_size}, {image_sizes[image_idx]}, {self.config.image_grid_pinpoints}, {vision_tower_image_size}")
+                    else:  # single image operations
+                        image_feature = image_feature[0]
+                        if "unpad" in mm_patch_merge_type:
+                            image_feature = torch.cat((image_feature, self.model.image_newline[None]), dim=0)
+                    # print(f"image/video_feature.shape: {image_feature.shape}")
+                    new_image_features.append(image_feature)
+                image_features = new_image_features
+            else:
+                raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
+        else:
+            # raise NotImplementedError(f"images.shape={images.shape},  modalities={modalities}")
+            image_features = self.encode_image(images)
+        # TODO: image start / end is not implemented here to support pretraining.
+        if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(self.config, "mm_use_im_start_end", False):
+            raise NotImplementedError
+        # print(f"Total images len(image_features: {len(image_features)}")
+        # Let's just add dummy tensors if they do not exist,
+        # it is a headache to deal with None all the time.
+        # But it is not ideal, and if you have a better idea,
+        # please open an issue / submit a PR, thanks.
+        _labels = labels
+        _position_ids = position_ids
+        _attention_mask = attention_mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        else:
+            attention_mask = attention_mask.bool()
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        if labels is None:
+            labels = torch.full_like(input_ids, IGNORE_INDEX)
+        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
+        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
+        new_input_embeds = []
+        new_labels = []
+        cur_image_idx = 0
+        # rank_print("Inserting Images embedding")
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+            # 无论 mm_llm_compress 是否为 True，都记录图像位置信息
+            image_index = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()
+            if mm_llm_compress and len(image_index) > 1:
+                # 只有在 mm_llm_compress 模式下才检查单图像/视频限制
+                assert len(image_index) == 1, f"Only support single image/video: {image_index}"
+            if len(image_index) == 0:
+                first_image_token_position.append(-1)
+            else:
+                first_image_token_position.append(image_index[0])
+            # 记录输入指令长度
+            if not self.training:
+                if len(image_index) == 0:
+                    assert num_images == 0, num_images
+                else:
+                    if mm_llm_compress:
+                        assert num_images == 1, f"num_images={num_images}"
+                text_prompt_lens.append(cur_input_ids.shape[0] - num_images)
+            # print(f"num_images={num_images}")
+            if num_images == 0:
+                cur_image_features = image_features[cur_image_idx]
+                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
+                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+            split_sizes = [x.shape[0] for x in cur_labels_noim]
+            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
+            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
+            cur_new_input_embeds = []
+            cur_new_labels = []
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_images:
+                    try:
+                        cur_image_features = image_features[cur_image_idx]
+                    except IndexError:
+                        print(f"cur_image_idx={cur_image_idx} is not ok")
+                        cur_image_features = image_features[cur_image_idx - 1]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
+            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
+            # import pdb; pdb.set_trace()
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+        # 无论 mm_llm_compress 是否为 True，都设置这些属性
+        self.model.first_image_token_position = first_image_token_position
+        self.model.text_prompt_lens = text_prompt_lens
+        self.model.num_image_token_lens = [image_feature.shape[0] for image_feature in image_features]
+        self.model.image_sizes = image_sizes  # 保存图片尺寸信息
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
+        # rank_print("Finishing Inserting")
+        new_input_embeds = [x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)]
+        new_labels = [x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)]
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
+        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+        # print("Prepare pos id")
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.config, "tokenizer_padding_side", "right") == "left":
+                new_input_embeds_padded.append(torch.cat((torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), cur_new_embed), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+            else:
+                new_input_embeds_padded.append(torch.cat((cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+        # print("tokenizer padding")
+        if _labels is None:
+            new_labels = None
+        else:
+            new_labels = new_labels_padded
+        if _attention_mask is None:
+            attention_mask = None
+        else:
+            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
+        if _position_ids is None:
+            position_ids = None
+        if getattr(self.config, "use_pos_skipping", False) and self.training:
+            position_ids = torch.arange(new_input_embeds.size(1), device=new_input_embeds.device).unsqueeze(0).to(new_input_embeds.device)
+            split_position = random.randint(0, new_input_embeds.size(1))
+            left_add = random.randint(0, self.config.pos_skipping_range)
+            right_add = random.randint(left_add, self.config.pos_skipping_range)
+            position_ids[:, :split_position] += left_add
+            position_ids[:, split_position:] += right_add
+        # import pdb; pdb.set_trace()
+        # print("Finish preparing")
+        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
+    def initialize_vision_tokenizer(self, model_args, tokenizer):
+        if model_args.mm_use_im_patch_token:
+            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+            self.resize_token_embeddings(len(tokenizer))
+        if model_args.mm_use_im_start_end:
+            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+            self.resize_token_embeddings(len(tokenizer))
+            if num_new_tokens > 0:
+                input_embeddings = self.get_input_embeddings().weight.data
+                output_embeddings = self.get_output_embeddings().weight.data
+                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+                input_embeddings[-num_new_tokens:] = input_embeddings_avg
+                output_embeddings[-num_new_tokens:] = output_embeddings_avg
+            if model_args.tune_mm_mlp_adapter:
+                for p in self.get_input_embeddings().parameters():
+                    p.requires_grad = True
+                for p in self.get_output_embeddings().parameters():
+                    p.requires_grad = False
+            if model_args.pretrain_mm_mlp_adapter:
+                mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location="cpu")
+                embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"]
+                assert num_new_tokens == 2
+                if input_embeddings.shape == embed_tokens_weight.shape:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
+                elif embed_tokens_weight.shape[0] == num_new_tokens:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
+                else:
+                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
+        elif model_args.mm_use_im_patch_token:
+            if model_args.tune_mm_mlp_adapter:
+                for p in self.get_input_embeddings().parameters():
+                    p.requires_grad = False
+                for p in self.get_output_embeddings().parameters():
+                    p.requires_grad = False
+class VideoChatFlashQwenConfig(Qwen2Config):
+    """``Qwen2Config`` subclass that only overrides ``model_type``."""
+    # Identifier under which this config is registered with AutoConfig at the bottom of the file.
+    model_type = "videochat_flash_qwen"
+class VideoChatFlashQwenModel(LlavaMetaModel, Qwen2Model_Flash):
+    """``Qwen2Model_Flash`` combined with the ``LlavaMetaModel`` vision-module mixin."""
+    config_class = VideoChatFlashQwenConfig
+    def __init__(self, config: VideoChatFlashQwenConfig):
+        """Initialise through the MRO: ``LlavaMetaModel.__init__`` runs first and calls the next base."""
+        super(VideoChatFlashQwenModel, self).__init__(config)
+class VideoChatFlashQwenForCausalLM(LlavaMetaForCausalLM, Qwen2ForCausalLM_Flash):
+    """Causal LM wrapper combining the multimodal mixin with ``Qwen2ForCausalLM_Flash``."""
+    config_class = VideoChatFlashQwenConfig
+    def __init__(self, config):
+        """Build the wrapped ``VideoChatFlashQwenModel`` and the LM head from ``config``.
+        Note that ``config.model_type`` is overwritten in place.
+        """
+        print("#######using VideoChatFlashQwenForCausalLM#######")
+        # super(Qwen2ForCausalLM, self).__init__(config)
+        # The Qwen2 base initialiser is called explicitly rather than through super().
+        Qwen2ForCausalLM_Flash.__init__(self, config)
+        config.model_type = "videochat_flash_qwen"
+        # config.rope_scaling = None
+        # Replace the model and head created by the base initialiser with the multimodal ones.
+        self.model = VideoChatFlashQwenModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_model(self):
+        """Return the wrapped ``VideoChatFlashQwenModel``."""
+        return self.model
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[List[List[int]]] = None,
+        return_dict: Optional[bool] = None,
+        modalities: Optional[List[str]] = ["image"],
+        dpo_forward: Optional[bool] = False,
+        cache_position=None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        if inputs_embeds is None:
+            (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes)
+        # print("inputs_embeds.shape:", inputs_embeds.shape)
+        if dpo_forward:
+            raise NotImplementedError
+        else:
+            return super().forward(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                labels=labels,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        images: Optional[torch.Tensor] = None,
+        image_sizes: Optional[torch.Tensor] = None,
+        modalities: Optional[List[str]] = ["image"],
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        position_ids = kwargs.pop("position_ids", None)
+        attention_mask = kwargs.pop("attention_mask", None)
+        if "inputs_embeds" in kwargs:
+            raise NotImplementedError("`inputs_embeds` is not supported")
+        if images is not None:
+            (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes)
+        else:
+            self.model.image_token_posi = [-1]
+            self.model.prompt_len = None
+            self.model.image_tokens = [0]
+            inputs_embeds = self.get_model().embed_tokens(inputs)
+        return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs)
+    @torch.no_grad()
+    def chat(self,
+        video_path,
+        tokenizer,
+        user_prompt,
+        chat_history=None,
+        return_history=True,
+        max_num_frames=512,
+        media_dict=None,
+        generation_config={}):
+        frames, time_msg  = load_video(video_path, max_num_frames=max_num_frames, media_dict=media_dict)
+        image_sizes = [frames[0].shape[:2]]
+        frames = [self.get_vision_tower().image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(self.model.dtype).cuda()]
+        conv = conv_templates["qwen_2"].copy()
+        if chat_history is None or len(chat_history) == 0:
+            user_prompt = f'{DEFAULT_IMAGE_TOKEN}\n{time_msg.strip()} {user_prompt}'
+        else:
+            assert DEFAULT_IMAGE_TOKEN in chat_history[0]['content'], chat_history
+            for msg in chat_history:
+                conv.append_message(msg['role'], msg['content'])
+        conv.append_message(conv.roles[0], user_prompt)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
+        if tokenizer.pad_token_id is None:
+            if "qwen" in tokenizer.name_or_path.lower():
+                print("Setting pad token to bos token for qwen model.")
+                tokenizer.pad_token_id = 151643
+        attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
+        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+        keywords = [stop_str]
+        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+        with torch.inference_mode():
+            output_ids = self.generate(
+                inputs=input_ids,
+                images=frames,
+                attention_mask=attention_masks,
+                modalities=["video"],
+                image_sizes=image_sizes,
+                use_cache=True,
+                stopping_criteria=[stopping_criteria],
+                **generation_config
+            )
+        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+        if outputs.endswith(stop_str):
+            outputs = outputs[: -len(stop_str)]
+        outputs = outputs.strip()
+        # print(f"\033[91m== Question: \033[0m\n{prompt}\n")
+        # print(f"\033[91m== Response: \033[0m\n{outputs}\n")
+        if chat_history is None:
+            chat_history = []
+        chat_history.append({"role":conv.roles[0], "content":user_prompt})
+        chat_history.append({"role":conv.roles[1], "content":outputs})
+        if return_history:
+            return outputs, chat_history
+        else:
+            return outputs
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        images = kwargs.pop("images", None)
+        image_sizes = kwargs.pop("image_sizes", None)
+        inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs)
+        if images is not None:
+            inputs["images"] = images
+        if image_sizes is not None:
+            inputs["image_sizes"] = image_sizes
+        return inputs
+AutoConfig.register("videochat_flash_qwen", VideoChatFlashQwenConfig)  # register the config class under the "videochat_flash_qwen" model type
+AutoModelForCausalLM.register(VideoChatFlashQwenConfig, VideoChatFlashQwenForCausalLM)  # associate that config class with its causal-LM model class

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48697e6f44f6bdfe0ad925a45a9a2df793a136b4c53c1e41f11ef68e2d9a824c
+size 617358074

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85
+size 15984

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73
+size 15984

rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b
+size 15984

rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc
+size 15984

rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972
+size 15984

rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991
+size 15984

rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa
+size 15984

rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773
+size 15984

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d4eb3584f7b416bcaf3353a3f8d191c7b44eaba6cf7f601a4d00ebf29a582e0d
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,208 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- 
'<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2821 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.4840235588739972,
+  "eval_steps": 500,
+  "global_step": 4000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 25.625,
+      "learning_rate": 6.180469715698394e-07,
+      "loss": 1.7354,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 27.0,
+      "learning_rate": 1.2360939431396788e-06,
+      "loss": 1.7147,
+      "step": 20
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 23.375,
+      "learning_rate": 1.854140914709518e-06,
+      "loss": 1.7215,
+      "step": 30
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 22.375,
+      "learning_rate": 2.4721878862793575e-06,
+      "loss": 1.6731,
+      "step": 40
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 18.875,
+      "learning_rate": 3.090234857849197e-06,
+      "loss": 1.6182,
+      "step": 50
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 15.3125,
+      "learning_rate": 3.708281829419036e-06,
+      "loss": 1.5128,
+      "step": 60
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 9.9375,
+      "learning_rate": 4.326328800988875e-06,
+      "loss": 1.4061,
+      "step": 70
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 7.46875,
+      "learning_rate": 4.944375772558715e-06,
+      "loss": 1.332,
+      "step": 80
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 4.46875,
+      "learning_rate": 5.562422744128554e-06,
+      "loss": 1.1765,
+      "step": 90
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 4.25,
+      "learning_rate": 6.180469715698394e-06,
+      "loss": 1.165,
+      "step": 100
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.140625,
+      "learning_rate": 6.798516687268234e-06,
+      "loss": 1.0473,
+      "step": 110
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.390625,
+      "learning_rate": 7.416563658838072e-06,
+      "loss": 0.9959,
+      "step": 120
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.93359375,
+      "learning_rate": 8.034610630407912e-06,
+      "loss": 1.0219,
+      "step": 130
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.82421875,
+      "learning_rate": 8.65265760197775e-06,
+      "loss": 0.9495,
+      "step": 140
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.6796875,
+      "learning_rate": 9.27070457354759e-06,
+      "loss": 0.9312,
+      "step": 150
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.66796875,
+      "learning_rate": 9.88875154511743e-06,
+      "loss": 0.94,
+      "step": 160
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.71484375,
+      "learning_rate": 1.0506798516687269e-05,
+      "loss": 0.9317,
+      "step": 170
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.703125,
+      "learning_rate": 1.1124845488257108e-05,
+      "loss": 0.9579,
+      "step": 180
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.59375,
+      "learning_rate": 1.1742892459826947e-05,
+      "loss": 0.8969,
+      "step": 190
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.64453125,
+      "learning_rate": 1.2360939431396788e-05,
+      "loss": 0.9221,
+      "step": 200
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5859375,
+      "learning_rate": 1.2978986402966625e-05,
+      "loss": 0.8965,
+      "step": 210
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.62890625,
+      "learning_rate": 1.3597033374536467e-05,
+      "loss": 0.9268,
+      "step": 220
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.53515625,
+      "learning_rate": 1.4215080346106304e-05,
+      "loss": 0.9049,
+      "step": 230
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.6171875,
+      "learning_rate": 1.4833127317676143e-05,
+      "loss": 0.9096,
+      "step": 240
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.64453125,
+      "learning_rate": 1.5451174289245984e-05,
+      "loss": 0.8934,
+      "step": 250
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.671875,
+      "learning_rate": 1.6069221260815823e-05,
+      "loss": 0.8824,
+      "step": 260
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.84765625,
+      "learning_rate": 1.6687268232385662e-05,
+      "loss": 0.9045,
+      "step": 270
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5625,
+      "learning_rate": 1.73053152039555e-05,
+      "loss": 0.8949,
+      "step": 280
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.625,
+      "learning_rate": 1.792336217552534e-05,
+      "loss": 0.8893,
+      "step": 290
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.482421875,
+      "learning_rate": 1.854140914709518e-05,
+      "loss": 0.8906,
+      "step": 300
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.54296875,
+      "learning_rate": 1.915945611866502e-05,
+      "loss": 0.8883,
+      "step": 310
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.828125,
+      "learning_rate": 1.977750309023486e-05,
+      "loss": 0.8665,
+      "step": 320
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.66015625,
+      "learning_rate": 2.0395550061804696e-05,
+      "loss": 0.8452,
+      "step": 330
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.51953125,
+      "learning_rate": 2.1013597033374538e-05,
+      "loss": 0.8943,
+      "step": 340
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.490234375,
+      "learning_rate": 2.1631644004944377e-05,
+      "loss": 0.8802,
+      "step": 350
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5234375,
+      "learning_rate": 2.2249690976514216e-05,
+      "loss": 0.8798,
+      "step": 360
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.46484375,
+      "learning_rate": 2.2867737948084055e-05,
+      "loss": 0.8778,
+      "step": 370
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.55859375,
+      "learning_rate": 2.3485784919653894e-05,
+      "loss": 0.8314,
+      "step": 380
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.494140625,
+      "learning_rate": 2.4103831891223736e-05,
+      "loss": 0.8829,
+      "step": 390
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.46484375,
+      "learning_rate": 2.4721878862793575e-05,
+      "loss": 0.8734,
+      "step": 400
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5078125,
+      "learning_rate": 2.5339925834363414e-05,
+      "loss": 0.8421,
+      "step": 410
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.51953125,
+      "learning_rate": 2.595797280593325e-05,
+      "loss": 0.8636,
+      "step": 420
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.51953125,
+      "learning_rate": 2.6576019777503092e-05,
+      "loss": 0.856,
+      "step": 430
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.50390625,
+      "learning_rate": 2.7194066749072934e-05,
+      "loss": 0.8537,
+      "step": 440
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.56640625,
+      "learning_rate": 2.781211372064277e-05,
+      "loss": 0.8738,
+      "step": 450
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.47265625,
+      "learning_rate": 2.843016069221261e-05,
+      "loss": 0.8548,
+      "step": 460
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.484375,
+      "learning_rate": 2.904820766378245e-05,
+      "loss": 0.8214,
+      "step": 470
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.52734375,
+      "learning_rate": 2.9666254635352287e-05,
+      "loss": 0.8489,
+      "step": 480
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.474609375,
+      "learning_rate": 3.0284301606922126e-05,
+      "loss": 0.8641,
+      "step": 490
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.50390625,
+      "learning_rate": 3.090234857849197e-05,
+      "loss": 0.8443,
+      "step": 500
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.546875,
+      "learning_rate": 3.1520395550061804e-05,
+      "loss": 0.9063,
+      "step": 510
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.68359375,
+      "learning_rate": 3.2138442521631646e-05,
+      "loss": 0.8458,
+      "step": 520
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.494140625,
+      "learning_rate": 3.275648949320149e-05,
+      "loss": 0.8552,
+      "step": 530
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5,
+      "learning_rate": 3.3374536464771324e-05,
+      "loss": 0.825,
+      "step": 540
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.51171875,
+      "learning_rate": 3.3992583436341166e-05,
+      "loss": 0.8666,
+      "step": 550
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.54296875,
+      "learning_rate": 3.4610630407911e-05,
+      "loss": 0.8252,
+      "step": 560
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.69921875,
+      "learning_rate": 3.522867737948084e-05,
+      "loss": 0.86,
+      "step": 570
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.4921875,
+      "learning_rate": 3.584672435105068e-05,
+      "loss": 0.8664,
+      "step": 580
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.431640625,
+      "learning_rate": 3.646477132262052e-05,
+      "loss": 0.8137,
+      "step": 590
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5,
+      "learning_rate": 3.708281829419036e-05,
+      "loss": 0.8572,
+      "step": 600
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.466796875,
+      "learning_rate": 3.77008652657602e-05,
+      "loss": 0.8617,
+      "step": 610
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.46875,
+      "learning_rate": 3.831891223733004e-05,
+      "loss": 0.8579,
+      "step": 620
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.451171875,
+      "learning_rate": 3.893695920889988e-05,
+      "loss": 0.8581,
+      "step": 630
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.462890625,
+      "learning_rate": 3.955500618046972e-05,
+      "loss": 0.8633,
+      "step": 640
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.478515625,
+      "learning_rate": 4.0173053152039556e-05,
+      "loss": 0.852,
+      "step": 650
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.478515625,
+      "learning_rate": 4.079110012360939e-05,
+      "loss": 0.8598,
+      "step": 660
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.61328125,
+      "learning_rate": 4.1409147095179234e-05,
+      "loss": 0.844,
+      "step": 670
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.51953125,
+      "learning_rate": 4.2027194066749076e-05,
+      "loss": 0.834,
+      "step": 680
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.4453125,
+      "learning_rate": 4.264524103831891e-05,
+      "loss": 0.8257,
+      "step": 690
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.50390625,
+      "learning_rate": 4.3263288009888754e-05,
+      "loss": 0.8337,
+      "step": 700
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.431640625,
+      "learning_rate": 4.3881334981458596e-05,
+      "loss": 0.819,
+      "step": 710
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.455078125,
+      "learning_rate": 4.449938195302843e-05,
+      "loss": 0.8306,
+      "step": 720
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.47265625,
+      "learning_rate": 4.511742892459827e-05,
+      "loss": 0.8521,
+      "step": 730
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.54296875,
+      "learning_rate": 4.573547589616811e-05,
+      "loss": 0.8355,
+      "step": 740
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.486328125,
+      "learning_rate": 4.635352286773795e-05,
+      "loss": 0.8552,
+      "step": 750
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.455078125,
+      "learning_rate": 4.697156983930779e-05,
+      "loss": 0.814,
+      "step": 760
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.4296875,
+      "learning_rate": 4.758961681087763e-05,
+      "loss": 0.832,
+      "step": 770
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.447265625,
+      "learning_rate": 4.820766378244747e-05,
+      "loss": 0.8399,
+      "step": 780
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.447265625,
+      "learning_rate": 4.882571075401731e-05,
+      "loss": 0.7969,
+      "step": 790
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.4296875,
+      "learning_rate": 4.944375772558715e-05,
+      "loss": 0.8477,
+      "step": 800
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.49609375,
+      "learning_rate": 4.99999941211936e-05,
+      "loss": 0.8282,
+      "step": 810
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5,
+      "learning_rate": 4.999928866777183e-05,
+      "loss": 0.8227,
+      "step": 820
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.412109375,
+      "learning_rate": 4.999740749108744e-05,
+      "loss": 0.8077,
+      "step": 830
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.4453125,
+      "learning_rate": 4.99943506796127e-05,
+      "loss": 0.8447,
+      "step": 840
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.52734375,
+      "learning_rate": 4.999011837711028e-05,
+      "loss": 0.8327,
+      "step": 850
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.998471078262649e-05,
+      "loss": 0.8341,
+      "step": 860
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.43359375,
+      "learning_rate": 4.997812815048196e-05,
+      "loss": 0.8469,
+      "step": 870
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.453125,
+      "learning_rate": 4.9970370790259646e-05,
+      "loss": 0.8331,
+      "step": 880
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.458984375,
+      "learning_rate": 4.9961439066790275e-05,
+      "loss": 0.8161,
+      "step": 890
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.404296875,
+      "learning_rate": 4.995133340013522e-05,
+      "loss": 0.8255,
+      "step": 900
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.4375,
+      "learning_rate": 4.994005426556668e-05,
+      "loss": 0.8595,
+      "step": 910
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.44140625,
+      "learning_rate": 4.99276021935454e-05,
+      "loss": 0.8391,
+      "step": 920
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.419921875,
+      "learning_rate": 4.991397776969566e-05,
+      "loss": 0.8364,
+      "step": 930
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.439453125,
+      "learning_rate": 4.989918163477778e-05,
+      "loss": 0.8018,
+      "step": 940
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.9883214484657957e-05,
+      "loss": 0.8303,
+      "step": 950
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.470703125,
+      "learning_rate": 4.986607707027556e-05,
+      "loss": 0.837,
+      "step": 960
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.466796875,
+      "learning_rate": 4.9847770197607784e-05,
+      "loss": 0.8289,
+      "step": 970
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.4453125,
+      "learning_rate": 4.982829472763177e-05,
+      "loss": 0.8273,
+      "step": 980
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.470703125,
+      "learning_rate": 4.9807651576284104e-05,
+      "loss": 0.7904,
+      "step": 990
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.458984375,
+      "learning_rate": 4.9785841714417734e-05,
+      "loss": 0.8293,
+      "step": 1000
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.421875,
+      "learning_rate": 4.976286616775634e-05,
+      "loss": 0.8347,
+      "step": 1010
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.439453125,
+      "learning_rate": 4.973872601684604e-05,
+      "loss": 0.8279,
+      "step": 1020
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.470703125,
+      "learning_rate": 4.971342239700462e-05,
+      "loss": 0.8283,
+      "step": 1030
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.474609375,
+      "learning_rate": 4.9686956498268114e-05,
+      "loss": 0.8311,
+      "step": 1040
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.439453125,
+      "learning_rate": 4.9659329565334854e-05,
+      "loss": 0.8081,
+      "step": 1050
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.41796875,
+      "learning_rate": 4.963054289750693e-05,
+      "loss": 0.8201,
+      "step": 1060
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4765625,
+      "learning_rate": 4.960059784862905e-05,
+      "loss": 0.8056,
+      "step": 1070
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.55859375,
+      "learning_rate": 4.956949582702492e-05,
+      "loss": 0.8262,
+      "step": 1080
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.4765625,
+      "learning_rate": 4.953723829543095e-05,
+      "loss": 0.8043,
+      "step": 1090
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.412109375,
+      "learning_rate": 4.950382677092754e-05,
+      "loss": 0.8074,
+      "step": 1100
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.41796875,
+      "learning_rate": 4.946926282486766e-05,
+      "loss": 0.8122,
+      "step": 1110
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.4609375,
+      "learning_rate": 4.943354808280298e-05,
+      "loss": 0.7852,
+      "step": 1120
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.447265625,
+      "learning_rate": 4.9396684224407404e-05,
+      "loss": 0.8119,
+      "step": 1130
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.466796875,
+      "learning_rate": 4.9358672983398105e-05,
+      "loss": 0.8018,
+      "step": 1140
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.390625,
+      "learning_rate": 4.931951614745395e-05,
+      "loss": 0.7945,
+      "step": 1150
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.431640625,
+      "learning_rate": 4.927921555813148e-05,
+      "loss": 0.8152,
+      "step": 1160
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.4453125,
+      "learning_rate": 4.92377731107782e-05,
+      "loss": 0.8117,
+      "step": 1170
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.919519075444358e-05,
+      "loss": 0.7931,
+      "step": 1180
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.915147049178726e-05,
+      "loss": 0.7958,
+      "step": 1190
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.910661437898493e-05,
+      "loss": 0.8202,
+      "step": 1200
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.4453125,
+      "learning_rate": 4.906062452563165e-05,
+      "loss": 0.8097,
+      "step": 1210
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.4453125,
+      "learning_rate": 4.901350309464256e-05,
+      "loss": 0.7959,
+      "step": 1220
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.44140625,
+      "learning_rate": 4.8965252302151234e-05,
+      "loss": 0.7805,
+      "step": 1230
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.443359375,
+      "learning_rate": 4.8915874417405394e-05,
+      "loss": 0.8031,
+      "step": 1240
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.41015625,
+      "learning_rate": 4.886537176266024e-05,
+      "loss": 0.8371,
+      "step": 1250
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.881374671306917e-05,
+      "loss": 0.8063,
+      "step": 1260
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.4609375,
+      "learning_rate": 4.8761001696572166e-05,
+      "loss": 0.7884,
+      "step": 1270
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.40234375,
+      "learning_rate": 4.87071391937815e-05,
+      "loss": 0.7977,
+      "step": 1280
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4609375,
+      "learning_rate": 4.865216173786517e-05,
+      "loss": 0.8154,
+      "step": 1290
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.50390625,
+      "learning_rate": 4.859607191442768e-05,
+      "loss": 0.8157,
+      "step": 1300
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.435546875,
+      "learning_rate": 4.853887236138851e-05,
+      "loss": 0.792,
+      "step": 1310
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.42578125,
+      "learning_rate": 4.8480565768857995e-05,
+      "loss": 0.8177,
+      "step": 1320
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.43359375,
+      "learning_rate": 4.842115487901086e-05,
+      "loss": 0.8022,
+      "step": 1330
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.41796875,
+      "learning_rate": 4.8360642485957195e-05,
+      "loss": 0.8069,
+      "step": 1340
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.396484375,
+      "learning_rate": 4.829903143561113e-05,
+      "loss": 0.7789,
+      "step": 1350
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.453125,
+      "learning_rate": 4.823632462555691e-05,
+      "loss": 0.7911,
+      "step": 1360
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.46875,
+      "learning_rate": 4.817252500491264e-05,
+      "loss": 0.8172,
+      "step": 1370
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.435546875,
+      "learning_rate": 4.810763557419164e-05,
+      "loss": 0.8421,
+      "step": 1380
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.447265625,
+      "learning_rate": 4.8041659385161255e-05,
+      "loss": 0.8011,
+      "step": 1390
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.5078125,
+      "learning_rate": 4.7974599540699386e-05,
+      "loss": 0.8185,
+      "step": 1400
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.416015625,
+      "learning_rate": 4.7906459194648545e-05,
+      "loss": 0.7877,
+      "step": 1410
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.462890625,
+      "learning_rate": 4.783724155166751e-05,
+      "loss": 0.7888,
+      "step": 1420
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.423828125,
+      "learning_rate": 4.7766949867080624e-05,
+      "loss": 0.8004,
+      "step": 1430
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.412109375,
+      "learning_rate": 4.76955874467247e-05,
+      "loss": 0.8045,
+      "step": 1440
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.4921875,
+      "learning_rate": 4.762315764679353e-05,
+      "loss": 0.7865,
+      "step": 1450
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.458984375,
+      "learning_rate": 4.754966387368008e-05,
+      "loss": 0.782,
+      "step": 1460
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.4453125,
+      "learning_rate": 4.747510958381623e-05,
+      "loss": 0.7674,
+      "step": 1470
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.41796875,
+      "learning_rate": 4.739949828351028e-05,
+      "loss": 0.805,
+      "step": 1480
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.7322833528782e-05,
+      "loss": 0.7571,
+      "step": 1490
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.44921875,
+      "learning_rate": 4.7245118925195374e-05,
+      "loss": 0.8033,
+      "step": 1500
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.447265625,
+      "learning_rate": 4.716635812768911e-05,
+      "loss": 0.821,
+      "step": 1510
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.431640625,
+      "learning_rate": 4.7086554840404676e-05,
+      "loss": 0.8236,
+      "step": 1520
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.451171875,
+      "learning_rate": 4.7005712816512096e-05,
+      "loss": 0.7947,
+      "step": 1530
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.51953125,
+      "learning_rate": 4.6923835858033495e-05,
+      "loss": 0.7978,
+      "step": 1540
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.4921875,
+      "learning_rate": 4.684092781566422e-05,
+      "loss": 0.7876,
+      "step": 1550
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.408203125,
+      "learning_rate": 4.6756992588591775e-05,
+      "loss": 0.7836,
+      "step": 1560
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.408203125,
+      "learning_rate": 4.667203412431245e-05,
+      "loss": 0.7749,
+      "step": 1570
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.419921875,
+      "learning_rate": 4.658605641844564e-05,
+      "loss": 0.8152,
+      "step": 1580
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.431640625,
+      "learning_rate": 4.6499063514545934e-05,
+      "loss": 0.7645,
+      "step": 1590
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.447265625,
+      "learning_rate": 4.6411059503913e-05,
+      "loss": 0.7905,
+      "step": 1600
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.44921875,
+      "learning_rate": 4.63220485253991e-05,
+      "loss": 0.7851,
+      "step": 1610
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.435546875,
+      "learning_rate": 4.623203476521445e-05,
+      "loss": 0.809,
+      "step": 1620
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.400390625,
+      "learning_rate": 4.6141022456730395e-05,
+      "loss": 0.7852,
+      "step": 1630
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.44140625,
+      "learning_rate": 4.604901588028024e-05,
+      "loss": 0.7987,
+      "step": 1640
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.48046875,
+      "learning_rate": 4.5956019362958006e-05,
+      "loss": 0.8158,
+      "step": 1650
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.43359375,
+      "learning_rate": 4.586203727841488e-05,
+      "loss": 0.8109,
+      "step": 1660
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.408203125,
+      "learning_rate": 4.576707404665356e-05,
+      "loss": 0.7697,
+      "step": 1670
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.423828125,
+      "learning_rate": 4.567113413382034e-05,
+      "loss": 0.791,
+      "step": 1680
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.43359375,
+      "learning_rate": 4.5574222051995085e-05,
+      "loss": 0.7941,
+      "step": 1690
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.412109375,
+      "learning_rate": 4.547634235897906e-05,
+      "loss": 0.8203,
+      "step": 1700
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.451171875,
+      "learning_rate": 4.5377499658080525e-05,
+      "loss": 0.8188,
+      "step": 1710
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.42578125,
+      "learning_rate": 4.527769859789825e-05,
+      "loss": 0.8146,
+      "step": 1720
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.392578125,
+      "learning_rate": 4.5176943872102915e-05,
+      "loss": 0.7621,
+      "step": 1730
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.451171875,
+      "learning_rate": 4.5075240219216336e-05,
+      "loss": 0.801,
+      "step": 1740
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.4765625,
+      "learning_rate": 4.4972592422388634e-05,
+      "loss": 0.7702,
+      "step": 1750
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.390625,
+      "learning_rate": 4.486900530917329e-05,
+      "loss": 0.7888,
+      "step": 1760
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.44921875,
+      "learning_rate": 4.476448375130004e-05,
+      "loss": 0.7873,
+      "step": 1770
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.455078125,
+      "learning_rate": 4.4659032664445856e-05,
+      "loss": 0.7647,
+      "step": 1780
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.455078125,
+      "learning_rate": 4.4552657008003676e-05,
+      "loss": 0.7822,
+      "step": 1790
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.458984375,
+      "learning_rate": 4.4445361784849195e-05,
+      "loss": 0.7746,
+      "step": 1800
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.458984375,
+      "learning_rate": 4.43371520411056e-05,
+      "loss": 0.8167,
+      "step": 1810
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.431640625,
+      "learning_rate": 4.42280328659062e-05,
+      "loss": 0.8368,
+      "step": 1820
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.4118009391155124e-05,
+      "loss": 0.8029,
+      "step": 1830
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.396484375,
+      "learning_rate": 4.400708679128596e-05,
+      "loss": 0.8033,
+      "step": 1840
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.42578125,
+      "learning_rate": 4.389527028301836e-05,
+      "loss": 0.7797,
+      "step": 1850
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.412109375,
+      "learning_rate": 4.378256512511277e-05,
+      "loss": 0.8037,
+      "step": 1860
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.4921875,
+      "learning_rate": 4.366897661812304e-05,
+      "loss": 0.8031,
+      "step": 1870
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.46875,
+      "learning_rate": 4.355451010414716e-05,
+      "loss": 0.7943,
+      "step": 1880
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.41796875,
+      "learning_rate": 4.343917096657606e-05,
+      "loss": 0.7925,
+      "step": 1890
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.3322964629840344e-05,
+      "loss": 0.7984,
+      "step": 1900
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.4453125,
+      "learning_rate": 4.320589655915527e-05,
+      "loss": 0.8129,
+      "step": 1910
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.308797226026364e-05,
+      "loss": 0.7955,
+      "step": 1920
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.42578125,
+      "learning_rate": 4.296919727917692e-05,
+      "loss": 0.8326,
+      "step": 1930
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.439453125,
+      "learning_rate": 4.284957720191438e-05,
+      "loss": 0.8345,
+      "step": 1940
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3828125,
+      "learning_rate": 4.272911765424039e-05,
+      "loss": 0.7972,
+      "step": 1950
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.412109375,
+      "learning_rate": 4.260782430139984e-05,
+      "loss": 0.7851,
+      "step": 1960
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.44140625,
+      "learning_rate": 4.248570284785172e-05,
+      "loss": 0.8103,
+      "step": 1970
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.466796875,
+      "learning_rate": 4.236275903700078e-05,
+      "loss": 0.8216,
+      "step": 1980
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.431640625,
+      "learning_rate": 4.223899865092749e-05,
+      "loss": 0.8012,
+      "step": 1990
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.443359375,
+      "learning_rate": 4.2114427510116036e-05,
+      "loss": 0.7701,
+      "step": 2000
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.44921875,
+      "learning_rate": 4.198905147318065e-05,
+      "loss": 0.7926,
+      "step": 2010
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.43359375,
+      "learning_rate": 4.186287643659005e-05,
+      "loss": 0.8113,
+      "step": 2020
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.423828125,
+      "learning_rate": 4.1735908334390085e-05,
+      "loss": 0.8093,
+      "step": 2030
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3828125,
+      "learning_rate": 4.1608153137924723e-05,
+      "loss": 0.7778,
+      "step": 2040
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.427734375,
+      "learning_rate": 4.147961685555517e-05,
+      "loss": 0.8171,
+      "step": 2050
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.43359375,
+      "learning_rate": 4.1350305532377334e-05,
+      "loss": 0.8025,
+      "step": 2060
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.404296875,
+      "learning_rate": 4.122022524993747e-05,
+      "loss": 0.7875,
+      "step": 2070
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.41015625,
+      "learning_rate": 4.108938212594622e-05,
+      "loss": 0.7867,
+      "step": 2080
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.4375,
+      "learning_rate": 4.095778231399086e-05,
+      "loss": 0.7849,
+      "step": 2090
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.470703125,
+      "learning_rate": 4.08254320032459e-05,
+      "loss": 0.8028,
+      "step": 2100
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.44921875,
+      "learning_rate": 4.0692337418182014e-05,
+      "loss": 0.8001,
+      "step": 2110
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.474609375,
+      "learning_rate": 4.055850481827329e-05,
+      "loss": 0.8469,
+      "step": 2120
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.43359375,
+      "learning_rate": 4.042394049770286e-05,
+      "loss": 0.7858,
+      "step": 2130
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.400390625,
+      "learning_rate": 4.0288650785066886e-05,
+      "loss": 0.8021,
+      "step": 2140
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.4453125,
+      "learning_rate": 4.0152642043076884e-05,
+      "loss": 0.8165,
+      "step": 2150
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.43359375,
+      "learning_rate": 4.0015920668260544e-05,
+      "loss": 0.7834,
+      "step": 2160
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.435546875,
+      "learning_rate": 3.987849309066085e-05,
+      "loss": 0.8018,
+      "step": 2170
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.3984375,
+      "learning_rate": 3.9740365773533704e-05,
+      "loss": 0.7935,
+      "step": 2180
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.4453125,
+      "learning_rate": 3.960154521304394e-05,
+      "loss": 0.7974,
+      "step": 2190
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.439453125,
+      "learning_rate": 3.946203793795982e-05,
+      "loss": 0.7654,
+      "step": 2200
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.4375,
+      "learning_rate": 3.9321850509345946e-05,
+      "loss": 0.7982,
+      "step": 2210
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.458984375,
+      "learning_rate": 3.9180989520254785e-05,
+      "loss": 0.8053,
+      "step": 2220
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.42578125,
+      "learning_rate": 3.903946159541647e-05,
+      "loss": 0.8066,
+      "step": 2230
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.447265625,
+      "learning_rate": 3.889727339092736e-05,
+      "loss": 0.8322,
+      "step": 2240
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.453125,
+      "learning_rate": 3.875443159393689e-05,
+      "loss": 0.7522,
+      "step": 2250
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.48046875,
+      "learning_rate": 3.861094292233316e-05,
+      "loss": 0.8174,
+      "step": 2260
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.455078125,
+      "learning_rate": 3.8466814124426945e-05,
+      "loss": 0.8254,
+      "step": 2270
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.419921875,
+      "learning_rate": 3.8322051978634325e-05,
+      "loss": 0.7808,
+      "step": 2280
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.42578125,
+      "learning_rate": 3.817666329315792e-05,
+      "loss": 0.7936,
+      "step": 2290
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.451171875,
+      "learning_rate": 3.803065490566667e-05,
+      "loss": 0.8087,
+      "step": 2300
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.4296875,
+      "learning_rate": 3.788403368297426e-05,
+      "loss": 0.8025,
+      "step": 2310
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.453125,
+      "learning_rate": 3.773680652071619e-05,
+      "loss": 0.8164,
+      "step": 2320
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.447265625,
+      "learning_rate": 3.758898034302547e-05,
+      "loss": 0.8024,
+      "step": 2330
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.439453125,
+      "learning_rate": 3.7440562102206924e-05,
+      "loss": 0.7758,
+      "step": 2340
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.419921875,
+      "learning_rate": 3.7291558778410314e-05,
+      "loss": 0.7855,
+      "step": 2350
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.423828125,
+      "learning_rate": 3.714197737930199e-05,
+      "loss": 0.7729,
+      "step": 2360
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.400390625,
+      "learning_rate": 3.699182493973532e-05,
+      "loss": 0.8203,
+      "step": 2370
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.453125,
+      "learning_rate": 3.684110852141991e-05,
+      "loss": 0.8143,
+      "step": 2380
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.44921875,
+      "learning_rate": 3.668983521258938e-05,
+      "loss": 0.7886,
+      "step": 2390
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.470703125,
+      "learning_rate": 3.65380121276681e-05,
+      "loss": 0.804,
+      "step": 2400
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.43359375,
+      "learning_rate": 3.638564640693654e-05,
+      "loss": 0.7876,
+      "step": 2410
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.478515625,
+      "learning_rate": 3.623274521619549e-05,
+      "loss": 0.8019,
+      "step": 2420
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.4140625,
+      "learning_rate": 3.607931574642902e-05,
+      "loss": 0.8213,
+      "step": 2430
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.43359375,
+      "learning_rate": 3.5925365213466314e-05,
+      "loss": 0.7727,
+      "step": 2440
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.451171875,
+      "learning_rate": 3.5770900857642306e-05,
+      "loss": 0.8027,
+      "step": 2450
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.44140625,
+      "learning_rate": 3.561592994345715e-05,
+      "loss": 0.7515,
+      "step": 2460
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.431640625,
+      "learning_rate": 3.546045975923458e-05,
+      "loss": 0.7791,
+      "step": 2470
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.423828125,
+      "learning_rate": 3.530449761677911e-05,
+      "loss": 0.7931,
+      "step": 2480
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.484375,
+      "learning_rate": 3.514805085103222e-05,
+      "loss": 0.8273,
+      "step": 2490
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.4453125,
+      "learning_rate": 3.499112681972734e-05,
+      "loss": 0.7935,
+      "step": 2500
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.41796875,
+      "learning_rate": 3.4833732903043806e-05,
+      "loss": 0.8248,
+      "step": 2510
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.470703125,
+      "learning_rate": 3.467587650325981e-05,
+      "loss": 0.8168,
+      "step": 2520
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.416015625,
+      "learning_rate": 3.4517565044404266e-05,
+      "loss": 0.8103,
+      "step": 2530
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.421875,
+      "learning_rate": 3.4358805971907593e-05,
+      "loss": 0.8084,
+      "step": 2540
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.412109375,
+      "learning_rate": 3.4199606752251634e-05,
+      "loss": 0.8073,
+      "step": 2550
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.41796875,
+      "learning_rate": 3.403997487261846e-05,
+      "loss": 0.7934,
+      "step": 2560
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.427734375,
+      "learning_rate": 3.3879917840538265e-05,
+      "loss": 0.7868,
+      "step": 2570
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.45703125,
+      "learning_rate": 3.371944318353626e-05,
+      "loss": 0.8131,
+      "step": 2580
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.408203125,
+      "learning_rate": 3.355855844877869e-05,
+      "loss": 0.8016,
+      "step": 2590
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.51171875,
+      "learning_rate": 3.3397271202717834e-05,
+      "loss": 0.784,
+      "step": 2600
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.427734375,
+      "learning_rate": 3.323558903073623e-05,
+      "loss": 0.795,
+      "step": 2610
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.435546875,
+      "learning_rate": 3.307351953678986e-05,
+      "loss": 0.7958,
+      "step": 2620
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.466796875,
+      "learning_rate": 3.2911070343050555e-05,
+      "loss": 0.7914,
+      "step": 2630
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.41796875,
+      "learning_rate": 3.274824908954756e-05,
+      "loss": 0.7871,
+      "step": 2640
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.458984375,
+      "learning_rate": 3.258506343380815e-05,
+      "loss": 0.785,
+      "step": 2650
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.412109375,
+      "learning_rate": 3.242152105049758e-05,
+      "loss": 0.8188,
+      "step": 2660
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.427734375,
+      "learning_rate": 3.2257629631058066e-05,
+      "loss": 0.7935,
+      "step": 2670
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.466796875,
+      "learning_rate": 3.20933968833471e-05,
+      "loss": 0.8115,
+      "step": 2680
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.43359375,
+      "learning_rate": 3.1928830531274935e-05,
+      "loss": 0.8129,
+      "step": 2690
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.392578125,
+      "learning_rate": 3.176393831444131e-05,
+      "loss": 0.7732,
+      "step": 2700
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.435546875,
+      "learning_rate": 3.159872798777149e-05,
+      "loss": 0.794,
+      "step": 2710
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.421875,
+      "learning_rate": 3.143320732115153e-05,
+      "loss": 0.7794,
+      "step": 2720
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.40234375,
+      "learning_rate": 3.126738409906284e-05,
+      "loss": 0.7922,
+      "step": 2730
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.451171875,
+      "learning_rate": 3.110126612021613e-05,
+      "loss": 0.7881,
+      "step": 2740
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.427734375,
+      "learning_rate": 3.093486119718455e-05,
+      "loss": 0.7701,
+      "step": 2750
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.4140625,
+      "learning_rate": 3.076817715603634e-05,
+      "loss": 0.7826,
+      "step": 2760
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.451171875,
+      "learning_rate": 3.060122183596676e-05,
+      "loss": 0.7625,
+      "step": 2770
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.435546875,
+      "learning_rate": 3.0434003088929354e-05,
+      "loss": 0.7932,
+      "step": 2780
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.40625,
+      "learning_rate": 3.0266528779266722e-05,
+      "loss": 0.7694,
+      "step": 2790
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.453125,
+      "learning_rate": 3.0098806783340644e-05,
+      "loss": 0.7738,
+      "step": 2800
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.4140625,
+      "learning_rate": 2.993084498916165e-05,
+      "loss": 0.7805,
+      "step": 2810
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.43359375,
+      "learning_rate": 2.9762651296018047e-05,
+      "loss": 0.7879,
+      "step": 2820
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.435546875,
+      "learning_rate": 2.9594233614104412e-05,
+      "loss": 0.814,
+      "step": 2830
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.3984375,
+      "learning_rate": 2.942559986414957e-05,
+      "loss": 0.789,
+      "step": 2840
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.439453125,
+      "learning_rate": 2.925675797704411e-05,
+      "loss": 0.815,
+      "step": 2850
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.388671875,
+      "learning_rate": 2.908771589346731e-05,
+      "loss": 0.7808,
+      "step": 2860
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.447265625,
+      "learning_rate": 2.89184815635138e-05,
+      "loss": 0.7706,
+      "step": 2870
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.427734375,
+      "learning_rate": 2.8749062946319576e-05,
+      "loss": 0.7939,
+      "step": 2880
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.419921875,
+      "learning_rate": 2.857946800968773e-05,
+      "loss": 0.7898,
+      "step": 2890
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.51953125,
+      "learning_rate": 2.8409704729713694e-05,
+      "loss": 0.8139,
+      "step": 2900
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.431640625,
+      "learning_rate": 2.8239781090410133e-05,
+      "loss": 0.8046,
+      "step": 2910
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.4765625,
+      "learning_rate": 2.806970508333146e-05,
+      "loss": 0.7808,
+      "step": 2920
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.4296875,
+      "learning_rate": 2.789948470719798e-05,
+      "loss": 0.7892,
+      "step": 2930
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.416015625,
+      "learning_rate": 2.772912796751972e-05,
+      "loss": 0.7499,
+      "step": 2940
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.474609375,
+      "learning_rate": 2.755864287621992e-05,
+      "loss": 0.7509,
+      "step": 2950
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.435546875,
+      "learning_rate": 2.738803745125821e-05,
+      "loss": 0.8001,
+      "step": 2960
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.396484375,
+      "learning_rate": 2.7217319716253574e-05,
+      "loss": 0.7911,
+      "step": 2970
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.4296875,
+      "learning_rate": 2.704649770010696e-05,
+      "loss": 0.8088,
+      "step": 2980
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.416015625,
+      "learning_rate": 2.6875579436623678e-05,
+      "loss": 0.8155,
+      "step": 2990
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.431640625,
+      "learning_rate": 2.6704572964135576e-05,
+      "loss": 0.7625,
+      "step": 3000
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.416015625,
+      "learning_rate": 2.6533486325123007e-05,
+      "loss": 0.7729,
+      "step": 3010
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.4140625,
+      "learning_rate": 2.636232756583657e-05,
+      "loss": 0.7772,
+      "step": 3020
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.431640625,
+      "learning_rate": 2.6191104735918686e-05,
+      "loss": 0.7969,
+      "step": 3030
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.451171875,
+      "learning_rate": 2.601982588802507e-05,
+      "loss": 0.8003,
+      "step": 3040
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.44921875,
+      "learning_rate": 2.5848499077445932e-05,
+      "loss": 0.8036,
+      "step": 3050
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.44921875,
+      "learning_rate": 2.5677132361727225e-05,
+      "loss": 0.7889,
+      "step": 3060
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.40234375,
+      "learning_rate": 2.5505733800291622e-05,
+      "loss": 0.7891,
+      "step": 3070
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.42578125,
+      "learning_rate": 2.5334311454059505e-05,
+      "loss": 0.7805,
+      "step": 3080
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.416015625,
+      "learning_rate": 2.516287338506989e-05,
+      "loss": 0.8171,
+      "step": 3090
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.41015625,
+      "learning_rate": 2.499142765610122e-05,
+      "loss": 0.7768,
+      "step": 3100
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.40234375,
+      "learning_rate": 2.4819982330292184e-05,
+      "loss": 0.822,
+      "step": 3110
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.4375,
+      "learning_rate": 2.4648545470762517e-05,
+      "loss": 0.7701,
+      "step": 3120
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.443359375,
+      "learning_rate": 2.4477125140233787e-05,
+      "loss": 0.7995,
+      "step": 3130
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.4609375,
+      "learning_rate": 2.4305729400650192e-05,
+      "loss": 0.7826,
+      "step": 3140
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.4296875,
+      "learning_rate": 2.4134366312799412e-05,
+      "loss": 0.7764,
+      "step": 3150
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.396484375,
+      "learning_rate": 2.396304393593351e-05,
+      "loss": 0.793,
+      "step": 3160
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.423828125,
+      "learning_rate": 2.37917703273899e-05,
+      "loss": 0.7768,
+      "step": 3170
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.431640625,
+      "learning_rate": 2.362055354221241e-05,
+      "loss": 0.778,
+      "step": 3180
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.4140625,
+      "learning_rate": 2.3449401632772445e-05,
+      "loss": 0.8069,
+      "step": 3190
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.423828125,
+      "learning_rate": 2.3278322648390298e-05,
+      "loss": 0.7816,
+      "step": 3200
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.4453125,
+      "learning_rate": 2.3107324634956552e-05,
+      "loss": 0.7878,
+      "step": 3210
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.42578125,
+      "learning_rate": 2.2936415634553727e-05,
+      "loss": 0.8075,
+      "step": 3220
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.4296875,
+      "learning_rate": 2.276560368507803e-05,
+      "loss": 0.7748,
+      "step": 3230
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.435546875,
+      "learning_rate": 2.2594896819861345e-05,
+      "loss": 0.7655,
+      "step": 3240
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.41796875,
+      "learning_rate": 2.2424303067293396e-05,
+      "loss": 0.7799,
+      "step": 3250
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.439453125,
+      "learning_rate": 2.2253830450444202e-05,
+      "loss": 0.7805,
+      "step": 3260
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.4453125,
+      "learning_rate": 2.208348698668674e-05,
+      "loss": 0.7737,
+      "step": 3270
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.431640625,
+      "learning_rate": 2.191328068731987e-05,
+      "loss": 0.816,
+      "step": 3280
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.408203125,
+      "learning_rate": 2.1743219557191585e-05,
+      "loss": 0.7772,
+      "step": 3290
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.400390625,
+      "learning_rate": 2.157331159432253e-05,
+      "loss": 0.7636,
+      "step": 3300
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.408203125,
+      "learning_rate": 2.1403564789529833e-05,
+      "loss": 0.7981,
+      "step": 3310
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.3671875,
+      "learning_rate": 2.123398712605134e-05,
+      "loss": 0.7901,
+      "step": 3320
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.40234375,
+      "learning_rate": 2.1064586579170124e-05,
+      "loss": 0.7919,
+      "step": 3330
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.41796875,
+      "learning_rate": 2.0895371115839413e-05,
+      "loss": 0.7885,
+      "step": 3340
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.45703125,
+      "learning_rate": 2.0726348694307916e-05,
+      "loss": 0.7947,
+      "step": 3350
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.412109375,
+      "learning_rate": 2.0557527263745522e-05,
+      "loss": 0.8202,
+      "step": 3360
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.419921875,
+      "learning_rate": 2.038891476386948e-05,
+      "loss": 0.8103,
+      "step": 3370
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.423828125,
+      "learning_rate": 2.0220519124570946e-05,
+      "loss": 0.7949,
+      "step": 3380
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.42578125,
+      "learning_rate": 2.0052348265542087e-05,
+      "loss": 0.7849,
+      "step": 3390
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.3828125,
+      "learning_rate": 1.9884410095903587e-05,
+      "loss": 0.7817,
+      "step": 3400
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.39453125,
+      "learning_rate": 1.971671251383268e-05,
+      "loss": 0.8121,
+      "step": 3410
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.4140625,
+      "learning_rate": 1.9549263406191707e-05,
+      "loss": 0.787,
+      "step": 3420
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.42578125,
+      "learning_rate": 1.9382070648157187e-05,
+      "loss": 0.7708,
+      "step": 3430
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.44140625,
+      "learning_rate": 1.921514210284945e-05,
+      "loss": 0.779,
+      "step": 3440
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.423828125,
+      "learning_rate": 1.9048485620962833e-05,
+      "loss": 0.7638,
+      "step": 3450
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.431640625,
+      "learning_rate": 1.8882109040396456e-05,
+      "loss": 0.7859,
+      "step": 3460
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.41015625,
+      "learning_rate": 1.8716020185885598e-05,
+      "loss": 0.7698,
+      "step": 3470
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.37890625,
+      "learning_rate": 1.855022686863372e-05,
+      "loss": 0.7937,
+      "step": 3480
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.39453125,
+      "learning_rate": 1.838473688594506e-05,
+      "loss": 0.7875,
+      "step": 3490
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.416015625,
+      "learning_rate": 1.821955802085798e-05,
+      "loss": 0.8009,
+      "step": 3500
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.404296875,
+      "learning_rate": 1.8054698041778877e-05,
+      "loss": 0.7664,
+      "step": 3510
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.3828125,
+      "learning_rate": 1.7890164702116867e-05,
+      "loss": 0.7846,
+      "step": 3520
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.435546875,
+      "learning_rate": 1.7725965739919113e-05,
+      "loss": 0.7939,
+      "step": 3530
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.39453125,
+      "learning_rate": 1.756210887750692e-05,
+      "loss": 0.7927,
+      "step": 3540
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.4140625,
+      "learning_rate": 1.7398601821112552e-05,
+      "loss": 0.8056,
+      "step": 3550
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.447265625,
+      "learning_rate": 1.7235452260516804e-05,
+      "loss": 0.8037,
+      "step": 3560
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.41796875,
+      "learning_rate": 1.7072667868687348e-05,
+      "loss": 0.7432,
+      "step": 3570
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.421875,
+      "learning_rate": 1.6910256301417856e-05,
+      "loss": 0.7778,
+      "step": 3580
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.369140625,
+      "learning_rate": 1.6748225196967983e-05,
+      "loss": 0.7727,
+      "step": 3590
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.41015625,
+      "learning_rate": 1.6586582175704092e-05,
+      "loss": 0.7997,
+      "step": 3600
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.4375,
+      "learning_rate": 1.6425334839740915e-05,
+      "loss": 0.7967,
+      "step": 3610
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.3984375,
+      "learning_rate": 1.6264490772583985e-05,
+      "loss": 0.8328,
+      "step": 3620
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.42578125,
+      "learning_rate": 1.6104057538772975e-05,
+      "loss": 0.8032,
+      "step": 3630
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.416015625,
+      "learning_rate": 1.594404268352599e-05,
+      "loss": 0.7699,
+      "step": 3640
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.369140625,
+      "learning_rate": 1.5784453732384652e-05,
+      "loss": 0.8065,
+      "step": 3650
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.42578125,
+      "learning_rate": 1.562529819086023e-05,
+      "loss": 0.7946,
+      "step": 3660
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.423828125,
+      "learning_rate": 1.5466583544080587e-05,
+      "loss": 0.7898,
+      "step": 3670
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.49609375,
+      "learning_rate": 1.5308317256438205e-05,
+      "loss": 0.7733,
+      "step": 3680
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.48046875,
+      "learning_rate": 1.5150506771239114e-05,
+      "loss": 0.7995,
+      "step": 3690
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.423828125,
+      "learning_rate": 1.4993159510352836e-05,
+      "loss": 0.7843,
+      "step": 3700
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.41796875,
+      "learning_rate": 1.4836282873863319e-05,
+      "loss": 0.7567,
+      "step": 3710
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.427734375,
+      "learning_rate": 1.4679884239720928e-05,
+      "loss": 0.7699,
+      "step": 3720
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.421875,
+      "learning_rate": 1.452397096339545e-05,
+      "loss": 0.7625,
+      "step": 3730
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.447265625,
+      "learning_rate": 1.436855037753016e-05,
+      "loss": 0.764,
+      "step": 3740
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.46875,
+      "learning_rate": 1.421362979159695e-05,
+      "loss": 0.7931,
+      "step": 3750
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.44921875,
+      "learning_rate": 1.4059216491552621e-05,
+      "loss": 0.7986,
+      "step": 3760
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.423828125,
+      "learning_rate": 1.3905317739496143e-05,
+      "loss": 0.791,
+      "step": 3770
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.4375,
+      "learning_rate": 1.3751940773327194e-05,
+      "loss": 0.7896,
+      "step": 3780
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.439453125,
+      "learning_rate": 1.3599092806405677e-05,
+      "loss": 0.77,
+      "step": 3790
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.404296875,
+      "learning_rate": 1.3446781027212563e-05,
+      "loss": 0.8162,
+      "step": 3800
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.431640625,
+      "learning_rate": 1.329501259901173e-05,
+      "loss": 0.7702,
+      "step": 3810
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.435546875,
+      "learning_rate": 1.3143794659513153e-05,
+      "loss": 0.7743,
+      "step": 3820
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.43359375,
+      "learning_rate": 1.299313432053713e-05,
+      "loss": 0.7876,
+      "step": 3830
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.44140625,
+      "learning_rate": 1.2843038667679905e-05,
+      "loss": 0.7682,
+      "step": 3840
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.431640625,
+      "learning_rate": 1.2693514759980345e-05,
+      "loss": 0.7882,
+      "step": 3850
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.435546875,
+      "learning_rate": 1.2544569629587996e-05,
+      "loss": 0.7782,
+      "step": 3860
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.4296875,
+      "learning_rate": 1.2396210281432374e-05,
+      "loss": 0.8012,
+      "step": 3870
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.447265625,
+      "learning_rate": 1.2248443692893463e-05,
+      "loss": 0.8027,
+      "step": 3880
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.42578125,
+      "learning_rate": 1.2101276813473642e-05,
+      "loss": 0.7991,
+      "step": 3890
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.458984375,
+      "learning_rate": 1.1954716564470772e-05,
+      "loss": 0.784,
+      "step": 3900
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.388671875,
+      "learning_rate": 1.1808769838652755e-05,
+      "loss": 0.7678,
+      "step": 3910
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.4140625,
+      "learning_rate": 1.1663443499933303e-05,
+      "loss": 0.8044,
+      "step": 3920
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.43359375,
+      "learning_rate": 1.1518744383049187e-05,
+      "loss": 0.7707,
+      "step": 3930
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.421875,
+      "learning_rate": 1.1374679293238733e-05,
+      "loss": 0.7842,
+      "step": 3940
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.4453125,
+      "learning_rate": 1.1231255005921845e-05,
+      "loss": 0.7971,
+      "step": 3950
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.388671875,
+      "learning_rate": 1.1088478266381258e-05,
+      "loss": 0.7996,
+      "step": 3960
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.423828125,
+      "learning_rate": 1.094635578944541e-05,
+      "loss": 0.7945,
+      "step": 3970
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.421875,
+      "learning_rate": 1.0804894259172579e-05,
+      "loss": 0.8138,
+      "step": 3980
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.427734375,
+      "learning_rate": 1.0664100328536525e-05,
+      "loss": 0.8029,
+      "step": 3990
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.412109375,
+      "learning_rate": 1.0523980619113655e-05,
+      "loss": 0.8133,
+      "step": 4000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 5390,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 200,
+  "total_flos": 4.738673495402086e+19,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7a679998340bfc147dea361d69c35dae027c71bfd6ee3a5e8a4f777e6456595
+size 4920

vision_tower_builder.py ADDED Viewed

	@@ -0,0 +1,632 @@

+from typing import Optional, Tuple, Union, Dict
+from dataclasses import dataclass
+from functools import partial, reduce
+from PIL import Image
+import os
+from transformers.image_processing_utils import BatchFeature, get_size_dict
+from transformers.image_transforms import (
+    convert_to_rgb,
+    normalize,
+    rescale,
+    resize,
+    to_channel_dimension_format,
+)
+from transformers.image_utils import (
+    ChannelDimension,
+    PILImageResampling,
+    to_numpy_array,
+)
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from functools import partial
+try:
+    from flash_attn import flash_attn_qkvpacked_func
+    use_flash_attn = True
+except:
+    use_flash_attn = False
+    print("You need to install flash_attn to be faster!")
+try:
+    from timm.layers import drop_path, to_2tuple, trunc_normal_
+except:
+    from timm.models.layers import drop_path, trunc_normal_, to_2tuple
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+    def extra_repr(self) -> str:
+        return 'p={}'.format(self.drop_prob)
+class Mlp(nn.Module):
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class Attention(nn.Module):
+    def __init__(
+            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+            proj_drop=0., attn_head_dim=None,
+            attn_type='flash_v2'):
+        if use_flash_attn:
+            attn_type = attn_type
+        else:
+            attn_type = 'origin'
+        print(attn_type)
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        else:
+            self.q_bias = None
+            self.v_bias = None
+        if attn_type not in ['origin', 'flash_v2']:
+            raise NotImplementedError(f"Not support attn_type: {attn_type}")
+        # print('umt:', f'attn_type: {attn_type}')
+        self.attn_type = attn_type
+        if attn_type == 'flash_v2':
+            self.attn_drop = attn_drop
+        else:
+            self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv_bias = None
+        if self.q_bias is not None:
+            qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
+        # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+        if self.attn_type == 'flash_v2':
+            qkv = qkv.reshape(B, N, 3, self.num_heads, -1)
+            x = flash_attn_qkvpacked_func(qkv, dropout_p=self.attn_drop, softmax_scale=self.scale, causal=False).reshape(B, N, -1)
+        else:
+            qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+            q, k, v = qkv[0], qkv[1], qkv[
+                2]  # make torchscript happy (cannot use tensor as tuple)
+            # B num_heads N head_dim
+            q = q * self.scale
+            attn = (q @ k.transpose(-2, -1))
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class Block(nn.Module):
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
+                 attn_head_dim=None):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+            attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+        if init_values > 0:
+            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+        else:
+            self.gamma_1, self.gamma_2 = None, None
+    def forward(self, x):
+        if self.gamma_1 is None:
+            x = x + self.drop_path(self.attn(self.norm1(x)))
+            x = x + self.drop_path(self.mlp(self.norm2(x)))
+        else:
+            x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
+            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+        return x
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, num_frames=16, tubelet_size=2):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        self.tubelet_size = int(tubelet_size)
+        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+        self.proj = nn.Conv3d(
+            in_channels=in_chans, out_channels=embed_dim,
+            kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
+            stride=(self.tubelet_size, patch_size[0], patch_size[1])
+        )
+        # print('umt:', f'Num of patches: {num_patches}')
+    def forward(self, x, **kwargs):
+        B, C, T, H, W = x.shape
+        # FIXME look at relaxing size constraints
+        # assert H == self.img_size[0] and W == self.img_size[1], \
+        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        x = self.proj(x).flatten(2).transpose(1, 2)
+        return x
+# sin-cos position encoding
+# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
+def get_sinusoid_encoding_table(n_position, d_hid, ckpt_num_frame=-1, cur_frame=12):
+    ''' Sinusoid position encoding table '''
+    # TODO: make it with torch instead of numpy
+    def get_position_angle_vec(position):
+        return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
+    if ckpt_num_frame != -1 and ckpt_num_frame != cur_frame:
+        # print('umt:', f"Interpolate position embedding")
+        # print('umt:', f"Testing frame: {cur_frame}")
+        # print('umt:', f"Checkpoint frame: {ckpt_num_frame}")
+        T = ckpt_num_frame # checkpoint frame
+        new_T = cur_frame # testing frame
+        n_position = n_position // new_T * T # generate checkpoint position embedding
+        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
+        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
+        sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
+        # interpolate
+        P = int((n_position // T) ** 0.5)
+        C = d_hid
+        sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
+        sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T)  # BHW, C, T
+        sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
+        sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3) # B, T, H, W, C
+        sinusoid_table = sinusoid_table.flatten(1, 3)
+        return sinusoid_table
+    else:
+        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
+        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
+        return torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
+def get_sinusoid_encoding_table2(n_position=784, d_hid=1024, cur_frame=8, ckpt_num_frame=4, pre_n_position=784):
+    ''' Sinusoid position encoding table '''
+    # TODO: make it with torch instead of numpy
+    def get_position_angle_vec(position):
+        return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
+    # generate checkpoint position embedding
+    sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(pre_n_position)])
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
+    sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
+    # print(f"n_position: {n_position}")
+    # print(f"pre_n_position: {pre_n_position}")
+    if n_position != pre_n_position:
+        T = ckpt_num_frame # checkpoint frame
+        P = 14 # checkpoint size
+        C = d_hid
+        new_P = int((n_position // cur_frame) ** 0.5) # testing size
+        # print(f'Pretraining uses 14x14, but current version is {new_P}x{new_P}')
+        # print(f'Interpolate the position embedding')
+        sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
+        sinusoid_table = sinusoid_table.reshape(-1, P, P, C).permute(0, 3, 1, 2)
+        sinusoid_table = torch.nn.functional.interpolate(
+            sinusoid_table, size=(new_P, new_P), mode='bicubic', align_corners=False)
+        # BT, C, H, W -> BT, H, W, C ->  B, T, H, W, C
+        sinusoid_table = sinusoid_table.permute(0, 2, 3, 1).reshape(-1, T, new_P, new_P, C)
+        sinusoid_table = sinusoid_table.flatten(1, 3)  # B, THW, C
+    if cur_frame != ckpt_num_frame:
+        # print(f'Pretraining uses 4 frames, but current frame is {cur_frame}')
+        # print(f'Interpolate the position embedding')
+        T = ckpt_num_frame # checkpoint frame
+        new_T = cur_frame # testing frame
+        # interpolate
+        P = int((n_position // cur_frame) ** 0.5) # testing size
+        C = d_hid
+        sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
+        sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T)  # BHW, C, T
+        sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
+        sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3) # B, T, H, W, C
+        sinusoid_table = sinusoid_table.flatten(1, 3)  # B, THW, C
+    return sinusoid_table
+class PretrainVisionTransformerEncoder(nn.Module):
+    """ Vision Transformer with support for patch or hybrid CNN input stage
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, depth=12,
+                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, num_frames=8, tubelet_size=1,
+                 use_learnable_pos_emb=False,
+                 use_checkpoint=False, checkpoint_num=0,
+                 ckpt_num_frame=-1, with_ln=True, return_index=-1
+                 ):
+        super().__init__()
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
+            num_frames=num_frames, tubelet_size=tubelet_size
+        )
+        num_patches = self.patch_embed.num_patches
+        self.depth = depth + return_index + 1
+        self.use_checkpoint = use_checkpoint
+        self.checkpoint_num = checkpoint_num
+        # print('umt:', f"Use checkpoint: {use_checkpoint}")
+        # print('umt:', f"Checkpoint number: {checkpoint_num}")
+        # print('UMT:', f"Real runing depth: {self.depth}")
+        # TODO: Add the cls token
+        if use_learnable_pos_emb:
+            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+            self.img_pos_embed = nn.Parameter(torch.zeros(1, num_patches//(num_frames//tubelet_size) + 1, embed_dim))
+        else:
+            # sine-cosine positional embeddings
+            if img_size != 224:
+                self.pos_embed = get_sinusoid_encoding_table2(num_patches, embed_dim, ckpt_num_frame=ckpt_num_frame, cur_frame=num_frames//tubelet_size)
+                self.img_pos_embed = get_sinusoid_encoding_table2(num_patches//(num_frames//tubelet_size), embed_dim, cur_frame=1, ckpt_num_frame=1, pre_n_position=14*14)
+            else:
+                self.pos_embed = get_sinusoid_encoding_table(num_patches, embed_dim, ckpt_num_frame=ckpt_num_frame, cur_frame=num_frames//tubelet_size)
+                self.img_pos_embed = get_sinusoid_encoding_table(num_patches//(num_frames//tubelet_size), embed_dim)
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                init_values=init_values)
+            for i in range(self.depth)])
+        if with_ln:
+            self.vision_layernorm = nn.LayerNorm(embed_dim, eps=1e-12)
+        else:
+            self.vision_layernorm = nn.Identity()
+        if use_learnable_pos_emb:
+            trunc_normal_(self.pos_embed, std=.02)
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}
+    def forward_features(self, x, use_image=False):
+        x = self.patch_embed(x)
+        if use_image:
+            x = x + self.img_pos_embed.type_as(x).to(x.device).clone().detach()
+        else:
+            x = x + self.pos_embed.type_as(x).to(x.device).clone().detach()
+        B, _, C = x.shape
+        x_vis = x
+        for idx, blk in enumerate(self.blocks):
+            if self.use_checkpoint and idx < self.checkpoint_num:
+                x_vis = checkpoint.checkpoint(blk, x_vis)
+            else:
+                x_vis = blk(x_vis)
+        # with ln ot not
+        x_vis = self.vision_layernorm(x_vis)
+        return x_vis
+    def forward(self, x, use_image=False):
+        x_vis = self.forward_features(x, use_image)
+        return x_vis
+class PretrainVisionTransformer(nn.Module):
+    """ Vision Transformer with support for patch or hybrid CNN input stage
+    """
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 encoder_in_chans=3,
+                 encoder_embed_dim=768,
+                 encoder_depth=12,
+                 encoder_num_heads=12,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                 init_values=0.,
+                 use_learnable_pos_emb=False,
+                 num_frames=8,
+                 tubelet_size=1,
+                 use_checkpoint=False,
+                 checkpoint_num=0,
+                 ckpt_num_frame=4, # the pretrained model uses 4 frames
+                 return_index=-1,
+                 with_ln=False
+                ):
+        super().__init__()
+        self.encoder = PretrainVisionTransformerEncoder(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=encoder_in_chans,
+            embed_dim=encoder_embed_dim,
+            depth=encoder_depth,
+            num_heads=encoder_num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            drop_rate=drop_rate,
+            attn_drop_rate=attn_drop_rate,
+            drop_path_rate=drop_path_rate,
+            norm_layer=norm_layer,
+            init_values=init_values,
+            num_frames=num_frames,
+            tubelet_size=tubelet_size,
+            use_learnable_pos_emb=use_learnable_pos_emb,
+            use_checkpoint=use_checkpoint,
+            checkpoint_num=checkpoint_num,
+            ckpt_num_frame=ckpt_num_frame,
+            with_ln=with_ln,
+            return_index=return_index
+        )
+        # print('umt:', f'With LN: {with_ln}')
+        # print('UMT:', f'Total {encoder_depth} layer')
+        # print('UMT:', f'Return {encoder_depth+return_index+1}-th layer')
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            nn.init.xavier_uniform_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token', 'clip_pos_embed'}
+    def forward(self, x, use_image=False):
+        T = x.shape[2]
+        x_vis = self.encoder(x, use_image) # [B, N_vis, C_e]
+        B, TL, C = x_vis.shape
+        x_vis = x_vis.view(B, T, TL // T, C)
+        return x_vis
+class UMTImageProcessor:
+    def __init__(self, image_mean=(0.485, 0.456, 0.406), image_std=(0.229, 0.224, 0.225), size=(224, 224), crop_size: Dict[str, int] = None, resample=PILImageResampling.BICUBIC, rescale_factor=1 / 255, data_format=ChannelDimension.FIRST):
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.size = size
+        self.resample = resample
+        self.rescale_factor = rescale_factor
+        self.data_format = data_format
+        self.crop_size = crop_size
+    def preprocess(self, images, return_tensors, target_size=None):
+        if isinstance(images, Image.Image):
+            images = [images]
+        else:
+            # to adapt video data
+            images = [to_numpy_array(image) for image in images]
+            assert isinstance(images, list)
+        if target_size is None:
+            target_size = self.size
+        transforms = [
+            convert_to_rgb,
+            to_numpy_array,
+            partial(resize, size=target_size, resample=self.resample, data_format=self.data_format),
+            partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
+            partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
+            partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
+        ]
+        images = reduce(lambda x, f: [*map(f, x)], transforms, images)
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+class UMTVisionConfig:
+    model_type = "umt_vision_model"
+    def __init__(
+        self,
+        num_frames=4,
+        hidden_size=1024,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=224,
+        patch_size=16,
+        return_idx=-2
+        # **kwargs,
+    ):
+        # super().__init__(**kwargs)
+        self.num_frames = num_frames
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.return_idx = return_idx
+def build_vit(config, pt_type='origin'):
+    model = PretrainVisionTransformer(
+        img_size=config.image_size,
+        patch_size=16,
+        encoder_embed_dim=1024,
+        encoder_depth=24,
+        encoder_num_heads=16,
+        drop_path_rate=0.,
+        num_frames=config.num_frames,
+        tubelet_size=1,
+        use_checkpoint=False,
+        checkpoint_num=24,
+        return_index=config.return_idx,
+        with_ln=True, # merge vision_layernorm in it
+    )
+    # no need to load pt
+    return model
+class UMTVisionTower(nn.Module):
+    def __init__(self, vision_tower, vision_tower_cfg, delay_load=False, pt_type='origin', image_size=224):
+        super().__init__()
+        self.is_loaded = False
+        self.pt_type = pt_type
+        self.config = UMTVisionConfig(num_frames=vision_tower_cfg.mm_local_num_frames, return_idx=vision_tower_cfg.mm_vision_select_layer, image_size=image_size)
+        self.vision_tower_name = vision_tower
+        self.image_processor = UMTImageProcessor(size=(image_size, image_size))
+        if not delay_load:
+            print(f"Loading vision tower: {vision_tower}")
+            self.load_model()
+        elif getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False):
+            # TODO: better detector is needed.
+            print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
+            self.load_model()
+        elif hasattr(vision_tower_cfg, "mm_tunable_parts") and "mm_vision_tower" in vision_tower_cfg.mm_tunable_parts:
+            print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.")
+            self.load_model()
+        else:
+            self.cfg_only = self.config
+    def load_model(self, device_map=None):
+        if self.is_loaded:
+            print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name))
+            return
+        self.vision_tower = build_vit(self.config, pt_type=self.pt_type)
+        self.vision_tower.requires_grad_(False)
+        self.is_loaded = True
+    def forward(self, images):
+        if type(images) is list:
+            raise NotImplementedError
+        else:
+            # input: B T C H W
+            # output: B T*L C
+            T = images.shape[1]
+            images = images.permute(0, 2, 1, 3, 4)
+            image_embeds = self.vision_tower(images, use_image=(T == 1))
+            B, T, L, C = image_embeds.shape
+            image_embeds = image_embeds.reshape(B, -1, C)
+        return image_embeds
+    @property
+    def dummy_feature(self):
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+    @property
+    def dtype(self):
+        for p in self.vision_tower.parameters():
+            return p.dtype
+    @property
+    def device(self):
+        for p in self.vision_tower.parameters():
+            return p.device
+    @property
+    def hidden_size(self):
+        return self.config.hidden_size
+    @property
+    def num_patches(self):
+        return (self.config.image_size // self.config.patch_size) ** 2
+    @property
+    def num_patches_per_side(self):
+        return self.config.image_size // self.config.patch_size
+    @property
+    def image_size(self):
+        return self.config.image_size
+def build_vision_tower(vision_tower_cfg, **kwargs):
+    vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
+    if "umt-hd" in vision_tower:
+        return UMTVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, image_size=448, **kwargs)
+    elif "umt" in vision_tower:
+        return UMTVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
+    raise ValueError(f"Unknown vision tower: {vision_tower}")

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff