machuofan committed on
Commit 7385f22 · 1 Parent(s): bf8190d
Files changed (50)
  1. .gitattributes +0 -35
  2. constants.py +27 -0
  3. conversation.py +460 -0
  4. i2t.py +192 -0
  5. mm_utils.py +105 -0
  6. model/__init__.py +7 -0
  7. model/__pycache__/__init__.cpython-311.pyc +0 -0
  8. model/__pycache__/__init__.cpython-39.pyc +0 -0
  9. model/__pycache__/arhead.cpython-311.pyc +0 -0
  10. model/__pycache__/arhead.cpython-39.pyc +0 -0
  11. model/__pycache__/builder.cpython-311.pyc +0 -0
  12. model/__pycache__/builder.cpython-39.pyc +0 -0
  13. model/__pycache__/liquid.cpython-311.pyc +0 -0
  14. model/__pycache__/mini_gemini_arch.cpython-311.pyc +0 -0
  15. model/__pycache__/mini_gemini_arch.cpython-39.pyc +0 -0
  16. model/__pycache__/quant.cpython-311.pyc +0 -0
  17. model/__pycache__/quant.cpython-39.pyc +0 -0
  18. model/arhead.py +241 -0
  19. model/builder.py +138 -0
  20. model/language_model/__pycache__/mini_gemini_llama.cpython-311.pyc +0 -0
  21. model/language_model/__pycache__/mini_gemini_llama.cpython-39.pyc +0 -0
  22. model/language_model/mini_gemini_llama.py +488 -0
  23. model/liquid.py +669 -0
  24. model/multimodal_encoder/__pycache__/builder.cpython-311.pyc +0 -0
  25. model/multimodal_encoder/__pycache__/builder.cpython-39.pyc +0 -0
  26. model/multimodal_encoder/__pycache__/clip_encoder.cpython-311.pyc +0 -0
  27. model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc +0 -0
  28. model/multimodal_encoder/__pycache__/eva_encoder.cpython-311.pyc +0 -0
  29. model/multimodal_encoder/__pycache__/eva_encoder.cpython-39.pyc +0 -0
  30. model/multimodal_encoder/__pycache__/openclip_encoder.cpython-311.pyc +0 -0
  31. model/multimodal_encoder/__pycache__/openclip_encoder.cpython-39.pyc +0 -0
  32. model/multimodal_encoder/builder.py +33 -0
  33. model/multimodal_encoder/clip_encoder.py +89 -0
  34. model/multimodal_encoder/eva_encoder.py +551 -0
  35. model/multimodal_encoder/openclip_encoder.py +188 -0
  36. model/multimodal_projector/__pycache__/builder.cpython-311.pyc +0 -0
  37. model/multimodal_projector/__pycache__/builder.cpython-39.pyc +0 -0
  38. model/multimodal_projector/builder.py +50 -0
  39. model/processor/__pycache__/video_processor.cpython-311.pyc +0 -0
  40. model/processor/__pycache__/video_processor.cpython-39.pyc +0 -0
  41. model/processor/video_processor.py +74 -0
  42. model/quant.py +519 -0
  43. t2i.py +224 -0
  44. tools.py +126 -0
  45. unitok/config.py +243 -0
  46. unitok/dist.py +302 -0
  47. unitok/model.py +184 -0
  48. unitok/quant.py +185 -0
  49. unitok/vitamin.py +792 -0
  50. unitok/vqvae.py +175 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
constants.py ADDED
@@ -0,0 +1,27 @@
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
+ WORKER_HEART_BEAT_INTERVAL = 15
+
+ LOGDIR = "."
+
+ # Model Constants
+ IGNORE_INDEX = -100
+ IMAGE_TOKEN_INDEX = -200
+ PREDICT_TOKEN_INDEX = -300
+ DEFAULT_IMAGE_TOKEN = "<image>"
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+ DEFAULT_IM_START_TOKEN = "<im_start>"
+ DEFAULT_IM_END_TOKEN = "<im_end>"
+ IMAGE_PLACEHOLDER = "<image-placeholder>"
+ DEFAULT_PREDICT_TOKEN = "<predict>"
+
+ DESCRIPT_PROMPT = [
+     "Describe this image thoroughly.",
+     "Provide a detailed description in this picture.",
+     "Detail every aspect of what's in this picture.",
+     "Explain this image with precision and detail.",
+     "Give a comprehensive description of this visual.",
+     "Elaborate on the specifics within this image.",
+     "Offer a detailed account of this picture's contents.",
+     "Describe in detail what this image portrays.",
+     "Break down this image into detailed descriptions.",
+     "Provide a thorough description of the elements in this image."]
conversation.py ADDED
@@ -0,0 +1,460 @@
1
+ import dataclasses
2
+ from enum import auto, Enum
3
+ from typing import List, Tuple
4
+ import base64
5
+ from io import BytesIO
6
+ from PIL import Image
7
+
8
+
9
+ class SeparatorStyle(Enum):
10
+ """Different separator style."""
11
+ SINGLE = auto()
12
+ TWO = auto()
13
+ MPT = auto()
14
+ PLAIN = auto()
15
+ LLAMA_2 = auto()
16
+ GEMMA = auto()
17
+
18
+
19
+ @dataclasses.dataclass
20
+ class Conversation:
21
+ """A class that keeps all conversation history."""
22
+ system: str
23
+ roles: List[str]
24
+ messages: List[List[str]]
25
+ offset: int
26
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
27
+ sep: str = "###"
28
+ sep2: str = None
29
+ version: str = "Unknown"
30
+
31
+ skip_next: bool = False
32
+
33
+ def get_prompt(self):
34
+ messages = self.messages
35
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
36
+ messages = self.messages.copy()
37
+ init_role, init_msg = messages[0].copy()
38
+ init_msg = init_msg[0].replace("<image>", "").strip()
39
+ if 'mmtag' in self.version:
40
+ messages[0] = (init_role, init_msg)
41
+ messages.insert(0, (self.roles[0], "<Image><image></Image>"))
42
+ messages.insert(1, (self.roles[1], "Received."))
43
+ else:
44
+ messages[0] = (init_role, "<image>\n" + init_msg)
45
+
46
+ if self.sep_style == SeparatorStyle.SINGLE:
47
+ ret = self.system + self.sep
48
+ for role, message in messages:
49
+ if message:
50
+ if type(message) is tuple:
51
+ message = message[0]
52
+ ret += role + ": " + message + self.sep
53
+ else:
54
+ ret += role + ":"
55
+ elif self.sep_style == SeparatorStyle.TWO:
56
+ seps = [self.sep, self.sep2]
57
+ ret = self.system + seps[0]
58
+ for i, (role, message) in enumerate(messages):
59
+ if message:
60
+ if type(message) is tuple:
61
+ message = message[0]
62
+ ret += role + ": " + message + seps[i % 2]
63
+ else:
64
+ ret += role + ":"
65
+ elif self.sep_style == SeparatorStyle.MPT:
66
+ ret = self.system + self.sep
67
+ for role, message in messages:
68
+ if message:
69
+ if type(message) is tuple:
70
+ message = message[0]
71
+ ret += role + message + self.sep
72
+ else:
73
+ ret += role
74
+ elif self.sep_style == SeparatorStyle.LLAMA_2:
75
+ wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
76
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
77
+ ret = ""
78
+
79
+ for i, (role, message) in enumerate(messages):
80
+ if i == 0:
81
+ assert message, "first message should not be none"
82
+ assert role == self.roles[0], "first message should come from user"
83
+ if message:
84
+ if type(message) is tuple:
85
+ message, _, _ = message
86
+ if i == 0: message = wrap_sys(self.system) + message
87
+ if i % 2 == 0:
88
+ message = wrap_inst(message)
89
+ ret += self.sep + message
90
+ else:
91
+ ret += " " + message + " " + self.sep2
92
+ else:
93
+ ret += ""
94
+ ret = ret.lstrip(self.sep)
95
+ elif self.sep_style == SeparatorStyle.GEMMA:
96
+ seps = [self.sep, self.sep2]
97
+ ret = self.system + seps[0]
98
+ for i, (role, message) in enumerate(messages):
99
+ if message:
100
+ if type(message) is tuple:
101
+ message, _, _ = message
102
+ ret += "<start_of_turn>" + role + "\n" + message + "<end_of_turn>\n" + seps[i % 2]
103
+ else:
104
+ ret += "<start_of_turn>" + role + "\n"
105
+ elif self.sep_style == SeparatorStyle.PLAIN:
106
+ seps = [self.sep, self.sep2]
107
+ ret = self.system
108
+ for i, (role, message) in enumerate(messages):
109
+ if message:
110
+ if type(message) is tuple:
111
+ message, _, _ = message
112
+ ret += message + seps[i % 2]
113
+ else:
114
+ ret += ""
115
+ else:
116
+ raise ValueError(f"Invalid style: {self.sep_style}")
117
+
118
+ return ret
119
+
120
+ def append_message(self, role, message):
121
+ self.messages.append([role, message])
122
+
123
+ def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
124
+ if image_process_mode == "Pad":
125
+ def expand2square(pil_img, background_color=(122, 116, 104)):
126
+ width, height = pil_img.size
127
+ if width == height:
128
+ return pil_img
129
+ elif width > height:
130
+ result = Image.new(pil_img.mode, (width, width), background_color)
131
+ result.paste(pil_img, (0, (width - height) // 2))
132
+ return result
133
+ else:
134
+ result = Image.new(pil_img.mode, (height, height), background_color)
135
+ result.paste(pil_img, ((height - width) // 2, 0))
136
+ return result
137
+ image = expand2square(image)
138
+ elif image_process_mode in ["Default", "Crop"]:
139
+ pass
140
+ elif image_process_mode == "Resize":
141
+ image = image.resize((336, 336))
142
+ else:
143
+ raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
144
+ if max(image.size) > max_len:
145
+ max_hw, min_hw = max(image.size), min(image.size)
146
+ aspect_ratio = max_hw / min_hw
147
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
148
+ longest_edge = int(shortest_edge * aspect_ratio)
149
+ W, H = image.size
150
+ if H > W:
151
+ H, W = longest_edge, shortest_edge
152
+ else:
153
+ H, W = shortest_edge, longest_edge
154
+ image = image.resize((W, H))
155
+ if return_pil:
156
+ return image
157
+ else:
158
+ buffered = BytesIO()
159
+ image.save(buffered, format=image_format)
160
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
161
+ return img_b64_str
162
+
163
+ def get_images(self, return_pil=False):
164
+ images = []
165
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
166
+ if i % 2 == 0:
167
+ if type(msg) is tuple:
168
+ msg, image, image_process_mode = msg
169
+ image = self.process_image(image, image_process_mode, return_pil=return_pil)
170
+ images.append(image)
171
+ return images
172
+
173
+ def to_gradio_chatbot(self):
174
+ ret = []
175
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
176
+ if i % 2 == 0:
177
+ if type(msg) is tuple:
178
+ msg, image, image_process_mode = msg
179
+ img_b64_str = self.process_image(
180
+ image, "Default", return_pil=False,
181
+ image_format='JPEG')
182
+ img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
183
+ msg = img_str + msg.replace('<image>', '').strip()
184
+ ret.append([msg, None])
185
+ else:
186
+ ret.append([msg, None])
187
+ else:
188
+ if type(msg) is tuple and len(msg) == 2:
189
+ msg, img_b64_str = msg
190
+ img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
191
+ msg = msg.strip() + img_str
192
+ ret[-1][-1] = msg
193
+ return ret
194
+
195
+ def copy(self):
196
+ return Conversation(
197
+ system=self.system,
198
+ roles=self.roles,
199
+ messages=[[x, y] for x, y in self.messages],
200
+ offset=self.offset,
201
+ sep_style=self.sep_style,
202
+ sep=self.sep,
203
+ sep2=self.sep2,
204
+ version=self.version)
205
+
206
+ def dict(self):
207
+ if len(self.get_images()) > 0:
208
+ return {
209
+ "system": self.system,
210
+ "roles": self.roles,
211
+ "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
212
+ "offset": self.offset,
213
+ "sep": self.sep,
214
+ "sep2": self.sep2,
215
+ }
216
+ return {
217
+ "system": self.system,
218
+ "roles": self.roles,
219
+ "messages": self.messages,
220
+ "offset": self.offset,
221
+ "sep": self.sep,
222
+ "sep2": self.sep2,
223
+ }
224
+
225
+
226
+ conv_vicuna_v0 = Conversation(
227
+ system="A chat between a curious human and an artificial intelligence assistant. "
228
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
229
+ roles=("Human", "Assistant"),
230
+ messages=(
231
+ ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
232
+ ("Assistant",
233
+ "Renewable energy sources are those that can be replenished naturally in a relatively "
234
+ "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
235
+ "Non-renewable energy sources, on the other hand, are finite and will eventually be "
236
+ "depleted, such as coal, oil, and natural gas. Here are some key differences between "
237
+ "renewable and non-renewable energy sources:\n"
238
+ "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
239
+ "energy sources are finite and will eventually run out.\n"
240
+ "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
241
+ "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
242
+ "and other negative effects.\n"
243
+ "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
244
+ "have lower operational costs than non-renewable sources.\n"
245
+ "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
246
+ "locations than non-renewable sources.\n"
247
+ "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
248
+ "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
249
+ "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
250
+ "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
251
+ ),
252
+ offset=2,
253
+ sep_style=SeparatorStyle.SINGLE,
254
+ sep="###",
255
+ )
256
+
257
+ conv_vicuna_v1 = Conversation(
258
+ system="A chat between a curious user and an artificial intelligence assistant. "
259
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
260
+ roles=("USER", "ASSISTANT"),
261
+ version="v1",
262
+ messages=(),
263
+ offset=0,
264
+ sep_style=SeparatorStyle.TWO,
265
+ sep=" ",
266
+ sep2="</s>",
267
+ )
268
+
269
+ conv_llama_2 = Conversation(
270
+ system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
271
+
272
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
273
+ roles=("USER", "ASSISTANT"),
274
+ version="llama_v2",
275
+ messages=(),
276
+ offset=0,
277
+ sep_style=SeparatorStyle.LLAMA_2,
278
+ sep="<s>",
279
+ sep2="</s>",
280
+ )
281
+
282
+ conv_llava_llama_2 = Conversation(
283
+ system="You are a helpful language and vision assistant. "
284
+ "You are able to understand the visual content that the user provides, "
285
+ "and assist the user with a variety of tasks using natural language.",
286
+ roles=("USER", "ASSISTANT"),
287
+ version="llama_v2",
288
+ messages=(),
289
+ offset=0,
290
+ sep_style=SeparatorStyle.LLAMA_2,
291
+ sep="<s>",
292
+ sep2="</s>",
293
+ )
294
+
295
+ conv_mpt = Conversation(
296
+ system="""<|im_start|>system
297
+ A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
298
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
299
+ version="mpt",
300
+ messages=(),
301
+ offset=0,
302
+ sep_style=SeparatorStyle.MPT,
303
+ sep="<|im_end|>",
304
+ )
305
+
306
+ conv_llava_plain = Conversation(
307
+ system="",
308
+ roles=("", ""),
309
+ messages=(
310
+ ),
311
+ offset=0,
312
+ sep_style=SeparatorStyle.PLAIN,
313
+ sep="\n",
314
+ )
315
+
316
+ conv_llava_v0 = Conversation(
317
+ system="A chat between a curious human and an artificial intelligence assistant. "
318
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
319
+ roles=("Human", "Assistant"),
320
+ messages=(
321
+ ),
322
+ offset=0,
323
+ sep_style=SeparatorStyle.SINGLE,
324
+ sep="###",
325
+ )
326
+
327
+ conv_llava_v0_mmtag = Conversation(
328
+ system="A chat between a curious user and an artificial intelligence assistant. "
329
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
330
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
331
+ roles=("Human", "Assistant"),
332
+ messages=(
333
+ ),
334
+ offset=0,
335
+ sep_style=SeparatorStyle.SINGLE,
336
+ sep="###",
337
+ version="v0_mmtag",
338
+ )
339
+
340
+ conv_llava_v1 = Conversation(
341
+ system="A chat between a curious human and an artificial intelligence assistant. "
342
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
343
+ roles=("USER", "ASSISTANT"),
344
+ version="v1",
345
+ messages=(),
346
+ offset=0,
347
+ sep_style=SeparatorStyle.TWO,
348
+ sep=" ",
349
+ sep2="</s>",
350
+ )
351
+
352
+ conv_vicuna_imgsp_v1 = Conversation(
353
+ system="A chat between a curious user and an artificial intelligence assistant. "
354
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
355
+ roles=("USER", "ASSISTANT"),
356
+ version="imgsp_v1",
357
+ messages=(),
358
+ offset=0,
359
+ sep_style=SeparatorStyle.TWO,
360
+ sep=" ",
361
+ sep2="</s>",
362
+ )
363
+
364
+ conv_llava_plain_guided = Conversation(
365
+ system="",
366
+ roles=("", ""),
367
+ version="plain_guided",
368
+ messages=(
369
+ ),
370
+ offset=0,
371
+ sep_style=SeparatorStyle.PLAIN,
372
+ sep="\n",
373
+ )
374
+
375
+ conv_llava_v1_mmtag = Conversation(
376
+ system="A chat between a curious user and an artificial intelligence assistant. "
377
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
378
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
379
+ roles=("USER", "ASSISTANT"),
380
+ messages=(),
381
+ offset=0,
382
+ sep_style=SeparatorStyle.TWO,
383
+ sep=" ",
384
+ sep2="</s>",
385
+ version="v1_mmtag",
386
+ )
387
+
388
+ conv_phi_2 = Conversation(
389
+ system="A chat between a curious user and an artificial intelligence assistant. "
390
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
391
+ roles=("USER", "ASSISTANT"),
392
+ version="phi2",
393
+ messages=(),
394
+ offset=0,
395
+ sep_style=SeparatorStyle.TWO,
396
+ sep=" ",
397
+ sep2="<|endoftext|>",
398
+ )
399
+
400
+ conv_mistral_instruct = Conversation(
401
+ system="",
402
+ roles=("USER", "ASSISTANT"),
403
+ version="llama_v2",
404
+ messages=(),
405
+ offset=0,
406
+ sep_style=SeparatorStyle.LLAMA_2,
407
+ sep="<s>",
408
+ sep2="</s>",
409
+ )
410
+
411
+ conv_gemma = Conversation(
412
+ system="",
413
+ roles=("user", "model"),
414
+ version="gemma",
415
+ messages=(),
416
+ offset=0,
417
+ sep_style=SeparatorStyle.GEMMA,
418
+ sep="",
419
+ sep2="<eos>",
420
+ )
421
+
422
+ conv_chatml_direct = Conversation(
423
+ system="""<|im_start|>system
424
+ Answer the questions.""",
425
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
426
+ version="mpt",
427
+ messages=(),
428
+ offset=0,
429
+ sep_style=SeparatorStyle.MPT,
430
+ sep="<|im_end|>",
431
+ )
432
+
433
+ default_conversation = conv_vicuna_v1
434
+ conv_templates = {
435
+ "default": conv_vicuna_v0,
436
+ "v0": conv_vicuna_v0,
437
+ "v1": conv_vicuna_v1,
438
+ "vicuna_v1": conv_vicuna_v1,
439
+ "phi_2": conv_phi_2,
440
+ "gemma": conv_gemma,
441
+ "llama_2": conv_llama_2,
442
+ "imgsp_v1": conv_vicuna_imgsp_v1,
443
+ "plain_guided": conv_llava_plain_guided,
444
+ "mistral_instruct": conv_mistral_instruct,
445
+ "chatml_direct": conv_chatml_direct,
446
+ "mistral_direct": conv_chatml_direct,
447
+ "plain": conv_llava_plain,
448
+ "v0_plain": conv_llava_plain,
449
+ "llava_v0": conv_llava_v0,
450
+ "v0_mmtag": conv_llava_v0_mmtag,
451
+ "llava_v1": conv_llava_v1,
452
+ "v1_mmtag": conv_llava_v1_mmtag,
453
+ "llava_llama_2": conv_llava_llama_2,
454
+
455
+ "mpt": conv_mpt,
456
+ }
457
+
458
+
459
+ if __name__ == "__main__":
460
+ print(default_conversation.get_prompt())
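Note: a quick usage sketch for these templates (the question string is made up; the flow matches CustomDataset.__getitem__ in i2t.py below):

# Copy a template, add one user turn and an empty assistant turn, then render the prompt.
from conversation import conv_templates

conv = conv_templates["llava_v1"].copy()
conv.append_message(conv.roles[0], "<image>\nWhat is shown in this photo?")
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
# With SeparatorStyle.TWO this yields roughly:
# "<system prompt> USER: <image>\nWhat is shown in this photo? ASSISTANT:"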
i2t.py ADDED
@@ -0,0 +1,192 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import math
5
+ import torch
6
+ import argparse
7
+ import shortuuid
8
+ from tqdm import tqdm
9
+ from PIL import Image
10
+ from PIL import ImageFile
11
+ from torchvision import transforms
12
+
13
+ from constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
14
+ from conversation import conv_templates, SeparatorStyle
15
+ from model.builder import load_pretrained_model
16
+ from tools import disable_torch_init
17
+ from mm_utils import tokenizer_image_token, get_model_name_from_path
18
+ from torch.utils.data import Dataset, DataLoader
19
+
20
+
21
+ from unitok.config import Args
22
+ from unitok.model import UniTok
23
+
24
+ ImageFile.LOAD_TRUNCATED_IMAGES = False
25
+ torch.set_grad_enabled(False)
26
+
27
+
28
+ def split_list(lst, n):
29
+ """Split a list into n (roughly) equal-sized chunks"""
30
+ chunk_size = math.ceil(len(lst) / n)  # ceiling division
31
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
32
+
33
+
34
+ def get_chunk(lst, n, k):
35
+ chunks = split_list(lst, n)
36
+ return chunks[k]
37
+
38
+
39
+ # Custom dataset class
40
+ class CustomDataset(Dataset):
41
+ def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
42
+ self.questions = questions
43
+ self.image_folder = image_folder
44
+ self.tokenizer = tokenizer
45
+ self.image_processor = image_processor
46
+ self.model_config = model_config
47
+
48
+ def __getitem__(self, index):
49
+ line = self.questions[index]
50
+ image_file = line["image"]
51
+ qs = line["text"]
52
+
53
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
54
+
55
+ conv = conv_templates[args.conv_mode].copy()
56
+ conv.append_message(conv.roles[0], qs)
57
+ conv.append_message(conv.roles[1], None)
58
+ prompt = conv.get_prompt()
59
+ # prompt = prompt.replace('<image>','<boi><image><eoi>')
60
+ # import pdb;pdb.set_trace()
61
+ image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
62
+ # import pdb;pdb.set_trace()
63
+ pad_image = expand2square(image, (122, 116, 104) )
64
+ # import pdb;pdb.set_trace()
65
+ img = self.image_processor[0](pad_image).unsqueeze(0)
66
+ img = img.to('cuda')
67
+ # import pdb;pdb.set_trace()
68
+ with torch.no_grad():
69
+ vq_code = self.image_processor[1].img_to_idx(img)
70
+ vqcode = vq_code.cpu()
71
+
72
+ input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
73
+
74
+
75
+ return input_ids,vqcode,os.path.join(self.image_folder, image_file) #, image_tensor, image_tensor_aux
76
+
77
+ def __len__(self):
78
+ return len(self.questions)
79
+
80
+
81
+ # DataLoader
82
+ def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=0):
83
+ assert batch_size == 1, "batch_size must be 1"
84
+ dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config)
85
+ data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
86
+ return data_loader
87
+
88
+ def expand2square(pil_img, background_color):
89
+ width, height = pil_img.size
90
+ if width == height:
91
+ return pil_img
92
+ elif width > height:
93
+ result = Image.new(pil_img.mode, (width, width), background_color)
94
+ result.paste(pil_img, (0, (width - height) // 2))
95
+ return result
96
+ else:
97
+ result = Image.new(pil_img.mode, (height, height), background_color)
98
+ result.paste(pil_img, ((height - width) // 2, 0))
99
+ return result
100
+
101
+ def eval_model(args):
102
+ # Model
103
+ disable_torch_init()
104
+ model_path = os.path.expanduser(args.model_path)
105
+ model_name = get_model_name_from_path(model_path)
106
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, load_8bit=args.load_8bit)
107
+
108
+ ckpt = torch.load(args.tokenizer_path, map_location='cpu')
109
+ vae_cfg = Args()
110
+ vae_cfg.load_state_dict(ckpt['args'])
111
+ vq_model = UniTok(vae_cfg)
112
+ vq_model.load_state_dict(ckpt['trainer']['unitok'])
113
+ vq_model.to('cuda')
114
+ vq_model.eval()
115
+ del ckpt
116
+
117
+ crop_size = 256
118
+ transform = transforms.Compose([
119
+ transforms.Resize((crop_size, crop_size)),
120
+ transforms.ToTensor(),
121
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
122
+ ])
123
+ image_processor = (transform, vq_model)
124
+
125
+ questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
126
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
127
+ answers_file = os.path.expanduser(args.answers_file)
128
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
129
+ ans_file = open(answers_file, "w")
130
+
131
+ if 'plain' in args.conv_mode and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
132
+ args.conv_mode = args.conv_mode + '_mmtag'
133
+ print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
134
+
135
+ data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config)
136
+
137
+ for (input_ids, image_codes,imagepath), line in tqdm(zip(data_loader, questions), total=len(questions)):
138
+ idx = line["question_id"]
139
+ cur_prompt = line["text"]
140
+
141
+ input_ids = input_ids.to(device=model.device, non_blocking=True)
142
+ image_codes = image_codes.to(device=model.device, non_blocking=True)
143
+ if hasattr(model, "update_prompt"):
144
+ model.update_prompt([[cur_prompt]])
145
+
146
+ with torch.inference_mode():
147
+ output_ids = model.generate_mllm(
148
+ input_ids,
149
+ images=image_codes,
150
+ images_aux= None,
151
+ do_sample=True if args.temperature > 0 else False,
152
+ temperature=args.temperature,
153
+ top_p=args.top_p,
154
+ num_beams=args.num_beams,
155
+ max_new_tokens=args.max_new_tokens,
156
+ bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
157
+ eos_token_id=tokenizer.eos_token_id, # End of sequence token
158
+ pad_token_id=tokenizer.pad_token_id, # Pad token
159
+ use_cache=False
160
+ )
161
+
162
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
163
+ ans_id = shortuuid.uuid()
164
+ ans_file.write(json.dumps({
165
+ "question_id": idx,
166
+ "prompt": cur_prompt,
167
+ "text": outputs,
168
+ "answer_id": ans_id,
169
+ "model_id": model_name,
170
+ "metadata": {}
171
+ }) + "\n")
172
+ ans_file.close()
173
+
174
+ if __name__ == "__main__":
175
+ parser = argparse.ArgumentParser()
176
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
177
+ parser.add_argument("--tokenizer-path", type=str, required=True)
178
+ parser.add_argument("--model-base", type=str, default=None)
179
+ parser.add_argument("--image-folder", type=str, default="")
180
+ parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
181
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
182
+ parser.add_argument("--conv-mode", type=str, default="llava_v1")
183
+ parser.add_argument("--num-chunks", type=int, default=1)
184
+ parser.add_argument("--chunk-idx", type=int, default=0)
185
+ parser.add_argument("--temperature", type=float, default=0.2)
186
+ parser.add_argument("--top_p", type=float, default=None)
187
+ parser.add_argument("--num_beams", type=int, default=1)
188
+ parser.add_argument('--load_8bit', type=bool, default=False)
189
+ parser.add_argument("--max_new_tokens", type=int, default=128)
190
+ args = parser.parse_args()
191
+
192
+ eval_model(args)
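Note: a hypothetical driver equivalent to the command-line entry point above (all paths are placeholders; the fields match the argparse definitions):

import argparse
import i2t

args = argparse.Namespace(
    model_path="/path/to/liquid-mllm",        # placeholder checkpoint directory
    tokenizer_path="/path/to/unitok.pth",     # placeholder UniTok checkpoint
    model_base=None,
    image_folder="/path/to/images",
    question_file="tables/question.jsonl",
    answers_file="answer.jsonl",
    conv_mode="llava_v1",
    num_chunks=1, chunk_idx=0,
    temperature=0.2, top_p=None, num_beams=1,
    load_8bit=False, max_new_tokens=128,
)
i2t.args = args        # CustomDataset.__getitem__ reads the module-level `args`
i2t.eval_model(args)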
mm_utils.py ADDED
@@ -0,0 +1,105 @@
1
+ from PIL import Image
2
+ from io import BytesIO
3
+ import base64
4
+
5
+ import torch
6
+ from transformers import StoppingCriteria
7
+ from constants import IMAGE_TOKEN_INDEX
8
+
9
+
10
+ def load_image_from_base64(image):
11
+ return Image.open(BytesIO(base64.b64decode(image)))
12
+
13
+
14
+ def expand2square(pil_img, background_color):
15
+ width, height = pil_img.size
16
+ if width == height:
17
+ return pil_img
18
+ elif width > height:
19
+ result = Image.new(pil_img.mode, (width, width), background_color)
20
+ result.paste(pil_img, (0, (width - height) // 2))
21
+ return result
22
+ else:
23
+ result = Image.new(pil_img.mode, (height, height), background_color)
24
+ result.paste(pil_img, ((height - width) // 2, 0))
25
+ return result
26
+
27
+
28
+ def process_images(images, image_processor, model_cfg):
29
+ image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
30
+ new_images = []
31
+ if image_aspect_ratio == 'pad':
32
+ for image in images:
33
+ image = expand2square(image.convert('RGB'), tuple(int(x*255) for x in image_processor.image_mean))
34
+ image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
35
+ new_images.append(image)
36
+ else:
37
+ return image_processor(images, return_tensors='pt')['pixel_values']
38
+ if all(x.shape == new_images[0].shape for x in new_images):
39
+ new_images = torch.stack(new_images, dim=0)
40
+ return new_images
41
+
42
+
43
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
44
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
45
+
46
+ def insert_separator(X, sep):
47
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
48
+
49
+ input_ids = []
50
+ offset = 0
51
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
52
+ offset = 1
53
+ input_ids.append(prompt_chunks[0][0])
54
+
55
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
56
+ input_ids.extend(x[offset:])
57
+
58
+ if return_tensors is not None:
59
+ if return_tensors == 'pt':
60
+ return torch.tensor(input_ids, dtype=torch.long)
61
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
62
+ return input_ids
63
+
64
+
65
+ def get_model_name_from_path(model_path):
66
+ model_path = model_path.strip("/")
67
+ model_paths = model_path.split("/")
68
+ if model_paths[-1].startswith('checkpoint-'):
69
+ return model_paths[-2] + "_" + model_paths[-1]
70
+ else:
71
+ return model_paths[-1]
72
+
73
+ class KeywordsStoppingCriteria(StoppingCriteria):
74
+ def __init__(self, keywords, tokenizer, input_ids):
75
+ self.keywords = keywords
76
+ self.keyword_ids = []
77
+ self.max_keyword_len = 0
78
+ for keyword in keywords:
79
+ cur_keyword_ids = tokenizer(keyword).input_ids
80
+ if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
81
+ cur_keyword_ids = cur_keyword_ids[1:]
82
+ if len(cur_keyword_ids) > self.max_keyword_len:
83
+ self.max_keyword_len = len(cur_keyword_ids)
84
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
85
+ self.tokenizer = tokenizer
86
+ self.start_len = input_ids.shape[1]
87
+
88
+ def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
89
+ offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
90
+ self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
91
+ for keyword_id in self.keyword_ids:
92
+ truncated_output_ids = output_ids[0, -keyword_id.shape[0]:]
93
+ if torch.equal(truncated_output_ids, keyword_id):
94
+ return True
95
+ outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
96
+ for keyword in self.keywords:
97
+ if keyword in outputs:
98
+ return True
99
+ return False
100
+
101
+ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
102
+ outputs = []
103
+ for i in range(output_ids.shape[0]):
104
+ outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
105
+ return all(outputs)
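Note: a small sketch of what tokenizer_image_token does with the "<image>" placeholder (the tokenizer checkpoint is an assumption; only IMAGE_TOKEN_INDEX = -200 comes from constants.py):

# The prompt is split on "<image>", each chunk is tokenized, and the chunks are rejoined
# with the sentinel id -200 so the model can splice in image embeddings at that position.
from transformers import AutoTokenizer
from mm_utils import tokenizer_image_token

tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5", use_fast=False)  # placeholder tokenizer
ids = tokenizer_image_token("USER: <image>\nDescribe this image. ASSISTANT:", tokenizer)
assert -200 in ids   # IMAGE_TOKEN_INDEX marks where visual tokens will be inserted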
model/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .language_model.mini_gemini_llama import MiniGeminiLlamaForCausalLM
+ try:
+     from .language_model.mini_gemini_mistral import MiniGeminiMistralForCausalLM
+     from .language_model.mini_gemini_mixtral import MiniGeminiMixtralForCausalLM
+     from .language_model.mini_gemini_gemma import MiniGeminiGemmaForCausalLM
+ except:
+     ImportWarning("New model not imported. Try to update Transformers.")
model/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (714 Bytes).
model/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (585 Bytes).
model/__pycache__/arhead.cpython-311.pyc ADDED
Binary file (11.1 kB).
model/__pycache__/arhead.cpython-39.pyc ADDED
Binary file (5.89 kB).
model/__pycache__/builder.cpython-311.pyc ADDED
Binary file (5.87 kB).
model/__pycache__/builder.cpython-39.pyc ADDED
Binary file (2.91 kB).
model/__pycache__/liquid.cpython-311.pyc ADDED
Binary file (43.8 kB).
model/__pycache__/mini_gemini_arch.cpython-311.pyc ADDED
Binary file (49.9 kB).
model/__pycache__/mini_gemini_arch.cpython-39.pyc ADDED
Binary file (20.5 kB).
model/__pycache__/quant.cpython-311.pyc ADDED
Binary file (38.9 kB).
model/__pycache__/quant.cpython-39.pyc ADDED
Binary file (16.5 kB).
model/arhead.py ADDED
@@ -0,0 +1,241 @@
1
+ import torch
2
+ import torch.utils.checkpoint
3
+ from torch import nn
4
+ from typing import List, Optional, Tuple, Union
5
+
6
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
7
+ from transformers.models.llama.modeling_llama import LlamaRMSNorm, LlamaDecoderLayer
8
+ from transformers.modeling_outputs import BaseModelOutputWithPast
9
+
10
+ class AR_head(nn.Module):
11
+ """
12
+ Lightweight autoregressive head: a stack of 3 [`LlamaDecoderLayer`] blocks used to predict multi-codebook image tokens.
13
+
14
+ Args:
15
+ config: LlamaConfig
16
+ """
17
+
18
+ def __init__(self, config, codebook_size, num_codebooks):
19
+ super().__init__()
20
+ # import pdb;pdb.set_trace()
21
+ self.num_codebooks = num_codebooks
22
+ vocab_size = codebook_size
23
+ self.sub_vocab_size = vocab_size // self.num_codebooks
24
+
25
+ # self.layers = nn.ModuleList(
26
+ # [GemmaDecoderLayer(config, layer_idx) for layer_idx in range(3)]
27
+ # )
28
+ # self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
29
+ self.linear_head = nn.Linear(config.hidden_size, self.sub_vocab_size)
30
+
31
+ self.layers = nn.ModuleList(
32
+ [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(3)]
33
+ )
34
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
35
+ self.gradient_checkpointing = False
36
+
37
+
38
+
39
+
40
+ # vocab_size 16384
41
+ self.codebooks = nn.ModuleList()
42
+ for _ in range(self.num_codebooks-1):
43
+ codebook = nn.Embedding(self.sub_vocab_size, config.hidden_size)
44
+ self.codebooks.append(codebook)
45
+ # import pdb;pdb.set_trace()
46
+ self.config = config
47
+ self.gradient_checkpointing = False
48
+
49
+ # Initialize weights and apply final processing
50
+ self._init_weights(self.layers)
51
+
52
+ def set_input_embeddings(self, value):
53
+ self.embed_tokens = value
54
+
55
+ def _init_weights(self, module):
56
+ std = self.config.initializer_range
57
+ if isinstance(module, nn.Linear):
58
+ module.weight.data.normal_(mean=0.0, std=std)
59
+ if module.bias is not None:
60
+ module.bias.data.zero_()
61
+ elif isinstance(module, nn.Embedding):
62
+ module.weight.data.normal_(mean=0.0, std=std)
63
+ if module.padding_idx is not None:
64
+ module.weight.data[module.padding_idx].zero_()
65
+
66
+ # Ignore copy
67
+ def forward(
68
+ self,
69
+ input_ids: torch.LongTensor = None,
70
+ attention_mask: Optional[torch.Tensor] = None,
71
+ position_ids: Optional[torch.LongTensor] = None,
72
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
73
+ inputs_embeds: Optional[torch.FloatTensor] = None,
74
+ use_cache: Optional[bool] = None,
75
+ output_attentions: Optional[bool] = None,
76
+ output_hidden_states: Optional[bool] = None,
77
+ return_dict: Optional[bool] = None,
78
+ cache_position: Optional[torch.LongTensor] = None,
79
+ ) -> torch.tensor:
80
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
81
+ output_hidden_states = (
82
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
83
+ )
84
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
85
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
86
+
87
+ if (input_ids is None) ^ (inputs_embeds is not None):
88
+ raise ValueError(
89
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
90
+ )
91
+
92
+ if self.gradient_checkpointing and self.training and use_cache:
93
+ logger.warning_once(
94
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
95
+ )
96
+ use_cache = False
97
+
98
+ if inputs_embeds is None:
99
+ inputs_embeds = self.embed_tokens(input_ids)
100
+
101
+ past_seen_tokens = 0
102
+ if use_cache: # kept for BC (cache positions)
103
+ if not isinstance(past_key_values, StaticCache):
104
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
105
+ past_seen_tokens = past_key_values.get_seq_length()
106
+
107
+ if cache_position is None:
108
+ if isinstance(past_key_values, StaticCache):
109
+ raise ValueError("cache_position is a required argument when using StaticCache.")
110
+ cache_position = torch.arange(
111
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
112
+ )
113
+
114
+ if position_ids is None:
115
+ position_ids = cache_position.unsqueeze(0)
116
+
117
+ causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
118
+
119
+ # embed positions
120
+ hidden_states = inputs_embeds
121
+
122
+ # decoder layers
123
+ all_hidden_states = () if output_hidden_states else None
124
+ all_self_attns = () if output_attentions else None
125
+ next_decoder_cache = None
126
+
127
+ for decoder_layer in self.layers:
128
+ if output_hidden_states:
129
+ all_hidden_states += (hidden_states,)
130
+
131
+ if self.gradient_checkpointing and self.training:
132
+ layer_outputs = self._gradient_checkpointing_func(
133
+ decoder_layer.__call__,
134
+ hidden_states,
135
+ causal_mask,
136
+ position_ids,
137
+ past_key_values,
138
+ output_attentions,
139
+ use_cache,
140
+ cache_position,
141
+ )
142
+ else:
143
+ layer_outputs = decoder_layer(
144
+ hidden_states,
145
+ attention_mask=causal_mask,
146
+ position_ids=position_ids,
147
+ past_key_value=past_key_values,
148
+ output_attentions=output_attentions,
149
+ use_cache=use_cache,
150
+ cache_position=cache_position,
151
+ )
152
+
153
+ hidden_states = layer_outputs[0]
154
+
155
+ if use_cache:
156
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
157
+
158
+ if output_attentions:
159
+ all_self_attns += (layer_outputs[1],)
160
+
161
+ hidden_states = self.norm(hidden_states)
162
+
163
+ if output_hidden_states:
164
+ all_hidden_states += (hidden_states,)
165
+
166
+ next_cache = None
167
+ if use_cache:
168
+ next_cache = (
169
+ next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache
170
+ )
171
+ if not return_dict:
172
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
173
+ return BaseModelOutputWithPast(
174
+ last_hidden_state=hidden_states,
175
+ past_key_values=next_cache,
176
+ hidden_states=all_hidden_states,
177
+ attentions=all_self_attns,
178
+ )
179
+
180
+ # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
181
+ # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
182
+ # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
183
+ # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
184
+ def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
185
+ if self.config._attn_implementation == "flash_attention_2":
186
+ if attention_mask is not None and 0.0 in attention_mask:
187
+ return attention_mask
188
+ return None
189
+
190
+ dtype, device = input_tensor.dtype, input_tensor.device
191
+ min_dtype = torch.finfo(dtype).min
192
+ sequence_length = input_tensor.shape[1]
193
+ if hasattr(self.layers[0].self_attn, "past_key_value"): # static cache
194
+ target_length = self.config.max_position_embeddings
195
+ else: # dynamic cache
196
+ target_length = (
197
+ attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1
198
+ )
199
+
200
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
201
+ if sequence_length != 1:
202
+ causal_mask = torch.triu(causal_mask, diagonal=1)
203
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
204
+ causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
205
+ if attention_mask is not None:
206
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
207
+ if attention_mask.dim() == 2:
208
+ mask_length = attention_mask.shape[-1]
209
+ padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
210
+ causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
211
+ elif attention_mask.dim() == 4:
212
+ # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with
213
+ # cache. In that case, the 4D attention mask attends to the newest tokens only.
214
+ if attention_mask.shape[-2] < cache_position[0] + sequence_length:
215
+ offset = cache_position[0]
216
+ else:
217
+ offset = 0
218
+ mask_shape = attention_mask.shape
219
+ mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
220
+ causal_mask[
221
+ : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
222
+ ] = mask_slice
223
+
224
+ if (
225
+ self.config._attn_implementation == "sdpa"
226
+ and attention_mask is not None
227
+ and attention_mask.device.type == "cuda"
228
+ ):
229
+ # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400).
230
+ is_tracing = (
231
+ torch.jit.is_tracing()
232
+ or isinstance(input_tensor, torch.fx.Proxy)
233
+ or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling())
234
+ )
235
+ if not is_tracing and torch.any(attention_mask != 1):
236
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
237
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
238
+ # Details: https://github.com/pytorch/pytorch/issues/110213
239
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
240
+
241
+ return causal_mask
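Note: AR_head splits the image-token vocabulary into num_codebooks sub-vocabularies of codebook_size // num_codebooks entries each. A minimal instantiation sketch (the LlamaConfig values are illustrative; the 32768/8 split matches the call in mini_gemini_llama.py below):

from transformers import LlamaConfig
from model.arhead import AR_head

cfg = LlamaConfig(hidden_size=4096, num_hidden_layers=32, num_attention_heads=32)
head = AR_head(cfg, codebook_size=32768, num_codebooks=8)
print(head.sub_vocab_size)   # 32768 // 8 = 4096 entries per sub-codebook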
model/builder.py ADDED
@@ -0,0 +1,138 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Yanwei Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ import os
20
+ import torch
21
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
22
+
23
+ from model import *
24
+ from constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
25
+
26
+ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
27
+ kwargs = {"device_map": device_map, **kwargs}
28
+
29
+ if device != "cuda":
30
+ kwargs['device_map'] = {"": device}
31
+
32
+ if load_8bit:
33
+ kwargs['load_in_8bit'] = True
34
+ elif load_4bit:
35
+ kwargs['load_in_4bit'] = True
36
+ kwargs['quantization_config'] = BitsAndBytesConfig(
37
+ load_in_4bit=True,
38
+ bnb_4bit_compute_dtype=torch.float16,
39
+ bnb_4bit_use_double_quant=True,
40
+ bnb_4bit_quant_type='nf4'
41
+ )
42
+ else:
43
+ kwargs['torch_dtype'] = torch.float16
44
+
45
+ if use_flash_attn:
46
+ kwargs['attn_implementation'] = 'flash_attention_2'
47
+ # import pdb;pdb.set_trace()
48
+ if 'mini-gemini' in model_name.lower():
49
+ # Load MiniGemini model
50
+ if model_base is not None:
51
+ # this may be mm projector only
52
+ print('Loading MiniGemini from base model...')
53
+
54
+ if "8x7b" in model_name.lower():
55
+ tokenizer = AutoTokenizer.from_pretrained(model_base)
56
+ model = MiniGeminiMixtralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
57
+ elif "2b" in model_name.lower():
58
+ tokenizer = AutoTokenizer.from_pretrained(model_base)
59
+ model = MiniGeminiGemmaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
60
+ else:
61
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
62
+ model = MiniGeminiLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
63
+ mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
64
+ mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
65
+ model.load_state_dict(mm_projector_weights, strict=False)
66
+ else:
67
+ if "8x7b" in model_name.lower():
68
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
69
+ model = MiniGeminiMixtralForCausalLM.from_pretrained(model_path, **kwargs)
70
+ elif "2b" in model_name.lower():
71
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
72
+ model = MiniGeminiGemmaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
73
+ else:
74
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
75
+ model = MiniGeminiLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
76
+
77
+ if 'gemma' in model_name.lower():
78
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
79
+ model = MiniGeminiGemmaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
80
+ elif 'vicuna' in model_name.lower() or 'unitok' in model_name.lower():
81
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
82
+ model = MiniGeminiLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
83
+ else:
84
+ # Load language model
85
+ if model_base is not None:
86
+ # PEFT model
87
+ from peft import PeftModel
88
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
89
+ model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
90
+ print(f"Loading LoRA weights from {model_path}")
91
+ model = PeftModel.from_pretrained(model, model_path)
92
+ print(f"Merging weights")
93
+ model = model.merge_and_unload()
94
+ print('Convert to FP16...')
95
+ model.to(torch.float16)
96
+ else:
97
+ if 'mpt' in model_name.lower():
98
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
99
+ model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
100
+ else:
101
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
102
+ model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
103
+
104
+ image_processor = None
105
+ # import pdb;pdb.set_trace()
106
+
107
+
108
+ # mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
109
+ # mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
110
+ # if mm_use_im_patch_token:
111
+ # tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
112
+ # if mm_use_im_start_end:
113
+ # tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
114
+
115
+ # model.resize_token_embeddings(len(tokenizer))
116
+
117
+ # vision_tower = model.get_vision_tower()
118
+ # if not vision_tower.is_loaded:
119
+ # vision_tower.load_model()
120
+ # vision_tower.to(device=device, dtype=torch.float16)
121
+ # image_processor = vision_tower.image_processor
122
+
123
+ if 'mini-gemini' in model_name.lower():
124
+ vision_tower_aux = model.get_vision_tower_aux()
125
+ if not vision_tower_aux.is_loaded:
126
+ vision_tower_aux.load_model()
127
+ vision_tower_aux.to(device=device, dtype=torch.float16)
128
+
129
+ # initialize attention modules
130
+ model.config.model_path = model_path
131
+ model.get_model().initialize_uni_modules(model.config, for_eval=True)
132
+
133
+ if hasattr(model.config, "max_sequence_length"):
134
+ context_len = model.config.max_sequence_length
135
+ else:
136
+ context_len = 2048
137
+
138
+ return tokenizer, model, image_processor, context_len
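Note: as used in i2t.py above, loading looks roughly like this (the checkpoint path is a placeholder; a 'unitok' or 'vicuna' model name routes to MiniGeminiLlamaForCausalLM):

from model.builder import load_pretrained_model
from mm_utils import get_model_name_from_path

model_path = "/path/to/Liquid-unitok-7b"       # placeholder
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path, model_base=None, model_name=model_name, load_8bit=False)
# image_processor is returned as None here; i2t.py builds its own (transform, UniTok) pair.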
model/language_model/__pycache__/mini_gemini_llama.cpython-311.pyc ADDED
Binary file (18 kB).
model/language_model/__pycache__/mini_gemini_llama.cpython-39.pyc ADDED
Binary file (7.73 kB).
model/language_model/mini_gemini_llama.py ADDED
@@ -0,0 +1,488 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Yanwei Li
17
+ # ------------------------------------------------------------------------
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ from torch.nn import CrossEntropyLoss
23
+ from typing import List, Optional, Tuple, Union
24
+
25
+ from transformers.utils import logging
26
+ from transformers.generation.utils import GenerateOutput
27
+ from transformers.modeling_outputs import CausalLMOutputWithPast
28
+ from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM
29
+
30
+ from model.arhead import AR_head
31
+ from model.liquid import MiniGeminiMetaModel, MiniGeminiMetaForCausalLM
32
+
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+
37
+ class MiniGeminiConfig(LlamaConfig):
38
+ model_type = "mini_gemini"
39
+
40
+
41
+ class MiniGeminiLlamaModel(MiniGeminiMetaModel, LlamaModel):
42
+ config_class = MiniGeminiConfig
43
+
44
+ def __init__(self, config: LlamaConfig):
45
+ super(MiniGeminiLlamaModel, self).__init__(config)
46
+
47
+
48
+ class MiniGeminiLlamaForCausalLM(LlamaForCausalLM, MiniGeminiMetaForCausalLM):
49
+ config_class = MiniGeminiConfig
50
+
51
+ def __init__(self, config):
52
+ super(LlamaForCausalLM, self).__init__(config)
53
+ self.model = MiniGeminiLlamaModel(config)
54
+ self.pretraining_tp = config.pretraining_tp
55
+ self.vocab_size = config.vocab_size
56
+
57
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
58
+ self.ar_head = AR_head(self.config, codebook_size=32768, num_codebooks=8)
59
+
60
+ # Initialize weights and apply final processing
61
+ self.post_init()
62
+
63
+ def get_model(self):
64
+ return self.model
65
+
66
+ def forward(
67
+ self,
68
+ input_ids: torch.LongTensor = None,
69
+ attention_mask: Optional[torch.Tensor] = None,
70
+ position_ids: Optional[torch.LongTensor] = None,
71
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
72
+ inputs_embeds: Optional[torch.FloatTensor] = None,
73
+ labels: Optional[torch.LongTensor] = None,
74
+ data_types: torch.LongTensor = None,
75
+ use_cache: Optional[bool] = None,
76
+ cache_position: Optional[torch.LongTensor] = None,
77
+ output_attentions: Optional[bool] = None,
78
+ output_hidden_states: Optional[bool] = None,
79
+ images: Optional[torch.FloatTensor] = None,
80
+ images_aux: Optional[torch.FloatTensor] = None,
81
+ return_dict: Optional[bool] = None,
82
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
83
+
84
+
85
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
86
+ output_hidden_states = (
87
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
88
+ )
89
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
90
+
91
+ additional_image_indexs = None
92
+ if inputs_embeds is None and past_key_values is None: # skipped during inference, where inputs_embeds is precomputed
93
+ (
94
+ input_ids,
95
+ position_ids,
96
+ attention_mask,
97
+ past_key_values,
98
+ inputs_embeds,
99
+ labels,
100
+ data_types,
101
+ additional_image_labels,
102
+ additional_image_indexs
103
+ ) = self.prepare_inputs_labels_for_multimodal(
104
+ input_ids,
105
+ position_ids,
106
+ attention_mask,
107
+ past_key_values,
108
+ labels,
109
+ images,
110
+ images_aux,
111
+ data_types
112
+ )
113
+
114
+ outputs = self.model(
115
+ input_ids=input_ids,
116
+ attention_mask=attention_mask,
117
+ position_ids=position_ids,
118
+ past_key_values=past_key_values,
119
+ inputs_embeds=inputs_embeds,
120
+ use_cache=use_cache,
121
+ output_attentions=output_attentions,
122
+ output_hidden_states=output_hidden_states,
123
+ return_dict=return_dict,
124
+ )
125
+
126
+ hidden_states = outputs[0]
127
+
128
+ if self.pretraining_tp > 1:
129
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0)
130
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)]
131
+ logits = torch.cat(logits, dim=-1)
132
+ else:
133
+ logits = self.lm_head(hidden_states)
134
+ logits = logits.float()
135
+
136
+ text_loss = None
137
+ if labels is not None:
138
+ # Shift so that tokens < n predict n
139
+ shift_logits = logits[..., :-1, :].contiguous()
140
+ shift_labels = labels[..., 1:].contiguous()
141
+ # Flatten the tokens
142
+ loss_fct = CrossEntropyLoss()
143
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
144
+ shift_labels = shift_labels.view(-1)
145
+ # Enable model parallelism
146
+ shift_labels = shift_labels.to(shift_logits.device)
147
+ text_loss = loss_fct(shift_logits, shift_labels)
148
+ num_text_tokens = (shift_labels != -100).sum().item()
149
+
150
+ if additional_image_indexs is None:
151
+ return CausalLMOutputWithPast(
152
+ loss=text_loss,
153
+ logits=logits,
154
+ past_key_values=outputs.past_key_values,
155
+ hidden_states=outputs.hidden_states,
156
+ attentions=outputs.attentions,
157
+ )
158
+
159
+ to_image_mask = data_types == 1 # which samples in the batch take the t2i loss, e.g. [True, False, False, True, ...]
160
+
161
+ if len(additional_image_indexs) > 0 and len(to_image_mask) == len(hidden_states): # image generation loss
162
+ to_image_states = hidden_states[to_image_mask]
163
+
164
+ # assert len(to_image_states) == len(additional_image_indexs)
165
+ if len(to_image_states) != len(additional_image_indexs):
166
+ print('to_image_mask', to_image_mask)
167
+ print('additional_image_indexs', additional_image_indexs)
168
+ shift_image_states = torch.stack([state[start_id - 1:end_id - 1] for (start_id, end_id), state in
169
+ zip(additional_image_indexs, to_image_states)]) # Shift so that tokens < n predict n [bz, seq_len, hidden_dim]
170
+ base_tokens = shift_image_states
171
+
172
+ K = self.ar_head.num_codebooks
173
+ B, L, C = base_tokens.shape
174
+ base_tokens = base_tokens.reshape(B * L, 1, C)
175
+
176
+ targets = torch.cat(additional_image_labels, dim=0) # [B, K, L]
177
+ image_code_labels = targets
178
+ targets = targets.permute(0, 2, 1).reshape(B * L, K)[:, :-1]
179
+ index_embeddings = []
180
+ for i in range(K - 1):
181
+ index_embed = self.ar_head.codebooks[i](targets[:, i])
182
+ index_embeddings.append(index_embed)
183
+ index_embeddings = torch.stack(index_embeddings, dim=1)
184
+ # import pdb;pdb.set_trace()
185
+ h = torch.cat((base_tokens, index_embeddings), dim=1) # [B*L, K, C]
186
+
187
+ multicode_embedding = self.ar_head(
188
+ input_ids=None,
189
+ attention_mask=None,
190
+ position_ids=None,
191
+ past_key_values=None,
192
+ inputs_embeds=h,
193
+ use_cache=False,
194
+ output_attentions=False,
195
+ output_hidden_states=False,
196
+ return_dict=False,
197
+ cache_position=None,
198
+ )
199
+ image_logits = self.ar_head.linear_head(multicode_embedding)
200
+ image_logits = image_logits.reshape(B, L, K, -1).permute(0, 2, 1, 3) # [B, K, L, sub_vocab_size]
201
+ loss_fct = CrossEntropyLoss()
202
+ image_logits = image_logits.reshape(-1, self.ar_head.sub_vocab_size)
203
+ image_labels = image_code_labels.view(-1)
204
+ image_labels = image_labels.to(image_logits.device)
205
+ image_softmax_normalizer = image_logits.max(-1).values ** 2
206
+ image_z_loss = 0.00005 * image_softmax_normalizer.mean()
207
+ image_loss = loss_fct(image_logits, image_labels) + image_z_loss
208
+ num_image_tokens = image_labels.shape[0]
209
+ else:
210
+ if len(hidden_states) != len(to_image_mask):
211
+ print('to_image_mask', to_image_mask)
212
+ print('hidden_states', hidden_states.shape)
213
+ print('inputs_embeds', inputs_embeds.shape)
214
+ print('additional_image_indexs', additional_image_indexs)
215
+ fake_ids = torch.ones(1, self.model.multi_embedder.num_codebooks - 1).to(inputs_embeds).long()
216
+ index_embeddings = []
217
+ for i in range(self.model.multi_embedder.num_codebooks - 1):
218
+ index_embed = self.ar_head.codebooks[i](fake_ids[:, i])
219
+ index_embeddings.append(index_embed)
220
+ index_embeddings = torch.stack(index_embeddings, dim=1)
221
+
222
+ multicode_embedding = self.ar_head(
223
+ input_ids=None,
224
+ attention_mask=None,
225
+ position_ids=None,
226
+ past_key_values=None,
227
+ inputs_embeds=index_embeddings,
228
+ use_cache=False,
229
+ output_attentions=False,
230
+ output_hidden_states=False,
231
+ return_dict=False,
232
+ cache_position=None,
233
+ )
234
+ image_logits = self.ar_head.linear_head(multicode_embedding)
235
+
236
+ num_image_tokens = 0
237
+ image_loss = (image_logits * 0).sum() # + (base_tokens*0).sum()
238
+ pass
239
+
240
+ loss = image_loss * (num_image_tokens / (num_image_tokens + num_text_tokens)) + \
241
+ text_loss * (num_text_tokens / (num_image_tokens + num_text_tokens))
242
+
243
+ # t2i_ratio = to_image_mask.sum() / len(to_image_mask)
244
+ # loss = image_loss * t2i_ratio + text_loss * (1 - t2i_ratio)
245
+
246
+ if not return_dict:
247
+ output = (logits,) + outputs[1:]
248
+ return (loss,) + output if loss is not None else output
249
+
250
+ return CausalLMOutputWithPast(
251
+ loss=loss,
252
+ logits=logits,
253
+ past_key_values=outputs.past_key_values,
254
+ hidden_states=outputs.hidden_states,
255
+ attentions=outputs.attentions,
256
+ )
257
+
258
+ @torch.no_grad()
259
+ def generate_mllm(
260
+ self,
261
+ inputs: Optional[torch.Tensor] = None,
262
+ images: Optional[torch.Tensor] = None,
263
+ images_aux: Optional[torch.FloatTensor] = None,
264
+ **kwargs,
265
+ ) -> Union[GenerateOutput, torch.LongTensor]:
266
+ position_ids = kwargs.pop("position_ids", None)
267
+ attention_mask = kwargs.pop("attention_mask", None)
268
+ if "inputs_embeds" in kwargs:
269
+ raise NotImplementedError("`inputs_embeds` is not supported")
270
+ # import pdb;pdb.set_trace()
271
+ if images is not None:
272
+ (
273
+ inputs,
274
+ position_ids,
275
+ attention_mask,
276
+ _,
277
+ inputs_embeds,
278
+ _
279
+ ) = self.prepare_inputs_for_multimodal(
280
+ inputs,
281
+ position_ids,
282
+ attention_mask,
283
+ None,
284
+ None,
285
+ images,
286
+ images_aux
287
+ )
288
+ else:
289
+ inputs_embeds = self.get_model().embed_tokens(inputs)
290
+ # import pdb;pdb.set_trace()
291
+ return super().generate(
292
+ position_ids=position_ids,
293
+ attention_mask=attention_mask,
294
+ inputs_embeds=inputs_embeds,
295
+ **kwargs
296
+ )
297
+
298
+ @torch.no_grad()
299
+ def generate(
300
+ self,
301
+ inputs: Optional[torch.Tensor] = None,
302
+ images: Optional[torch.Tensor] = None,
303
+ images_aux: Optional[torch.FloatTensor] = None,
304
+ **kwargs,
305
+ ) -> Union[GenerateOutput, torch.LongTensor]:
306
+ position_ids = kwargs.pop("position_ids", None)
307
+ attention_mask = kwargs.pop("attention_mask", None)
308
+ if "inputs_embeds" in kwargs:
309
+ raise NotImplementedError("`inputs_embeds` is not supported")
310
+
311
+ if images is not None:
312
+ (
313
+ inputs,
314
+ position_ids,
315
+ attention_mask,
316
+ _,
317
+ inputs_embeds,
318
+ _
319
+ ) = self.prepare_inputs_for_multimodal(
320
+ inputs,
321
+ position_ids,
322
+ attention_mask,
323
+ None,
324
+ None,
325
+ images,
326
+ images_aux
327
+ )
328
+ else:
329
+ inputs_embeds = self.get_model().embed_tokens(inputs)
330
+
331
+ return super().generate(
332
+ position_ids=position_ids,
333
+ attention_mask=attention_mask,
334
+ inputs_embeds=inputs_embeds,
335
+ **kwargs
336
+ )
337
+
338
+ def test_forward(
339
+ self,
340
+ input_ids: torch.LongTensor = None,
341
+ attention_mask: Optional[torch.Tensor] = None,
342
+ position_ids: Optional[torch.LongTensor] = None,
343
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
344
+ inputs_embeds: Optional[torch.FloatTensor] = None,
345
+ labels: Optional[torch.LongTensor] = None,
346
+ input_multi_ids: torch.LongTensor = None,
347
+ data_types: torch.LongTensor = None,
348
+ use_cache: Optional[bool] = None,
349
+ cache_position: Optional[torch.LongTensor] = None,
350
+ output_attentions: Optional[bool] = None,
351
+ output_hidden_states: Optional[bool] = None,
352
+ images: Optional[torch.FloatTensor] = None,
353
+ images_aux: Optional[torch.FloatTensor] = None,
354
+ return_dict: Optional[bool] = None,
355
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
356
+ # import pdb;pdb.set_trace()
357
+ if input_multi_ids is not None:
358
+ input_multi_ids = input_multi_ids.unsqueeze(-1) # [B,K,1]
359
+ input_ids = None # [B,1]
360
+ inputs_embeds = self.model.multi_embedder(input_multi_ids) # [B,1,C]
361
+
362
+ outputs = self.model(
363
+ input_ids=input_ids,
364
+ attention_mask=attention_mask,
365
+ position_ids=position_ids,
366
+ past_key_values=past_key_values,
367
+ inputs_embeds=inputs_embeds,
368
+ use_cache=use_cache,
369
+ output_attentions=output_attentions,
370
+ output_hidden_states=output_hidden_states,
371
+ return_dict=return_dict,
372
+ )
373
+ return outputs
374
+
375
+ def T2I_forward_nocache(
376
+ self,
377
+ input_ids: torch.LongTensor = None,
378
+ attention_mask: Optional[torch.Tensor] = None,
379
+ position_ids: Optional[torch.LongTensor] = None,
380
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
381
+ inputs_embeds: Optional[torch.FloatTensor] = None,
382
+ labels: Optional[torch.LongTensor] = None,
383
+ input_multi_ids: torch.LongTensor = None,
384
+ data_types: torch.LongTensor = None,
385
+ use_cache: Optional[bool] = None,
386
+ cache_position: Optional[torch.LongTensor] = None,
387
+ output_attentions: Optional[bool] = None,
388
+ output_hidden_states: Optional[bool] = None,
389
+ images: Optional[torch.FloatTensor] = None,
390
+ images_aux: Optional[torch.FloatTensor] = None,
391
+ return_dict: Optional[bool] = None,
392
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
393
+ # import pdb;pdb.set_trace()
394
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
395
+ output_hidden_states = (
396
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
397
+ )
398
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
399
+
400
+ if input_multi_ids is not None:
401
+ inputs_text_embeds = self.get_model().embed_tokens(input_ids)
402
+ input_ids = None # [B,1]
403
+ inputs_image_embeds = self.model.multi_embedder(input_multi_ids) # [B,1,C]
404
+ inputs_image_mask = torch.empty(inputs_image_embeds.shape[0], inputs_image_embeds.shape[1]).fill_(1).to(
405
+ attention_mask)
406
+ inputs_embeds = torch.cat([inputs_text_embeds, inputs_image_embeds], dim=1)
407
+ attention_mask = torch.cat([attention_mask, inputs_image_mask], dim=1)
408
+ position_ids = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0).repeat(
409
+ inputs_embeds.shape[0], 1)
410
+ else:
411
+ inputs_embeds = self.get_model().embed_tokens(input_ids)
412
+ input_ids = None
413
+
414
+ outputs = self.model(
415
+ input_ids=input_ids,
416
+ attention_mask=attention_mask,
417
+ position_ids=position_ids,
418
+ past_key_values=past_key_values,
419
+ inputs_embeds=inputs_embeds,
420
+ use_cache=use_cache,
421
+ output_attentions=output_attentions,
422
+ output_hidden_states=output_hidden_states,
423
+ return_dict=return_dict,
424
+ )
425
+ return outputs
426
+
427
+ def T2I_forward_withcache(
428
+ self,
429
+ input_ids: torch.LongTensor = None,
430
+ attention_mask: Optional[torch.Tensor] = None,
431
+ position_ids: Optional[torch.LongTensor] = None,
432
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
433
+ inputs_embeds: Optional[torch.FloatTensor] = None,
434
+ labels: Optional[torch.LongTensor] = None,
435
+ input_multi_ids: torch.LongTensor = None,
436
+ data_types: torch.LongTensor = None,
437
+ use_cache: Optional[bool] = None,
438
+ cache_position: Optional[torch.LongTensor] = None,
439
+ output_attentions: Optional[bool] = None,
440
+ output_hidden_states: Optional[bool] = None,
441
+ images: Optional[torch.FloatTensor] = None,
442
+ images_aux: Optional[torch.FloatTensor] = None,
443
+ return_dict: Optional[bool] = None,
444
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
445
+ # import pdb;pdb.set_trace()
446
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
447
+ output_hidden_states = (
448
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
449
+ )
450
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
451
+
452
+ if input_multi_ids is not None:
453
+ inputs_image_embeds = self.model.multi_embedder(input_multi_ids[:, :, -1:]) # [B,1,C]
454
+ inputs_embeds = inputs_image_embeds
455
+ input_ids = None # [B,1]
456
+ else:
457
+ inputs_embeds = self.get_model().embed_tokens(input_ids)
458
+ input_ids = None
459
+
460
+ outputs = self.model(
461
+ input_ids=input_ids,
462
+ attention_mask=attention_mask,
463
+ position_ids=position_ids,
464
+ past_key_values=past_key_values,
465
+ inputs_embeds=inputs_embeds,
466
+ use_cache=use_cache,
467
+ output_attentions=output_attentions,
468
+ output_hidden_states=output_hidden_states,
469
+ return_dict=return_dict,
470
+ )
471
+ return outputs
472
+
473
+
474
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
475
+ images = kwargs.pop("images", None)
476
+ images_aux = kwargs.pop("images_aux", None)
477
+ _inputs = super().prepare_inputs_for_generation(
478
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
479
+ )
480
+ if images is not None:
481
+ _inputs['images'] = images
482
+ if images_aux is not None:
483
+ _inputs['images_aux'] = images_aux
484
+ return _inputs
485
+
486
+
487
+ AutoConfig.register("mini_gemini", MiniGeminiConfig)
488
+ AutoModelForCausalLM.register(MiniGeminiConfig, MiniGeminiLlamaForCausalLM)
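The two register() calls above hook the model into the Transformers Auto classes. A minimal sketch of what that enables, assuming a converted checkpoint directory whose config.json declares model_type "mini_gemini" (the path below is a placeholder):
from transformers import AutoConfig, AutoModelForCausalLM
import model.language_model.mini_gemini_llama  # executes the register() calls above

config = AutoConfig.from_pretrained("path/to/mini_gemini-checkpoint")  # placeholder path
assert config.model_type == "mini_gemini"
llm = AutoModelForCausalLM.from_pretrained("path/to/mini_gemini-checkpoint")
# llm is a MiniGeminiLlamaForCausalLM: a Llama backbone with lm_head for text tokens
# plus the 8-codebook AR_head used for image-token prediction.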
model/liquid.py ADDED
@@ -0,0 +1,669 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ------------------------------------------------------------------------
15
+ # Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
16
+ # Copyright 2024 Yanwei Li
17
+ # ------------------------------------------------------------------------
18
+ # Modified from MiniGemini (https://github.com/dvlab-research/MGM)
19
+ # Copyright 2025 ByteDance
20
+ # ------------------------------------------------------------------------
21
+
22
+ import os
23
+ import json
24
+ import torch
25
+ import deepspeed
26
+ import safetensors
27
+ import transformers
28
+ import torch.nn as nn
29
+ import torch.nn.functional as F
30
+ from abc import ABC, abstractmethod
31
+ from transformers.deepspeed import is_deepspeed_zero3_enabled
32
+
33
+ from model.quant import VectorQuantizerM, AttnProjection
34
+ from model.multimodal_projector.builder import build_vision_projector
35
+ from model.multimodal_encoder.builder import build_vision_tower, build_vision_tower_aux
36
+ from constants import (
37
+ DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN,
38
+ IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN
39
+ )
40
+
41
+
42
+ IS_NEW_TRANSFORMERS = tuple(int(v) for v in transformers.__version__.split('.')[:2]) >= (4, 34)  # numeric compare instead of string compare
43
+
44
+
45
+ class MiniGeminiMetaModel:
46
+ def __init__(self, config):
47
+ super(MiniGeminiMetaModel, self).__init__(config)
48
+ self.config = config
49
+ self.multi_embedder = TokenEmbedder(self.config.hidden_size)
50
+ if hasattr(config, "mm_vision_tower"):
51
+ self.vision_tower = build_vision_tower(config, delay_load=True)
52
+ self.mm_projector = build_vision_projector(config)
53
+ if hasattr(config, "mm_vision_tower_aux"):
54
+ self.vision_tower_aux = build_vision_tower_aux(config, delay_load=True)
55
+
56
+ def get_vision_tower(self):
57
+ vision_tower = getattr(self, 'vision_tower', None)
58
+ if type(vision_tower) is list:
59
+ vision_tower = vision_tower[0]
60
+ return vision_tower
61
+
62
+ def get_vision_tower_aux(self):
63
+ vision_tower_aux = getattr(self, 'vision_tower_aux', None)
64
+ if type(vision_tower_aux) is list:
65
+ vision_tower_aux = vision_tower_aux[0]
66
+ return vision_tower_aux
67
+
68
+ def initialize_embedder(self, unitok_pth, mm_projecter_pth=None):
69
+ self.multi_embedder = TokenEmbedder(self.config.hidden_size)
70
+
71
+ if unitok_pth is not None:
72
+ ckpt = torch.load(unitok_pth, map_location='cpu')
73
+ unitok_ckpt = ckpt['trainer']['unitok']
74
+ quantizer_weights = dict()
75
+ for k, v in unitok_ckpt.items():
76
+ if k.startswith('quantizer'):
77
+ new_k = k.replace('quantizer.', '')
78
+ quantizer_weights[new_k] = v
79
+ attn_proj_weights = dict()
80
+ for k, v in unitok_ckpt.items():
81
+ if k.startswith('post_quant_proj'):
82
+ new_k = k.replace('post_quant_proj.', '')
83
+ attn_proj_weights[new_k] = v
84
+
85
+ if is_deepspeed_zero3_enabled():
86
+ with deepspeed.zero.GatheredParameters(quantizer_weights, modifier_rank=0):
87
+ if torch.distributed.get_rank() == 0:
88
+ self.multi_embedder.quantizer.load_state_dict(quantizer_weights)
89
+ with deepspeed.zero.GatheredParameters(attn_proj_weights, modifier_rank=0):
90
+ if torch.distributed.get_rank() == 0:
91
+ self.multi_embedder.attn_projection.load_state_dict(attn_proj_weights)
92
+ else:
93
+ status = self.multi_embedder.quantizer.load_state_dict(quantizer_weights)
94
+ print('missing_keys:', status.missing_keys)
95
+ status = self.multi_embedder.attn_projection.load_state_dict(attn_proj_weights)
96
+ print('missing_keys:', status.missing_keys)
97
+
98
+ if mm_projecter_pth is not None:
99
+ mm_projector_weights = torch.load(mm_projecter_pth, map_location='cpu')
100
+
101
+ def get_w(weights, keyword):
102
+ return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword + '.' in k}
103
+
104
+ named_parameters = get_w(mm_projector_weights, 'mm_projector')
105
+
106
+ if is_deepspeed_zero3_enabled():
107
+ with deepspeed.zero.GatheredParameters(named_parameters, modifier_rank=0):
108
+ if torch.distributed.get_rank() == 0:
109
+ self.multi_embedder.mm_projector.load_state_dict(named_parameters)
110
+ else:
111
+ status = self.multi_embedder.mm_projector.load_state_dict(named_parameters)
112
+ print('missing_keys:', status.missing_keys)
113
+
114
+ self.multi_embedder = self.multi_embedder.to(device='cuda')
115
+
116
+ def initialize_vision_modules(self, model_args, fsdp=None):
117
+ vision_tower = model_args.vision_tower
118
+ vision_tower_aux = model_args.vision_tower_aux
119
+ mm_vision_select_layer = model_args.mm_vision_select_layer
120
+ mm_vision_select_feature = model_args.mm_vision_select_feature
121
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
122
+
123
+ self.config.mm_vision_tower = vision_tower
124
+ self.config.mm_vision_tower_aux = vision_tower_aux
125
+
126
+ if self.get_vision_tower() is None:
127
+ vision_tower = build_vision_tower(model_args)
128
+
129
+ if fsdp is not None and len(fsdp) > 0:
130
+ self.vision_tower = [vision_tower]
131
+ else:
132
+ self.vision_tower = vision_tower
133
+ else:
134
+ if fsdp is not None and len(fsdp) > 0:
135
+ vision_tower = self.vision_tower[0]
136
+ else:
137
+ vision_tower = self.vision_tower
138
+ vision_tower.load_model()
139
+
140
+ if vision_tower_aux is not None:
141
+ if self.get_vision_tower_aux() is None:
142
+ vision_tower_aux = build_vision_tower_aux(model_args)
143
+
144
+ if fsdp is not None and len(fsdp) > 0:
145
+ self.vision_tower_aux = [vision_tower_aux]
146
+ else:
147
+ self.vision_tower_aux = vision_tower_aux
148
+ else:
149
+ if fsdp is not None and len(fsdp) > 0:
150
+ vision_tower_aux = self.vision_tower_aux[0]
151
+ else:
152
+ vision_tower_aux = self.vision_tower_aux
153
+ vision_tower_aux.load_model()
154
+ self.config.mm_hidden_size_aux = vision_tower_aux.hidden_size
155
+
156
+ self.config.use_mm_proj = True
157
+ self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
158
+ self.config.mm_hidden_size = vision_tower.hidden_size
159
+ self.config.mm_vision_select_layer = mm_vision_select_layer
160
+ self.config.mm_vision_select_feature = mm_vision_select_feature
161
+
162
+ if getattr(self, 'mm_projector', None) is None:
163
+ self.mm_projector = build_vision_projector(self.config)
164
+ else:
165
+ # In case it is frozen by LoRA
166
+ for p in self.mm_projector.parameters():
167
+ p.requires_grad = True
168
+
169
+ if pretrain_mm_mlp_adapter is not None:
170
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
171
+
172
+ def get_w(weights, keyword):
173
+ return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword + '.' in k}
174
+
175
+ if 'model' in mm_projector_weights.keys():
176
+ mm_projector_weights = mm_projector_weights['model']
177
+ if is_deepspeed_zero3_enabled():
178
+ if len(mm_projector_weights) > 0:
179
+ with deepspeed.zero.GatheredParameters(mm_projector_weights, modifier_rank=0):
180
+ if torch.distributed.get_rank() == 0:
181
+ self.mm_projector.load_state_dict(mm_projector_weights)
182
+ else:
183
+ status = self.mm_projector.load_state_dict(mm_projector_weights, strict=False)
184
+ print('missing_keys:', status.missing_keys)
185
+ else:
186
+ if is_deepspeed_zero3_enabled():
187
+ named_parameters = get_w(mm_projector_weights, 'mm_projector')
188
+ if len(named_parameters) > 0:
189
+ with deepspeed.zero.GatheredParameters(named_parameters, modifier_rank=0):
190
+ if torch.distributed.get_rank() == 0:
191
+ self.mm_projector.load_state_dict(named_parameters)
192
+ else:
193
+ status = self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'),
194
+ strict=False)
195
+ print('missing_keys:', status.missing_keys)
196
+ self.mm_projector = self.mm_projector.to(device='cuda')
197
+
198
+ def initialize_uni_modules(self, model_args, for_eval=False):
199
+ pretrain_mm_mlp_adapter = getattr(model_args, "pretrain_mm_mlp_adapter", None)
200
+ self.config.image_size_aux = getattr(model_args, 'image_size_aux', 320)
201
+ self.config.optimize_vision_tower = getattr(model_args, 'optimize_vision_tower', False)
202
+ self.config.optimize_vision_tower_aux = getattr(model_args, 'optimize_vision_tower_aux', False)
203
+
204
+ self.vlm_uni_query_projector = nn.Sequential(nn.LayerNorm(self.config.mm_hidden_size),
205
+ nn.Linear(self.config.mm_hidden_size, self.config.mm_hidden_size))
206
+ self.vlm_uni_aux_projector = nn.Sequential(nn.LayerNorm(self.config.mm_hidden_size_aux),
207
+ nn.Linear(self.config.mm_hidden_size_aux,
208
+ self.config.mm_hidden_size))
209
+ self.vlm_uni_val_projector = nn.Sequential(nn.LayerNorm(self.config.mm_hidden_size_aux),
210
+ nn.Linear(self.config.mm_hidden_size_aux,
211
+ self.config.mm_hidden_size))
212
+
213
+ if pretrain_mm_mlp_adapter is not None:
214
+ projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
215
+ else:
216
+ trainable_module = ['vlm_uni', 'vision_fpn', 'vision_stages']
217
+ if hasattr(model_args, 'model_name_or_path'):
218
+ model_save_path = model_args.model_name_or_path
219
+ else:
220
+ model_save_path = model_args.model_path
221
+ model_idx_path = getattr(model_args, 'model_path', model_save_path)
222
+ if IS_NEW_TRANSFORMERS:
223
+ try:
224
+ weight_file = json.load(open(os.path.join(model_idx_path, 'model.safetensors.index.json'), 'r'))[
225
+ 'weight_map']
226
+ except:
227
+ weight_file = json.load(open(os.path.join(model_idx_path, 'pytorch_model.bin.index.json'), 'r'))[
228
+ 'weight_map']
229
+ else:
230
+ weight_file = json.load(open(os.path.join(model_idx_path, 'pytorch_model.bin.index.json'), 'r'))[
231
+ 'weight_map']
232
+ model_path = set(
233
+ [weight_file[_key] for _key in weight_file if any([_module in _key for _module in trainable_module])])
234
+ projector_weights = {}
235
+ for _model in model_path:
236
+ if not IS_NEW_TRANSFORMERS:
237
+ projector_weights.update(torch.load(os.path.join(model_idx_path, _model), map_location='cpu'))
238
+ else:
239
+ with safetensors.safe_open(os.path.join(model_idx_path, _model), framework="pt", device='cpu') as f:
240
+ for _key in f.keys():
241
+ projector_weights.update({_key: f.get_tensor(_key)})
242
+ if len(projector_weights) == 0:
243
+ return
244
+
245
+ def get_w(weights, keyword, main_module, sub_module):
246
+ if getattr(main_module, sub_module, None) is None:
247
+ return
248
+
249
+ pretrain_weight = {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword + '.' in k}
250
+ if len(pretrain_weight) == 0:
251
+ return
252
+ if is_deepspeed_zero3_enabled():
253
+ named_parameters = [v for k, v in getattr(main_module, sub_module).named_parameters()]
254
+ if len(named_parameters) > 0:
255
+ # because zero3 puts placeholders in model params, this context
256
+ # manager gathers (unpartitions) the params of the current layer, then loads from
257
+ # the state dict and then re-partitions them again
258
+ with deepspeed.zero.GatheredParameters(named_parameters, modifier_rank=0):
259
+ if torch.distributed.get_rank() == 0:
260
+ getattr(main_module, sub_module).load_state_dict(pretrain_weight)
261
+ with deepspeed.zero.GatheredParameters(self.mm_projector[0].weight, modifier_rank=None):
262
+ weight_type = self.mm_projector[0].weight.dtype
263
+ device_type = self.mm_projector[0].weight.device
264
+ else:
265
+ weight_type = self.mm_projector[0].weight.dtype
266
+ device_type = self.mm_projector[0].weight.device
267
+ getattr(main_module, sub_module).load_state_dict(pretrain_weight)
268
+ if weight_type == torch.uint8 or weight_type == torch.int8 or weight_type == torch.int16:
269
+ weight_type = torch.float16
270
+ getattr(main_module, sub_module).to(device=device_type, dtype=weight_type)
271
+ print(f"Loading {sub_module} weights...")
272
+
273
+ # load pretrained weights
274
+ get_w(projector_weights, 'vision_tower.vision_tower', self.vision_tower, 'vision_tower')
275
+
276
+ # load pretrained weights
277
+ if self.config.optimize_vision_tower_aux:
278
+ # the vision stem is not optimized; it is loaded only for checking
279
+ get_w(projector_weights, 'vision_tower_aux.vision_stem', self.vision_tower_aux, 'vision_stem')
280
+ get_w(projector_weights, 'vision_tower_aux.vision_stages', self.vision_tower_aux, 'vision_stages')
281
+ get_w(projector_weights, 'vlm_uni_query_projector', self, 'vlm_uni_query_projector')
282
+ get_w(projector_weights, 'vlm_uni_aux_projector', self, 'vlm_uni_aux_projector')
283
+ get_w(projector_weights, 'vlm_uni_val_projector', self, 'vlm_uni_val_projector')
284
+
285
+
286
+ class TokenEmbedder(nn.Module):
287
+ def __init__(self, hidden_size):
288
+ super().__init__()
289
+ # hard-coded for UniTok; needs to be made configurable
290
+ self.num_codebooks = 8
291
+ self.quantizer = VectorQuantizerM(32768, 64, 0.25, False, 0.01, 8)
292
+ self.attn_projection = AttnProjection(64, 1024, 16)
293
+ self.mm_projector = nn.Sequential(
294
+ nn.LayerNorm(1024, eps=1e-6),
295
+ nn.Linear(1024, hidden_size),
296
+ nn.GELU(),
297
+ nn.Linear(hidden_size, hidden_size),
298
+ )
299
+
300
+ def forward(self, indices): # input [bz,num-codebook,256]
301
+ assert indices.shape[1] == self.num_codebooks
302
+ features = self.quantizer.idx_to_f(indices) # [bz,256,C]
303
+ features = self.attn_projection(features) # [bz,256,1024]
304
+ latent_features = self.mm_projector(features) # [bz,256,hidden_size]
305
+ return latent_features # [bz,256,hidden_size]
306
+
307
+
308
+ class MiniGeminiMetaForCausalLM(ABC):
309
+ @abstractmethod
310
+ def get_model(self):
311
+ pass
312
+
313
+ def get_vision_tower(self):
314
+ return self.get_model().get_vision_tower()
315
+
316
+ def get_vision_tower_aux(self):
317
+ return self.get_model().get_vision_tower_aux()
318
+
319
+ def encode_images(self, images, images_aux=None, is_video=False):
320
+ image_grid = getattr(self.config, 'image_grid', 1)
321
+ image_global = getattr(self.config, 'image_global', False)
322
+ if image_grid > 1:
323
+ batch_size = images.shape[0]
324
+ if image_global:
325
+ global_images = images[:, -1:].flatten(0, 1).contiguous()
326
+ grid_images = images[:, :-1].flatten(0, 1).contiguous()
327
+ images = torch.cat([grid_images, global_images], dim=0)
328
+ else:
329
+ images = images.flatten(0, 1).contiguous()
330
+
331
+ image_features = self.get_model().get_vision_tower()(images)
332
+
333
+ if image_global:
334
+ image_feat_global = image_features[-len(global_images):]
335
+ image_features = image_features[:len(grid_images)]
336
+
337
+ if images_aux is not None:
338
+ image_aux_features_raw = self.get_model().get_vision_tower_aux()(images_aux).to(
339
+ dtype=image_features.dtype, device=image_features.device)
340
+
341
+ if image_global:
342
+ image_aux_features_global = F.interpolate(image_aux_features_raw.float(),
343
+ scale_factor=1 / image_grid,
344
+ mode='bilinear',
345
+ align_corners=False).to(dtype=image_aux_features_raw.dtype)
346
+ image_feat_global, image_aux_feat_global = self.unified_resampler(image_feat_global,
347
+ image_aux_features_global)
348
+
349
+ if image_grid > 1:
350
+ image_aux_features_raw = image_aux_features_raw.reshape(*image_aux_features_raw.shape[:2],
351
+ image_grid,
352
+ image_aux_features_raw.shape[-2] // image_grid,
353
+ image_grid,
354
+ image_aux_features_raw.shape[-1] // image_grid)
355
+ image_aux_features_raw = image_aux_features_raw.permute(0, 2, 4, 1, 3, 5).flatten(1, 2).flatten(0,
356
+ 1).contiguous()
357
+ image_features, image_aux_features = self.unified_resampler(image_features, image_aux_features_raw)
358
+
359
+ if image_grid > 1:
360
+ image_features = image_features.reshape(batch_size, image_grid ** 2, *image_features.shape[1:])
361
+ image_features = image_features.flatten(1, 2).contiguous()
362
+ image_aux_features = image_aux_features.reshape(batch_size, image_grid ** 2,
363
+ *image_aux_features.shape[1:])
364
+ image_aux_features = image_aux_features.flatten(1, 2).contiguous()
365
+
366
+ # add global features, [global, local]
367
+ if image_global:
368
+ image_features = torch.cat([image_feat_global, image_features], dim=1)
369
+ image_aux_features = torch.cat([image_aux_feat_global, image_aux_features], dim=1)
370
+
371
+ # token generation
372
+ image_features = image_features + image_aux_features
373
+
374
+ # process image features after token generation
375
+ image_features = self.get_model().mm_projector(image_features)
376
+
377
+ return image_features
378
+
379
+ def unified_resampler(self, images, images_aux):
380
+ # patchwise with square images
381
+ patch_num = int(images.shape[1] ** 0.5)
382
+ patch_size = images_aux.shape[-1] // patch_num
383
+ # within patch attention
384
+ images_aux = images_aux.permute(0, 2, 3, 1)
385
+ images_aux = images_aux.reshape(len(images_aux), patch_num, patch_size, patch_num, patch_size,
386
+ images_aux.shape[-1])
387
+ images_aux = images_aux.permute(0, 1, 3, 2, 4, 5)
388
+ images_aux = images_aux.reshape(len(images_aux), patch_num ** 2, patch_size ** 2,
389
+ images_aux.shape[-1]).contiguous()
390
+
391
+ # token attention
392
+ embed_query = self.get_model().vlm_uni_query_projector(images)
393
+ embed_aux = self.get_model().vlm_uni_aux_projector(images_aux)
394
+ embed_value = self.get_model().vlm_uni_val_projector(images_aux)
395
+ embed_att = embed_query[:, :, None] @ (embed_aux.transpose(-1, -2) / (embed_aux.shape[-1] ** 0.5))
396
+ embed_att = embed_att.nan_to_num()
397
+ embed_feat = (embed_att.softmax(-1) @ embed_value).mean(2)
398
+
399
+ return images, embed_feat
400
+
401
+ def prepare_inputs_labels_for_multimodal(
402
+ self, input_ids, position_ids, attention_mask, past_key_values, labels, images=None, images_aux=None,
403
+ data_types=None,
404
+ ):
405
+ vision_tower = self.get_vision_tower()
406
+ multi_embedder = self.model.multi_embedder
407
+ # import pdb;pdb.set_trace()
408
+ if vision_tower is None or images is None or input_ids.shape[1] == 1:
409
+ if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[
410
+ 1] == 1:
411
+ target_shape = past_key_values[-1][-1].shape[-2] + 1
412
+ attention_mask = torch.cat((attention_mask, torch.ones(
413
+ (attention_mask.shape[0], target_shape - attention_mask.shape[1]),
414
+ dtype=attention_mask.dtype,
415
+ device=attention_mask.device
416
+ )), dim=1)
417
+ position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
418
+
419
+ if position_ids is None:
420
+ position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
421
+ bug_flag = False
422
+ if images is not None:
423
+ _labels = labels
424
+ _position_ids = position_ids
425
+ _attention_mask = attention_mask
426
+ new_input_embeds = []
427
+ new_labels = []
428
+ additional_image_labels = []
429
+ additional_image_indexs = []
430
+ if attention_mask is not None:
431
+ input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in
432
+ zip(input_ids, attention_mask)]
433
+ labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in
434
+ zip(labels, attention_mask)]
435
+ # import pdb;pdb.set_trace()
436
+ for image, cur_input_ids, cur_labels, data_type in zip(images, input_ids, labels, data_types):
437
+ # import pdb;pdb.set_trace()
438
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
439
+ # import pdb;pdb.set_trace()
440
+ if num_images == 0:
441
+ # import pdb;pdb.set_trace()
442
+ empty_image_embed = multi_embedder(
443
+ torch.zeros(1, self.model.multi_embedder.num_codebooks, 1).long().to(cur_input_ids))[0, :0]
444
+ new_input_embeds.append(
445
+ torch.cat([self.get_model().embed_tokens(cur_input_ids), empty_image_embed], dim=0))
446
+ new_labels.append(cur_labels)
447
+ continue # pure text data
448
+ assert len(image.shape) == 3 # [bz,num-codebook,256] image token id
449
+ if len(image) > num_images:
450
+ image = image[:num_images] # remove images that were cut off by truncation
451
+ image_embedding = multi_embedder(image) # get image embeddings
452
+
453
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [
454
+ cur_input_ids.shape[0]]
455
+ cur_input_ids_noim = []
456
+ cur_labels_noim = []
457
+ for i in range(len(image_token_indices) - 1):
458
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1:image_token_indices[i + 1]])
459
+ cur_labels_noim.append(cur_labels[image_token_indices[i] + 1:image_token_indices[i + 1]])
460
+ split_sizes = [x.shape[0] for x in cur_labels_noim]
461
+ cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
462
+ cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
463
+ cur_new_input_embeds = []
464
+ cur_new_labels = []
465
+ # import pdb;pdb.set_trace()
466
+ max_pos_id = 0
467
+ for i in range(num_images + 1):
468
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
469
+ cur_new_labels.append(cur_labels_noim[i])
470
+ # import pdb;pdb.set_trace()
471
+ max_pos_id += cur_input_embeds_no_im[i].shape[0]
472
+ if i < num_images:
473
+ cur_image_features = image_embedding[i]
474
+ cur_new_input_embeds.append(cur_image_features)
475
+
476
+ if data_type == 1: # to Image, loss on 4x image tokens
477
+ additional_image_labels.append(image)
478
+ additional_image_indexs.append((cur_new_labels[-1].shape[0],
479
+ cur_new_labels[-1].shape[0] + cur_image_features.shape[
480
+ 0]))
481
+ ### input: describe xxxx: boi 8*[256] (256 embedding) eoi eos
482
+ ### labels: -100 -100 -100 -100 -100 -100 -100 -100 -100 eoi eos
483
+ cur_new_labels.append(
484
+ torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device,
485
+ dtype=cur_labels.dtype))
486
+ max_pos_id += cur_image_features.shape[0]
487
+
488
+ cur_new_input_embeds = [x.to(device=cur_input_embeds.device) for x in cur_new_input_embeds]
489
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
490
+ cur_new_labels = torch.cat(cur_new_labels)
491
+
492
+ new_input_embeds.append(cur_new_input_embeds)
493
+ new_labels.append(cur_new_labels)
494
+
495
+ tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
496
+
497
+ if tokenizer_model_max_length is not None:
498
+ new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
499
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
500
+
501
+ # Combine them
502
+ max_len = max(x.shape[0] for x in new_input_embeds)
503
+ batch_size = len(new_input_embeds)
504
+ assert len(new_labels) == len(data_types)
505
+ new_input_embeds_padded = []
506
+ new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype,
507
+ device=new_labels[0].device)
508
+ attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype,
509
+ device=attention_mask.device)
510
+ position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
511
+ # import pdb;pdb.set_trace()
512
+ for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
513
+ cur_len = cur_new_embed.shape[0]
514
+ if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
515
+ new_input_embeds_padded.append(torch.cat((
516
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype,
517
+ device=cur_new_embed.device),
518
+ cur_new_embed
519
+ ), dim=0))
520
+ if cur_len > 0:
521
+ new_labels_padded[i, -cur_len:] = cur_new_labels
522
+ attention_mask[i, -cur_len:] = True
523
+ position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype,
524
+ device=position_ids.device)
525
+ else:
526
+ new_input_embeds_padded.append(torch.cat((
527
+ cur_new_embed,
528
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype,
529
+ device=cur_new_embed.device)
530
+ ), dim=0))
531
+ if cur_len > 0:
532
+ new_labels_padded[i, :cur_len] = cur_new_labels
533
+ attention_mask[i, :cur_len] = True
534
+ position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype,
535
+ device=position_ids.device)
536
+
537
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
538
+
539
+ if _labels is None:
540
+ new_labels = None
541
+ else:
542
+ new_labels = new_labels_padded
543
+
544
+ if _attention_mask is None:
545
+ attention_mask = None
546
+ else:
547
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
548
+
549
+ if _position_ids is None:
550
+ position_ids = None
551
+ # import pdb;pdb.set_trace()
552
+ return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels, data_types, additional_image_labels, additional_image_indexs
553
+
554
+ return input_ids, position_ids, attention_mask, past_key_values, None, labels, data_types, None, None # keep the 9-tuple shape unpacked in forward()
555
+
556
+ def prepare_inputs_for_multimodal(
557
+ self, input_ids, position_ids, attention_mask,
558
+ past_key_values, labels, images=None, images_aux=None, data_types=None,
559
+ ):
560
+ multi_embedder = self.model.multi_embedder
561
+ # import pdb;pdb.set_trace()
562
+ _labels = labels
563
+ _position_ids = position_ids
564
+ _attention_mask = attention_mask
565
+ if images is not None:
566
+ new_input_embeds = []
567
+ for image, cur_input_ids in zip(images, input_ids):
568
+ # import pdb;pdb.set_trace()
569
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
570
+ if num_images == 0:
571
+ new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
572
+ continue # pure text data
573
+ image_embedding = multi_embedder(image)
574
+ # import pdb;pdb.set_trace()
575
+
576
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [
577
+ cur_input_ids.shape[0]]
578
+ cur_input_ids_noim = []
579
+ for i in range(len(image_token_indices) - 1):
580
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1:image_token_indices[i + 1]])
581
+ split_sizes = [x.shape[0] for x in cur_input_ids_noim]
582
+ cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
583
+ cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
584
+ cur_new_input_embeds = []
585
+ # import pdb;pdb.set_trace()
586
+ max_pos_id = 0
587
+ for i in range(num_images + 1):
588
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
589
+ # import pdb;pdb.set_trace()
590
+ max_pos_id += cur_input_embeds_no_im[i].shape[0]
591
+ if i < num_images:
592
+ cur_image_features = image_embedding[i]
593
+ cur_new_input_embeds.append(cur_image_features)
594
+ max_pos_id += cur_image_features.shape[0]
595
+
596
+ cur_new_input_embeds = [x.to(device=cur_input_embeds.device) for x in cur_new_input_embeds]
597
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
598
+ new_input_embeds.append(cur_new_input_embeds)
599
+
600
+ tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
601
+ if tokenizer_model_max_length is not None:
602
+ new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
603
+ # import pdb;pdb.set_trace()
604
+ # Combine them
605
+ max_len = max(x.shape[0] for x in new_input_embeds)
606
+ batch_size = len(new_input_embeds)
607
+ new_input_embeds_padded = []
608
+
609
+ new_input_embeds = torch.stack(new_input_embeds, dim=0)
610
+ # import pdb;pdb.set_trace()
611
+ if _labels is None:
612
+ new_labels = None
613
+ else:
614
+ new_labels = new_labels_padded
615
+
616
+ if _attention_mask is None:
617
+ attention_mask = None
618
+ else:
619
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
620
+
621
+ if _position_ids is None:
622
+ position_ids = None
623
+ # import pdb;pdb.set_trace()
624
+ return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
625
+
626
+ def initialize_vision_tokenizer(self, model_args, tokenizer):
627
+ if model_args.mm_use_im_patch_token:
628
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
629
+ self.resize_token_embeddings(len(tokenizer))
630
+
631
+ if model_args.mm_use_im_start_end:
632
+ num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
633
+ self.resize_token_embeddings(len(tokenizer))
634
+
635
+ if num_new_tokens > 0:
636
+ input_embeddings = self.get_input_embeddings().weight.data
637
+ output_embeddings = self.get_output_embeddings().weight.data
638
+
639
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
640
+ dim=0, keepdim=True)
641
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
642
+ dim=0, keepdim=True)
643
+
644
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
645
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
646
+
647
+ if model_args.tune_mm_mlp_adapter:
648
+ for p in self.get_input_embeddings().parameters():
649
+ p.requires_grad = True
650
+ for p in self.get_output_embeddings().parameters():
651
+ p.requires_grad = False
652
+
653
+ if model_args.pretrain_mm_mlp_adapter:
654
+ mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
655
+ embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
656
+ assert num_new_tokens == 2
657
+ if input_embeddings.shape == embed_tokens_weight.shape:
658
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
659
+ elif embed_tokens_weight.shape[0] == num_new_tokens:
660
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight
661
+ else:
662
+ raise ValueError(
663
+ f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
664
+ elif model_args.mm_use_im_patch_token:
665
+ if model_args.tune_mm_mlp_adapter:
666
+ for p in self.get_input_embeddings().parameters():
667
+ p.requires_grad = False
668
+ for p in self.get_output_embeddings().parameters():
669
+ p.requires_grad = False
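A shape-level sketch of the TokenEmbedder contract defined above; it relies on VectorQuantizerM and AttnProjection from model/quant.py, hidden_size=4096 is an assumed LLM width, and with randomly initialized quantizer weights the embeddings are meaningless until initialize_embedder loads the UniTok checkpoint.
import torch
from model.liquid import TokenEmbedder

embedder = TokenEmbedder(hidden_size=4096)        # assumed hidden size
image_ids = torch.randint(0, 32768, (2, 8, 256))  # [batch, num_codebooks, image tokens]
embeds = embedder(image_ids)                      # [2, 256, 4096], matching the comments above
print(embeds.shape)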
model/multimodal_encoder/__pycache__/builder.cpython-311.pyc ADDED
Binary file (2.35 kB).
 
model/multimodal_encoder/__pycache__/builder.cpython-39.pyc ADDED
Binary file (1.27 kB).
 
model/multimodal_encoder/__pycache__/clip_encoder.cpython-311.pyc ADDED
Binary file (5.92 kB).
 
model/multimodal_encoder/__pycache__/clip_encoder.cpython-39.pyc ADDED
Binary file (3.37 kB).
 
model/multimodal_encoder/__pycache__/eva_encoder.cpython-311.pyc ADDED
Binary file (34.2 kB).
 
model/multimodal_encoder/__pycache__/eva_encoder.cpython-39.pyc ADDED
Binary file (17.3 kB).
 
model/multimodal_encoder/__pycache__/openclip_encoder.cpython-311.pyc ADDED
Binary file (12.3 kB).
 
model/multimodal_encoder/__pycache__/openclip_encoder.cpython-39.pyc ADDED
Binary file (6.52 kB).
 
model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,33 @@
1
+ import os
2
+ from .clip_encoder import CLIPVisionTower
3
+ from .eva_encoder import EVAVisionTower
4
+ from .openclip_encoder import OpenCLIPVisionTower
5
+
6
+
7
+ def build_vision_tower(vision_tower_cfg, **kwargs):
8
+ vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
9
+ image_processor = getattr(vision_tower_cfg, 'image_processor', "../processor/clip-patch14-224")
10
+
11
+ if not os.path.exists(vision_tower):
12
+ raise ValueError(f'Cannot find vision tower: {vision_tower}')
13
+
14
+ if "openai" in vision_tower.lower() or "ShareGPT4V" in vision_tower:
15
+ return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
16
+ elif "lavis" in vision_tower.lower() or "eva" in vision_tower.lower():
17
+ return EVAVisionTower(vision_tower, image_processor, args=vision_tower_cfg, **kwargs)
18
+ else:
19
+ raise ValueError(f'Unknown vision tower: {vision_tower}')
20
+
21
+
22
+ def build_vision_tower_aux(vision_tower_cfg, **kwargs):
23
+ vision_tower_aux = getattr(vision_tower_cfg, 'mm_vision_tower_aux', getattr(vision_tower_cfg, 'vision_tower_aux', None))
24
+
25
+ if not os.path.exists(vision_tower_aux):
26
+ raise ValueError(f'Cannot find vision tower: {vision_tower_aux}')
27
+
28
+ if "openclip" in vision_tower_aux.lower():
29
+ return OpenCLIPVisionTower(vision_tower_aux, args=vision_tower_cfg, **kwargs)
30
+ elif "openai" in vision_tower_aux.lower():
31
+ return CLIPVisionTower(vision_tower_aux, args=vision_tower_cfg, **kwargs)
32
+ else:
33
+ raise ValueError(f'Unknown vision tower: {vision_tower_aux}')
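Both builders only read attributes off the config object and require the tower path to exist on disk, so a plain namespace is enough for a quick smoke test; the local path below is a placeholder that must point to a CLIP checkpoint directory whose name contains "openai".
from types import SimpleNamespace
from model.multimodal_encoder.builder import build_vision_tower

cfg = SimpleNamespace(
    mm_vision_tower="checkpoints/openai-clip-vit-large-patch14-336",  # placeholder local path
    image_processor="../processor/clip-patch14-224",
    mm_vision_select_layer=-2,
    mm_vision_select_feature="patch",
)
tower = build_vision_tower(cfg, delay_load=True)  # defers weight loading; only the config is read from disk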
model/multimodal_encoder/clip_encoder.py ADDED
@@ -0,0 +1,89 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
5
+ from ..processor.video_processor import VideoFramesProcessor
6
+
7
+ class CLIPVisionTower(nn.Module):
8
+ def __init__(self, vision_tower, args, delay_load=False):
9
+ super().__init__()
10
+
11
+ self.is_loaded = False
12
+
13
+ self.vision_tower_name = vision_tower
14
+ self.select_layer = args.mm_vision_select_layer
15
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
16
+ self.is_optimize = getattr(args, 'optimize_vision_tower', False)
17
+
18
+ if not delay_load:
19
+ self.load_model()
20
+ elif getattr(args, 'unfreeze_mm_vision_tower', False):
21
+ self.load_model()
22
+ else:
23
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
24
+
25
+ def load_model(self):
26
+ self.image_processor = VideoFramesProcessor.from_pretrained(self.vision_tower_name)
27
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
28
+ self.vision_tower.requires_grad_(False)
29
+
30
+ self.is_loaded = True
31
+
32
+ def feature_select(self, image_forward_outs):
33
+ image_features = image_forward_outs.hidden_states[self.select_layer]
34
+ if self.select_feature == 'patch':
35
+ image_features = image_features[:, 1:]
36
+ elif self.select_feature == 'cls_patch':
37
+ image_features = image_features
38
+ else:
39
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
40
+ return image_features
41
+
42
+ def image_forward(self, images):
43
+ if type(images) is list:
44
+ image_features = []
45
+ for image in images:
46
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
47
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
48
+ image_features.append(image_feature)
49
+ else:
50
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
51
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
52
+
53
+ return image_features
54
+
55
+ def forward(self, images):
56
+ if not self.is_optimize:
57
+ with torch.no_grad():
58
+ image_features = self.image_forward(images)
59
+ else:
60
+ image_features = self.image_forward(images)
61
+
62
+ return image_features
63
+
64
+ @property
65
+ def dummy_feature(self):
66
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
67
+
68
+ @property
69
+ def dtype(self):
70
+ return self.vision_tower.dtype
71
+
72
+ @property
73
+ def device(self):
74
+ return self.vision_tower.device
75
+
76
+ @property
77
+ def config(self):
78
+ if self.is_loaded:
79
+ return self.vision_tower.config
80
+ else:
81
+ return self.cfg_only
82
+
83
+ @property
84
+ def hidden_size(self):
85
+ return self.config.hidden_size
86
+
87
+ @property
88
+ def num_patches(self):
89
+ return (self.config.image_size // self.config.patch_size) ** 2
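A sketch of the feature-selection contract above: with mm_vision_select_feature='patch' the CLS token is dropped, so a 336-pixel, patch-14 CLIP tower yields 24 x 24 = 576 patch embeddings per image. The checkpoint id below is an example, not pinned by this repository, and the random pixels stand in for properly preprocessed images.
import torch
from types import SimpleNamespace
from model.multimodal_encoder.clip_encoder import CLIPVisionTower

args = SimpleNamespace(mm_vision_select_layer=-2, mm_vision_select_feature="patch")
tower = CLIPVisionTower("openai/clip-vit-large-patch14-336", args=args)  # example checkpoint id
pixels = torch.randn(1, 3, 336, 336)
feats = tower(pixels)                  # runs under no_grad since optimize_vision_tower is unset
print(feats.shape, tower.num_patches)  # expected: torch.Size([1, 576, 1024]) 576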
model/multimodal_encoder/eva_encoder.py ADDED
@@ -0,0 +1,551 @@
1
+ # Based on EVA, BEIT, timm and DeiT code bases
2
+ # https://github.com/baaivision/EVA
3
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm
4
+ # https://github.com/microsoft/unilm/tree/master/beit
5
+ # https://github.com/facebookresearch/deit/
6
+ # https://github.com/facebookresearch/dino
7
+ # --------------------------------------------------------'
8
+ import math
9
+ from functools import partial
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ import torch.utils.checkpoint as checkpoint
15
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
16
+ from timm.models.registry import register_model
17
+ from transformers import CLIPImageProcessor, CLIPVisionConfig
18
+ from ..processor.video_processor import VideoFramesProcessor
19
+
20
+ def _cfg(url='', **kwargs):
21
+ return {
22
+ 'url': url,
23
+ 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
24
+ 'crop_pct': .9, 'interpolation': 'bicubic',
25
+ 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
26
+ **kwargs
27
+ }
28
+
29
+ class DropPath(nn.Module):
30
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
31
+ """
32
+ def __init__(self, drop_prob=None):
33
+ super(DropPath, self).__init__()
34
+ self.drop_prob = drop_prob
35
+
36
+ def forward(self, x):
37
+ return drop_path(x, self.drop_prob, self.training)
38
+
39
+ def extra_repr(self) -> str:
40
+ return 'p={}'.format(self.drop_prob)
41
+
42
+
43
+ class Mlp(nn.Module):
44
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
45
+ super().__init__()
46
+ out_features = out_features or in_features
47
+ hidden_features = hidden_features or in_features
48
+ self.fc1 = nn.Linear(in_features, hidden_features)
49
+ self.act = act_layer()
50
+ self.fc2 = nn.Linear(hidden_features, out_features)
51
+ self.drop = nn.Dropout(drop)
52
+
53
+ def forward(self, x):
54
+ x = self.fc1(x)
55
+ x = self.act(x)
56
+ # x = self.drop(x)
57
+ # commented out to follow the original BERT implementation
58
+ x = self.fc2(x)
59
+ x = self.drop(x)
60
+ return x
61
+
62
+
63
+ class Attention(nn.Module):
64
+ def __init__(
65
+ self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
66
+ proj_drop=0., window_size=None, attn_head_dim=None):
67
+ super().__init__()
68
+ self.num_heads = num_heads
69
+ head_dim = dim // num_heads
70
+ if attn_head_dim is not None:
71
+ head_dim = attn_head_dim
72
+ all_head_dim = head_dim * self.num_heads
73
+ self.scale = qk_scale or head_dim ** -0.5
74
+
75
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
76
+ if qkv_bias:
77
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
78
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
79
+ else:
80
+ self.q_bias = None
81
+ self.v_bias = None
82
+
83
+ if window_size:
84
+ self.window_size = window_size
85
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
86
+ self.relative_position_bias_table = nn.Parameter(
87
+ torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH
88
+ # cls to token & token 2 cls & cls to cls
89
+
90
+ # get pair-wise relative position index for each token inside the window
91
+ coords_h = torch.arange(window_size[0])
92
+ coords_w = torch.arange(window_size[1])
93
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
94
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
95
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
96
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
97
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
98
+ relative_coords[:, :, 1] += window_size[1] - 1
99
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
100
+ relative_position_index = \
101
+ torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
102
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
103
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
104
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
105
+ relative_position_index[0, 0] = self.num_relative_distance - 1
106
+
107
+ self.register_buffer("relative_position_index", relative_position_index)
108
+ else:
109
+ self.window_size = None
110
+ self.relative_position_bias_table = None
111
+ self.relative_position_index = None
112
+
113
+ self.attn_drop = nn.Dropout(attn_drop)
114
+ self.proj = nn.Linear(all_head_dim, dim)
115
+ self.proj_drop = nn.Dropout(proj_drop)
116
+
117
+ def forward(self, x, rel_pos_bias=None):
118
+ B, N, C = x.shape
119
+ qkv_bias = None
120
+ if self.q_bias is not None:
121
+ qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
122
+ # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
123
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
124
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
125
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
126
+
127
+ q = q * self.scale
128
+ attn = (q @ k.transpose(-2, -1))
129
+
130
+ if self.relative_position_bias_table is not None:
131
+ relative_position_bias = \
132
+ self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
133
+ self.window_size[0] * self.window_size[1] + 1,
134
+ self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
135
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
136
+ attn = attn + relative_position_bias.unsqueeze(0)
137
+
138
+ if rel_pos_bias is not None:
139
+ attn = attn + rel_pos_bias
140
+
141
+ attn = attn.softmax(dim=-1)
142
+ attn = self.attn_drop(attn)
143
+
144
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
145
+ x = self.proj(x)
146
+ x = self.proj_drop(x)
147
+ return x
148
+
149
+
150
+ class Block(nn.Module):
151
+
152
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
153
+ drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
154
+ window_size=None, attn_head_dim=None):
155
+ super().__init__()
156
+ self.norm1 = norm_layer(dim)
157
+ self.attn = Attention(
158
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
159
+ attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim)
160
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
161
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
162
+ self.norm2 = norm_layer(dim)
163
+ mlp_hidden_dim = int(dim * mlp_ratio)
164
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
165
+
166
+ if init_values is not None and init_values > 0:
167
+ self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
168
+ self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
169
+ else:
170
+ self.gamma_1, self.gamma_2 = None, None
171
+
172
+ def forward(self, x, rel_pos_bias=None):
173
+ if self.gamma_1 is None:
174
+ x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
175
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
176
+ else:
177
+ x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
178
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
179
+ return x
180
+
181
+
182
+ class PatchEmbed(nn.Module):
183
+ """ Image to Patch Embedding
184
+ """
185
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
186
+ super().__init__()
187
+ img_size = to_2tuple(img_size)
188
+ patch_size = to_2tuple(patch_size)
189
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
190
+ self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
191
+ self.img_size = img_size
192
+ self.patch_size = patch_size
193
+ self.num_patches = num_patches
194
+
195
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
196
+
197
+ def forward(self, x, **kwargs):
198
+ B, C, H, W = x.shape
199
+ # FIXME look at relaxing size constraints
200
+ assert H == self.img_size[0] and W == self.img_size[1], \
201
+ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
202
+ x = self.proj(x).flatten(2).transpose(1, 2)
203
+ return x
204
+
205
+
206
+ class RelativePositionBias(nn.Module):
207
+
208
+ def __init__(self, window_size, num_heads):
209
+ super().__init__()
210
+ self.window_size = window_size
211
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
212
+ self.relative_position_bias_table = nn.Parameter(
213
+ torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH
214
+ # cls to token & token 2 cls & cls to cls
215
+
216
+ # get pair-wise relative position index for each token inside the window
217
+ coords_h = torch.arange(window_size[0])
218
+ coords_w = torch.arange(window_size[1])
219
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
220
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
221
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
222
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
223
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
224
+ relative_coords[:, :, 1] += window_size[1] - 1
225
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
226
+ relative_position_index = \
227
+ torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
228
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
229
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
230
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
231
+ relative_position_index[0, 0] = self.num_relative_distance - 1
232
+
233
+ self.register_buffer("relative_position_index", relative_position_index)
234
+
235
+ # trunc_normal_(self.relative_position_bias_table, std=.02)
236
+
237
+ def forward(self):
238
+ relative_position_bias = \
239
+ self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
240
+ self.window_size[0] * self.window_size[1] + 1,
241
+ self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
242
+ return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
243
+
244
+
245
+ class VisionTransformer(nn.Module):
246
+ """ Vision Transformer with support for patch or hybrid CNN input stage
247
+ """
248
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
249
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
250
+ drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
251
+ use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False,
252
+ use_mean_pooling=True, init_scale=0.001, use_checkpoint=False):
253
+ super().__init__()
254
+ self.image_size = img_size
255
+ self.num_classes = num_classes
256
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
257
+
258
+ self.patch_embed = PatchEmbed(
259
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
260
+ num_patches = self.patch_embed.num_patches
261
+
262
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
263
+ if use_abs_pos_emb:
264
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
265
+ else:
266
+ self.pos_embed = None
267
+ self.pos_drop = nn.Dropout(p=drop_rate)
268
+
269
+ if use_shared_rel_pos_bias:
270
+ self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
271
+ else:
272
+ self.rel_pos_bias = None
273
+ self.use_checkpoint = use_checkpoint
274
+
275
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
276
+ self.use_rel_pos_bias = use_rel_pos_bias
277
+ self.blocks = nn.ModuleList([
278
+ Block(
279
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
280
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
281
+ init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
282
+ for i in range(depth)])
283
+ # self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
284
+ # self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
285
+ # self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
286
+
287
+ if self.pos_embed is not None:
288
+ trunc_normal_(self.pos_embed, std=.02)
289
+ trunc_normal_(self.cls_token, std=.02)
290
+ # trunc_normal_(self.mask_token, std=.02)
291
+ # if isinstance(self.head, nn.Linear):
292
+ # trunc_normal_(self.head.weight, std=.02)
293
+ self.apply(self._init_weights)
294
+ self.fix_init_weight()
295
+ # if isinstance(self.head, nn.Linear):
296
+ # self.head.weight.data.mul_(init_scale)
297
+ # self.head.bias.data.mul_(init_scale)
298
+
299
+ def fix_init_weight(self):
300
+ def rescale(param, layer_id):
301
+ param.div_(math.sqrt(2.0 * layer_id))
302
+
303
+ for layer_id, layer in enumerate(self.blocks):
304
+ rescale(layer.attn.proj.weight.data, layer_id + 1)
305
+ rescale(layer.mlp.fc2.weight.data, layer_id + 1)
306
+
307
+ def _init_weights(self, m):
308
+ if isinstance(m, nn.Linear):
309
+ trunc_normal_(m.weight, std=.02)
310
+ if isinstance(m, nn.Linear) and m.bias is not None:
311
+ nn.init.constant_(m.bias, 0)
312
+ elif isinstance(m, nn.LayerNorm):
313
+ nn.init.constant_(m.bias, 0)
314
+ nn.init.constant_(m.weight, 1.0)
315
+
316
+ def get_classifier(self):
317
+ return self.head
318
+
319
+ def reset_classifier(self, num_classes, global_pool=''):
320
+ self.num_classes = num_classes
321
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
322
+
323
+ def forward_features(self, x):
324
+ x = self.patch_embed(x)
325
+ batch_size, seq_len, _ = x.size()
326
+
327
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
328
+ x = torch.cat((cls_tokens, x), dim=1)
329
+ if self.pos_embed is not None:
330
+ x = x + self.pos_embed
331
+ x = self.pos_drop(x)
332
+
333
+ rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
334
+ for blk in self.blocks:
335
+ if self.use_checkpoint:
336
+ x = checkpoint.checkpoint(blk, x, rel_pos_bias)
337
+ else:
338
+ x = blk(x, rel_pos_bias)
339
+ return x
340
+ # x = self.norm(x)
341
+
342
+ # if self.fc_norm is not None:
343
+ # t = x[:, 1:, :]
344
+ # return self.fc_norm(t.mean(1))
345
+ # else:
346
+ # return x[:, 0]
347
+
348
+ def forward(self, x):
349
+ x = self.forward_features(x)
350
+ # x = self.head(x)
351
+ return x
352
+
353
+ def get_intermediate_layers(self, x):
354
+ x = self.patch_embed(x)
355
+ batch_size, seq_len, _ = x.size()
356
+
357
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
358
+ x = torch.cat((cls_tokens, x), dim=1)
359
+ if self.pos_embed is not None:
360
+ x = x + self.pos_embed
361
+ x = self.pos_drop(x)
362
+
363
+ features = []
364
+ rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
365
+ for blk in self.blocks:
366
+ x = blk(x, rel_pos_bias)
367
+ features.append(x)
368
+
369
+ return features
370
+
371
+ @property
372
+ def dtype(self):
373
+ return self.cls_token.dtype
374
+
375
+ @property
376
+ def device(self):
377
+ return self.cls_token.device
378
+
379
+ def get_num_layer(self, var_name=""):
380
+ if var_name in ("cls_token", "mask_token", "pos_embed"):
381
+ return 0
382
+ elif var_name.startswith("patch_embed"):
383
+ return 0
384
+ elif var_name.startswith("rel_pos_bias"):
385
+ return len(self.blocks) - 1
386
+ elif var_name.startswith("blocks"):
387
+ layer_id = int(var_name.split('.')[1])
388
+ return layer_id + 1
389
+ else:
390
+ return len(self.blocks)
391
+
392
+
393
+ def interpolate_pos_embed(model, checkpoint_model):
394
+ if 'pos_embed' in checkpoint_model:
395
+ pos_embed_checkpoint = checkpoint_model['pos_embed'].float()
396
+ embedding_size = pos_embed_checkpoint.shape[-1]
397
+ num_patches = model.patch_embed.num_patches
398
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
399
+ # height (== width) for the checkpoint position embedding
400
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
401
+ # height (== width) for the new position embedding
402
+ new_size = int(num_patches ** 0.5)
403
+ # class_token and dist_token are kept unchanged
404
+ if orig_size != new_size:
405
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
406
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
407
+ # only the position tokens are interpolated
408
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
409
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
410
+ pos_tokens = torch.nn.functional.interpolate(
411
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
412
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
413
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
414
+ checkpoint_model['pos_embed'] = new_pos_embed
415
+
416
+
417
+ def convert_weights_to_fp16(model: nn.Module):
418
+ """Convert applicable model parameters to fp16"""
419
+
420
+ def _convert_weights_to_fp16(l):
421
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
422
+ l.weight.data = l.weight.data.half()
423
+ if l.bias is not None:
424
+ l.bias.data = l.bias.data.half()
425
+
426
+ # if isinstance(l, (nn.MultiheadAttention, Attention)):
427
+ # for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
428
+ # tensor = getattr(l, attr)
429
+ # if tensor is not None:
430
+ # tensor.data = tensor.data.half()
431
+
432
+ model.apply(_convert_weights_to_fp16)
433
+
434
+ class EVAVisionTower(nn.Module):
435
+ def __init__(self, vision_tower, image_processor, args, use_checkpoint=False, drop_path_rate=0.0, delay_load=False, dtype=torch.float32):
436
+ super().__init__()
437
+
438
+ self.is_loaded = False
439
+ self.use_checkpoint = use_checkpoint
440
+ self.vision_tower_name = vision_tower
441
+ self.image_processor_name = image_processor
442
+ self.drop_path_rate = drop_path_rate
443
+ self.patch_size = 14
444
+ self.out_channel = 1408
445
+ if not delay_load:
446
+ self.load_model()
447
+
448
+ self.vision_config = CLIPVisionConfig.from_pretrained(image_processor)
449
+
450
+ def load_model(self):
451
+ # self.image_processor = CLIPImageProcessor.from_pretrained(self.image_processor_name)
452
+ self.image_processor = VideoFramesProcessor.from_pretrained(self.image_processor_name)
453
+ self.vision_tower = VisionTransformer(
454
+ img_size=self.image_processor.size['shortest_edge'],
455
+ patch_size=self.patch_size,
456
+ use_mean_pooling=False,
457
+ embed_dim=self.out_channel,
458
+ depth=39,
459
+ num_heads=self.out_channel//88,
460
+ mlp_ratio=4.3637,
461
+ qkv_bias=True,
462
+ drop_path_rate=self.drop_path_rate,
463
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
464
+ use_checkpoint=self.use_checkpoint,
465
+ )
466
+
467
+ state_dict = torch.load(self.vision_tower_name, map_location="cpu")
468
+ interpolate_pos_embed(self.vision_tower, state_dict)
469
+ incompatible_keys = self.vision_tower.load_state_dict(state_dict, strict=False)
470
+ print(incompatible_keys)
471
+ self.vision_tower.requires_grad_(False)
472
+
473
+ self.is_loaded = True
474
+
475
+ @torch.no_grad()
476
+ def forward(self, images):
477
+ if type(images) is list:
478
+ image_features = []
479
+ for image in images:
480
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
481
+ image_feature = image_forward_out.to(image.dtype)
482
+ image_features.append(image_feature)
483
+ else:
484
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
485
+ image_features = image_forward_outs.to(images.dtype)
486
+
487
+ return image_features
488
+
489
+ def feature_select(self, image_features):
490
+ # image_features = image_features.hidden_states[self.select_layer]
491
+ if self.select_feature == 'patch':
492
+ image_features = image_features[:, 1:]
493
+ elif self.select_feature == 'cls_patch':
494
+ image_features = image_features
495
+ else:
496
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
497
+ return image_features
498
+
499
+ @property
500
+ def dummy_feature(self):
501
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
502
+
503
+ @property
504
+ def dtype(self):
505
+ return self.vision_tower.dtype
506
+
507
+ @property
508
+ def device(self):
509
+ return self.vision_tower.device
510
+
511
+ @property
512
+ def config(self):
513
+ return self.vision_config
514
+
515
+ @property
516
+ def hidden_size(self):
517
+ return self.out_channel
518
+
519
+ @property
520
+ def num_patches(self):
521
+ return (self.image_processor.size['shortest_edge'] // self.patch_size) ** 2
522
+
523
+
524
+
525
+ def create_eva_vit_g(img_size=224, drop_path_rate=0.4, use_checkpoint=False, model_path=None, precision="fp16"):
526
+ model = VisionTransformer(
527
+ img_size=img_size,
528
+ patch_size=14,
529
+ use_mean_pooling=False,
530
+ embed_dim=1408,
531
+ depth=39,
532
+ num_heads=1408//88,
533
+ mlp_ratio=4.3637,
534
+ qkv_bias=True,
535
+ drop_path_rate=drop_path_rate,
536
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
537
+ use_checkpoint=use_checkpoint,
538
+ )
539
+ # url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth"
540
+ # cached_file = download_cached_file(
541
+ # url, check_hash=False, progress=True
542
+ # )
543
+ state_dict = torch.load(model_path, map_location="cpu")
544
+ interpolate_pos_embed(model,state_dict)
545
+
546
+ incompatible_keys = model.load_state_dict(state_dict, strict=False)
547
+ print(incompatible_keys)
548
+
549
+ if precision == "fp16":
550
+ convert_weights_to_fp16(model)
551
+ return model
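Note: a quick smoke test of the EVA backbone defined above can be useful when porting checkpoints. The sketch below assumes the repo root is on PYTHONPATH and torch/timm/transformers are installed; depth=2 is only to keep the test cheap and is not the real ViT-g/39 configuration used by EVAVisionTower:

from functools import partial

import torch
import torch.nn as nn

from model.multimodal_encoder.eva_encoder import VisionTransformer

vit = VisionTransformer(
    img_size=224, patch_size=14, embed_dim=1408, depth=2, num_heads=16,
    mlp_ratio=4.3637, qkv_bias=True, use_mean_pooling=False,
    norm_layer=partial(nn.LayerNorm, eps=1e-6),
)
x = torch.randn(1, 3, 224, 224)
tokens = vit(x)                     # cls token + (224 // 14) ** 2 patch tokens
print(tokens.shape)                 # torch.Size([1, 257, 1408])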
model/multimodal_encoder/openclip_encoder.py ADDED
@@ -0,0 +1,188 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import os
5
+ import json
6
+ import logging
7
+ import deepspeed
8
+ from pathlib import Path
9
+ from open_clip.factory import load_state_dict, get_model_config
10
+ from open_clip.model import CLIPVisionCfg, CLIPTextCfg, _build_vision_tower, convert_to_custom_text_state_dict, resize_pos_embed
11
+ from typing import Dict, Optional
12
+ from transformers.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled
13
+
14
+
15
+ class OpenCLIPVisionTower(nn.Module):
16
+ def __init__(self, vision_tower, args, delay_load=False):
17
+ super().__init__()
18
+
19
+ self.is_loaded = False
20
+ self.vision_tower_name = vision_tower
21
+ self.vision_config = json.load(open(os.path.join(vision_tower,'open_clip_config.json'), 'r'))
22
+ self.is_optimize = getattr(args, 'optimize_vision_tower_aux', False)
23
+
24
+ if not delay_load:
25
+ self.load_model()
26
+
27
+ def load_model(self):
28
+ ckpt_path = os.path.join(self.vision_tower_name, 'open_clip_pytorch_model.bin')
29
+ if 'convnext' in self.vision_tower_name:
30
+ if 'large' in self.vision_tower_name and 'd-320' in self.vision_tower_name:
31
+ self.model_type = 'convnext_large_d_320'
32
+ self.model_channel = [192, 384, 768, 1536] # stage 0-3
33
+ elif 'base' in self.vision_tower_name and 'w-320' in self.vision_tower_name:
34
+ self.model_type = 'convnext_base_w_320'
35
+ self.model_channel = [128, 256, 512, 1024]
36
+ elif 'xxlarge' in self.vision_tower_name:
37
+ self.model_type = 'convnext_xxlarge'
38
+ self.model_channel = [384, 768, 1536, 3072]
39
+
40
+ clip_model = CLIP(**get_model_config(self.model_type))
41
+ clip_model.visual.trunk.norm_pre = None
42
+ clip_model.visual.trunk.head = None
43
+ clip_model.visual.head = None
44
+ print(f'Loading pretrained weights ({self.model_type}).')
45
+ load_checkpoint(clip_model, ckpt_path, strict=False)
46
+
47
+ self.is_loaded = True
48
+ # decompose stem and stages blocks in vision tower
49
+ self.vision_stem = clip_model.visual.trunk.stem
50
+ self.vision_stages = clip_model.visual.trunk.stages
51
+ self.vision_stem.requires_grad_(False)
52
+ self.vision_stages.requires_grad_(False)
53
+
54
+ def forward(self, images):
55
+ if type(images) is list:
56
+ image_features = []
57
+ for image in images:
58
+ image_feature = self.backbone(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
59
+ image_features.append(image_feature)
60
+ else:
61
+ image_features = self.backbone(images.to(device=self.device, dtype=self.dtype))
62
+
63
+ return image_features
64
+
65
+ def backbone(self, images):
66
+ if not self.is_optimize:
67
+ with torch.no_grad():
68
+ results = self.basic_forward(images)
69
+ else:
70
+ results = self.basic_forward(images)
71
+
72
+ target_size = (results['stage_0'].shape[-2], results['stage_0'].shape[-1])
73
+ result_cat = []
74
+ for _stage in results:
75
+ if _stage == 'stage_0':
76
+ result_cat.append(results[_stage].contiguous())
77
+ else:
78
+ result_cat.append(F.interpolate(results[_stage].float().contiguous() ,
79
+ size=target_size,
80
+ mode='bilinear',
81
+ align_corners=False).to(dtype=results[_stage].dtype))
82
+ result_cat = torch.cat(result_cat, dim=1)
83
+
84
+ return result_cat.contiguous()
85
+
86
+ def basic_forward(self, images):
87
+ results = {}
88
+ x = self.vision_stem(images)
89
+ for _idx in range(len(self.vision_stages)):
90
+ x = self.vision_stages[_idx](x)
91
+ results[f'stage_{_idx}'] = x
92
+ return results
93
+
94
+ @property
95
+ def dummy_feature(self):
96
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
97
+
98
+ @property
99
+ def dtype(self):
100
+ return self.vision_stem[0].weight.dtype
101
+
102
+ @property
103
+ def device(self):
104
+ return self.vision_stem[0].weight.device
105
+
106
+ @property
107
+ def config(self):
108
+ return self.vision_config
109
+
110
+ @property
111
+ def hidden_size(self):
112
+ return sum(self.model_channel)
113
+
114
+ # modified function from open_clip to support zero3 stage
115
+ def load_checkpoint(model, checkpoint_path, strict=True):
116
+ if Path(checkpoint_path).suffix in ('.npz', '.npy'):
117
+ from open_clip.big_vision import load_big_vision_weights
118
+ load_big_vision_weights(model, checkpoint_path)
119
+ return {}
120
+
121
+ state_dict = load_state_dict(checkpoint_path)
122
+ # detect old format and make compatible with new format
123
+ if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
124
+ state_dict = convert_to_custom_text_state_dict(state_dict)
125
+ # If loading a non-SigLIP model for SigLIP training. See https://github.com/mlfoundations/open_clip/issues/712
126
+ # if 'logit_bias' not in state_dict and model.logit_bias is not None:
127
+ # state_dict["logit_bias"] = torch.zeros_like(state_dict["logit_scale"])
128
+ # Certain text transformers no longer expect position_ids after transformers==4.31
129
+ position_id_key = 'text.transformer.embeddings.position_ids'
130
+ if position_id_key in state_dict and not hasattr(model, position_id_key):
131
+ del state_dict[position_id_key]
132
+ resize_pos_embed(state_dict, model)
133
+ # resize_text_pos_embed(state_dict, model)
134
+ #incompatible_keys = model.load_state_dict(state_dict, strict=strict)
135
+ if is_deepspeed_zero3_enabled():
136
+
137
+ error_msgs = []
138
+
139
+ def load(module: nn.Module, state_dict, prefix=""):
140
+ metadata = None
141
+
142
+ local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
143
+ args = (state_dict, prefix, local_metadata, True, [], [], error_msgs)
144
+ # Parameters of module and children will start with prefix. We can exit early if there are none in this
145
+ # state_dict
146
+ if len([key for key in state_dict if key.startswith(prefix)]) > 0:
147
+ if is_deepspeed_zero3_enabled():
148
+ # In sharded models, each shard has only part of the full state_dict, so only gather
149
+ # parameters that are in the current state_dict.
150
+ named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
151
+ params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
152
+ if len(params_to_gather) > 0:
153
+ # because zero3 puts placeholders in model params, this context
154
+ # manager gathers (unpartitions) the params of the current layer, then loads from
155
+ # the state dict and then re-partitions them again
156
+ with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
157
+ if torch.distributed.get_rank() == 0:
158
+ module._load_from_state_dict(*args)
159
+ else:
160
+ module._load_from_state_dict(*args)
161
+
162
+ for name, child in module._modules.items():
163
+ if child is not None:
164
+ load(child, state_dict, prefix + name + ".")
165
+
166
+ load(model, state_dict)
167
+ incompatible_keys = []
168
+ else:
169
+ incompatible_keys = model.load_state_dict(state_dict, strict=strict)
170
+ logging.info(f"incompatible_keys.missing_keys: {incompatible_keys.missing_keys}")
171
+ return incompatible_keys
172
+
173
+ class CLIP(nn.Module):
174
+ output_dict: torch.jit.Final[bool]
175
+
176
+ def __init__(
177
+ self,
178
+ embed_dim: int,
179
+ vision_cfg: CLIPVisionCfg,
180
+ text_cfg: CLIPTextCfg,
181
+ quick_gelu: bool = False,
182
+ cast_dtype: Optional[torch.dtype] = None,
183
+ output_dict: bool = False,
184
+ ):
185
+ super().__init__()
186
+ self.output_dict = output_dict
187
+
188
+ self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
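Note: the backbone() method above upsamples every ConvNeXt stage to the stage-0 resolution and concatenates them along channels, which is why hidden_size returns sum(self.model_channel). A standalone sketch of that shape arithmetic (tensor sizes are assumed for illustration, matching the convnext_large_d_320 channel list):

import torch
import torch.nn.functional as F

stage_channels = [192, 384, 768, 1536]              # self.model_channel for convnext_large_d_320
feats = {f'stage_{i}': torch.randn(1, c, 80 // 2 ** i, 80 // 2 ** i)
         for i, c in enumerate(stage_channels)}

target_size = feats['stage_0'].shape[-2:]
merged = torch.cat(
    [feats['stage_0']] +
    [F.interpolate(feats[f'stage_{i}'], size=target_size, mode='bilinear', align_corners=False)
     for i in range(1, 4)],
    dim=1)
print(merged.shape)                                 # torch.Size([1, 2880, 80, 80]); 2880 == sum(stage_channels)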
model/multimodal_projector/__pycache__/builder.cpython-311.pyc ADDED
Binary file (3.59 kB)
 
model/multimodal_projector/__pycache__/builder.cpython-39.pyc ADDED
Binary file (2.02 kB)
 
model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import re
4
+
5
+ class IdentityMap(nn.Module):
6
+ def __init__(self):
7
+ super().__init__()
8
+
9
+ def forward(self, x, *args, **kwargs):
10
+ return x
11
+
12
+ @property
13
+ def config(self):
14
+ return {"mm_projector_type": 'identity'}
15
+
16
+
17
+ class SimpleResBlock(nn.Module):
18
+ def __init__(self, channels):
19
+ super().__init__()
20
+ self.pre_norm = nn.LayerNorm(channels)
21
+
22
+ self.proj = nn.Sequential(
23
+ nn.Linear(channels, channels),
24
+ nn.GELU(),
25
+ nn.Linear(channels, channels)
26
+ )
27
+ def forward(self, x):
28
+ x = self.pre_norm(x)
29
+ return x + self.proj(x)
30
+
31
+
32
+ def build_vision_projector(config, delay_load=False, **kwargs):
33
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
34
+
35
+ if projector_type == 'linear':
36
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
37
+
38
+ mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
39
+ if mlp_gelu_match:
40
+ mlp_depth = int(mlp_gelu_match.group(1))
41
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
42
+ for _ in range(1, mlp_depth):
43
+ modules.append(nn.GELU())
44
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
45
+ return nn.Sequential(*modules)
46
+
47
+ if projector_type == 'identity':
48
+ return IdentityMap()
49
+
50
+ raise ValueError(f'Unknown projector type: {projector_type}')
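Note: a minimal usage sketch for build_vision_projector, assuming the repo root is importable; the config object only needs the three attributes read above (the sizes are made up for illustration):

from types import SimpleNamespace

from model.multimodal_projector.builder import build_vision_projector

cfg = SimpleNamespace(mm_projector_type='mlp2x_gelu', mm_hidden_size=1024, hidden_size=4096)
projector = build_vision_projector(cfg)
print(projector)    # Sequential: Linear(1024 -> 4096), GELU, Linear(4096 -> 4096)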
model/processor/__pycache__/video_processor.cpython-311.pyc ADDED
Binary file (4.86 kB)
 
model/processor/__pycache__/video_processor.cpython-39.pyc ADDED
Binary file (2.84 kB)
 
model/processor/video_processor.py ADDED
@@ -0,0 +1,74 @@
1
+ from transformers import CLIPImageProcessor
2
+ from transformers.image_processing_utils import BatchFeature, get_size_dict
3
+ from transformers.image_transforms import get_resize_output_image_size
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+ import numpy as np
9
+
10
+
11
+ class VideoFramesProcessor(CLIPImageProcessor):
12
+
13
+ def __init__(self, **kwargs):
14
+ super().__init__(**kwargs)
15
+
16
+ def preprocess(self, images, **kwargs):
17
+ if not isinstance(images, np.ndarray):
18
+ return super().preprocess(images=images, **kwargs)
19
+
20
+ do_resize = kwargs.get('do_resize', self.do_resize)
21
+ size = kwargs.get('size', self.size)
22
+ size = get_size_dict(size, param_name="size", default_to_square=False)
23
+ do_center_crop = kwargs.get('do_center_crop', self.do_center_crop)
24
+ crop_size = kwargs.get('crop_size', self.crop_size)
25
+ crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
26
+ do_rescale = kwargs.get('do_rescale', self.do_rescale)
27
+ rescale_factor = kwargs.get('rescale_factor', self.rescale_factor)
28
+ do_normalize = kwargs.get('do_normalize', self.do_normalize)
29
+ image_mean = kwargs.get('image_mean', self.image_mean)
30
+ image_std = kwargs.get('image_std', self.image_std)
31
+ return_tensors = kwargs.get('return_tensors', None)
32
+
33
+ def resize(images, output_size):
34
+ images = images.permute((0, 3, 1, 2))
35
+ images = F.interpolate(images, size=output_size, mode='bicubic')
36
+ images = images.permute((0, 2, 3, 1))
37
+ return images
38
+
39
+ def center_crop(images, crop_size):
40
+ crop_width, crop_height = crop_size["width"], crop_size["height"]
41
+ img_height, img_width = images.shape[1:3]  # frames are (T, H, W, C) at this point
42
+ y = (img_height - crop_height) // 2
43
+ x = (img_width - crop_width) // 2
44
+ images = images[:, y:y+crop_height, x:x+crop_width]
45
+ return images
46
+
47
+ def rescale(images, rescale_factor):
48
+ images = images * rescale_factor
49
+ return images
50
+
51
+ def normalize(images, mean, std):
52
+ mean = torch.tensor(mean)
53
+ std = torch.tensor(std)
54
+ images = (images - mean) / std
55
+ return images
56
+
57
+ images = torch.from_numpy(images).float()
58
+
59
+ if do_resize:
60
+ output_size = get_resize_output_image_size(images[0], size=size["shortest_edge"], default_to_square=False)
61
+ images = resize(images, output_size)
62
+
63
+ if do_center_crop:
64
+ images = center_crop(images, crop_size)
65
+
66
+ if do_rescale:
67
+ images = rescale(images, rescale_factor)
68
+
69
+ if do_normalize:
70
+ images = normalize(images, image_mean, image_std)
71
+
72
+ images = images.permute((0, 3, 1, 2))
73
+ data = {"pixel_values": images}
74
+ return BatchFeature(data=data, tensor_type=return_tensors)
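Note: a short sketch of feeding a stack of decoded video frames through VideoFramesProcessor (assuming the repo root is importable and a recent transformers release; the frame sizes below are arbitrary). A numpy array takes the tensorized path above; anything else falls back to the stock CLIPImageProcessor:

import numpy as np

from model.processor.video_processor import VideoFramesProcessor

proc = VideoFramesProcessor()                        # CLIP defaults: 224 shortest edge, 224x224 crop
frames = np.random.randint(0, 256, size=(8, 256, 320, 3)).astype(np.float32)   # (T, H, W, C)
batch = proc.preprocess(frames, return_tensors='pt')
print(batch['pixel_values'].shape)                   # torch.Size([8, 3, 224, 224])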
model/quant.py ADDED
@@ -0,0 +1,519 @@
1
+ from typing import List, Tuple
2
+ import torch
3
+ from torch import distributed as tdist, nn as nn
4
+ from torch.nn import functional as F
5
+ from torch.nn.functional import scaled_dot_product_attention
6
+
7
+ # from utils import dist
8
+
9
+ # this file only provides the VectorQuantizer variants used in the VQVAE
10
+ __all__ = ['VectorQuantizer', ]
11
+
12
+ def get_entropy_loss(latent_embed, codebook_embed, inv_entropy_tau):
13
+ E_dist = latent_embed.square().sum(dim=1, keepdim=True) + codebook_embed.square().sum(dim=1, keepdim=False)
14
+ E_dist.addmm_(latent_embed, codebook_embed.T, alpha=-2, beta=1) # E_dist: (N, vocab_size)
15
+ logits = -E_dist.float().mul_(inv_entropy_tau)
16
+ # calc per_sample_entropy
17
+ prob, log_prob = logits.softmax(dim=-1), logits.log_softmax(dim=-1) # both are (N, vocab_size)
18
+ per_sample_entropy = torch.mean((-prob * log_prob).sum(dim=-1))
19
+ # calc codebook_entropy
20
+ avg_prob = prob.mean(dim=0) # (vocab_size,)
21
+ log_avg_prob = torch.log(avg_prob + 1e-7)
22
+ codebook_entropy = (-avg_prob * log_avg_prob).sum()
23
+ # calc entropy_loss
24
+ entropy_loss = per_sample_entropy - codebook_entropy
25
+ return entropy_loss
26
+
27
+
28
+ class NormalizedEmbedding(nn.Embedding):
29
+ def __init__(self, num_embeddings: int, embedding_dim: int):
30
+ super().__init__(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
31
+ # self.norm_scale = nn.Parameter(torch.tensor(0.0, dtype=torch.float32))
32
+
33
+ def forward(self, idx):
34
+ return F.embedding(
35
+ idx, F.normalize(self.weight, dim=1), self.padding_idx, self.max_norm,
36
+ self.norm_type, self.scale_grad_by_freq, self.sparse
37
+ )
38
+
39
+ def get_norm_weight(self):
40
+ return F.normalize(self.weight, dim=1)
41
+
42
+
43
+ class ResConv(nn.Conv2d):
44
+ def __init__(self, embed_dim, quant_resi):
45
+ ks = 3 if quant_resi < 0 else 1
46
+ super().__init__(in_channels=embed_dim, out_channels=embed_dim, kernel_size=ks, stride=1, padding=ks // 2)
47
+ self.resi_ratio = abs(quant_resi)
48
+
49
+ def forward(self, h_BChw):
50
+ return h_BChw.mul(1 - self.resi_ratio) + super().forward(h_BChw).mul_(self.resi_ratio)
51
+
52
+
53
+ class VectorQuantizer(nn.Module):
54
+ def __init__(
55
+ self, vocab_size: int, vocab_width: int, vocab_norm: bool, beta: float = 0.25, quant_resi=-0.5,
56
+ using_entropy_loss=False, entropy_temp=0.01,
57
+ ):
58
+ super().__init__()
59
+ self.vocab_size: int = vocab_size
60
+ self.vocab_width: int = vocab_width
61
+ self.register_buffer('vocab_usage', torch.zeros(self.vocab_size))
62
+ self.vocab_usage_record_times: int = 0
63
+
64
+ self.vocab_norm: bool = vocab_norm
65
+ # self.quant_resi = ResConv(self.vocab_width, quant_resi=quant_resi)
66
+ self.quant_resi = nn.Identity()
67
+ self.embedding = nn.Embedding(self.vocab_size, self.vocab_width)
68
+ self.beta: float = beta
69
+
70
+ self.using_entropy_loss, self.inv_entropy_tau = using_entropy_loss, 1 / entropy_temp
71
+ if not self.vocab_norm:
72
+ assert not self.using_entropy_loss, 'entropy loss without vocab norm is not supported'
73
+
74
+ def init_vocab(self, eini: float):
75
+ if eini > 0:
76
+ nn.init.trunc_normal_(self.embedding.weight.data, std=eini)
77
+ elif eini < 0:
78
+ base = self.vocab_width ** -0.5
79
+ base /= 36
80
+ self.embedding.weight.data.uniform_(-abs(eini) * base, abs(eini) * base)
81
+
82
+ def extra_repr(self) -> str:
83
+ return f'beta={self.beta:g}'
84
+
85
+ # ===================== `forward` is only used in VAE training =====================
86
+ def forward(self, f_BChw: torch.Tensor, ret_usages=False) -> Tuple[
87
+ torch.Tensor, torch.Tensor, torch.Tensor, List[float]]:
88
+ f_BChw = f_BChw.float()
89
+ B, C, h, w = f_BChw.shape
90
+ if self.vocab_norm:
91
+ if self.using_entropy_loss:
92
+ # find the nearest neighbor
93
+ NxC = f_BChw.permute(0, 2, 3, 1).reshape(-1, C)
94
+ NxC_no_grad = NxC.detach()
95
+ NxC_no_grad = F.normalize(NxC_no_grad, dim=-1)
96
+ idx_N = torch.argmax(NxC_no_grad @ F.normalize(self.embedding.weight.data.T, dim=0), dim=1)
97
+ # get logits
98
+ E_dist = NxC.square().sum(dim=1, keepdim=True) + self.embedding.weight.square().sum(dim=1,
99
+ keepdim=False)
100
+ E_dist.addmm_(NxC, self.embedding.weight.T, alpha=-2, beta=1) # E_dist: (N, vocab_size)
101
+ logits = -E_dist.float().mul_(self.inv_entropy_tau)
102
+ # calc per_sample_entropy
103
+ prob, log_prob = logits.softmax(dim=-1), logits.log_softmax(dim=-1) # both are (N, vocab_size)
104
+ per_sample_entropy = torch.mean((-prob * log_prob).sum(dim=-1))
105
+ # calc codebook_entropy
106
+ avg_prob = prob.mean(dim=0) # (vocab_size,)
107
+ log_avg_prob = torch.log(avg_prob + 1e-7)
108
+ codebook_entropy = (-avg_prob * log_avg_prob).sum()
109
+ # calc entropy_loss
110
+ entropy_loss = per_sample_entropy - codebook_entropy
111
+ else:
112
+ NxC_no_grad = f_BChw.detach().permute(0, 2, 3, 1).reshape(-1, C)
113
+ NxC_no_grad = F.normalize(NxC_no_grad, dim=-1)
114
+ idx_N = torch.argmax(NxC_no_grad @ F.normalize(self.embedding.weight.data.T, dim=0), dim=1)
115
+ entropy_loss = 0
116
+ else: # not self.vocab_norm
117
+ NxC_no_grad = f_BChw.detach().permute(0, 2, 3, 1).reshape(-1, C)
118
+ E_dist = NxC_no_grad.square().sum(dim=1, keepdim=True) + self.embedding.weight.data.square().sum(dim=1,
119
+ keepdim=False)
120
+ E_dist.addmm_(NxC_no_grad, self.embedding.weight.data.T, alpha=-2, beta=1) # E_dist: N x vocab_size
121
+ idx_N = torch.argmin(E_dist, dim=1)
122
+ entropy_loss = 0
123
+
124
+ prob_per_class_is_chosen = idx_N.bincount(minlength=self.vocab_size).float()
125
+ handler = tdist.all_reduce(prob_per_class_is_chosen, async_op=True) if (
126
+ self.training and dist.initialized()) else None
127
+
128
+ # look up
129
+ idx_Bhw = idx_N.view(B, h, w)
130
+ fhat_BChw = self.quant_resi(self.embedding(idx_Bhw).permute(0, 3, 1, 2).contiguous())
131
+
132
+ # calc loss
133
+ vq_loss = F.mse_loss(fhat_BChw.detach(), f_BChw).mul_(self.beta) + F.mse_loss(fhat_BChw, f_BChw.detach())
134
+ fhat_BChw = (fhat_BChw.detach() - f_BChw.detach()).add_(f_BChw)
135
+
136
+ # update vocab_usage
137
+ if handler is not None:
138
+ handler.wait()
139
+ prob_per_class_is_chosen /= prob_per_class_is_chosen.sum()
140
+ vocab_usage = (prob_per_class_is_chosen > 0.01 / self.vocab_size).float().mean().mul_(100)
141
+
142
+ if self.vocab_usage_record_times == 0:
143
+ self.vocab_usage.copy_(prob_per_class_is_chosen)
144
+ elif self.vocab_usage_record_times < 100:
145
+ self.vocab_usage.mul_(0.9).add_(prob_per_class_is_chosen, alpha=0.1)
146
+ else:
147
+ self.vocab_usage.mul_(0.99).add_(prob_per_class_is_chosen, alpha=0.01)
148
+ self.vocab_usage_record_times += 1
149
+
150
+ return fhat_BChw, vq_loss, entropy_loss, (vocab_usage if ret_usages else None)
151
+
152
+ def f_to_idx(self, f_BChw: torch.Tensor) -> torch.LongTensor:
153
+ f_BChw = f_BChw.float()
154
+ B, C, h, w = f_BChw.shape
155
+ with torch.cuda.amp.autocast(enabled=False):
156
+ # find the nearest embedding
157
+ query_NxC = f_BChw.detach().permute(0, 2, 3, 1).reshape(-1, C)
158
+ if self.vocab_norm:
159
+ query_NxC = F.normalize(query_NxC, dim=-1)
160
+ idx_N = torch.argmax(query_NxC @ F.normalize(self.embedding.weight.data.T, dim=0), dim=1)
161
+ else:
162
+ E_dist = torch.sum(query_NxC.square(), dim=1, keepdim=True) + torch.sum(
163
+ self.embedding.weight.data.square(), dim=1, keepdim=False)
164
+ E_dist.addmm_(query_NxC, self.embedding.weight.data.T, alpha=-2, beta=1) # (B*h*w, vocab_size)
165
+ idx_N = torch.argmin(E_dist, dim=1)
166
+ return idx_N.view(B, h, w)
167
+
168
+
169
+ class VectorQuantizerHybrid(nn.Module):
170
+ def __init__(
171
+ self, vocab_size: int, vocab_width: int, vocab_norm: bool, beta: float = 0.25, quant_resi=-0.5,
172
+ using_entropy_loss=False, entropy_temp=0.01,
173
+ ):
174
+ super().__init__()
175
+ self.vocab_size: int = vocab_size
176
+ self.vocab_width: int = vocab_width
177
+ self.register_buffer('vocab_usage', torch.zeros(self.vocab_size))
178
+ self.vocab_usage_record_times: int = 0
179
+
180
+ self.vocab_norm: bool = vocab_norm
181
+ # self.quant_resi = ResConv(self.vocab_width, quant_resi=quant_resi)
182
+ self.embedding = nn.Embedding(self.vocab_size, self.vocab_width)
183
+ self.beta: float = beta
184
+
185
+ self.using_entropy_loss, self.inv_entropy_tau = using_entropy_loss, 1 / entropy_temp
186
+ if not self.vocab_norm:
187
+ assert not self.using_entropy_loss, 'entropy loss without vocab norm is not supported'
188
+
189
+ def init_vocab(self, eini: float):
190
+ if eini > 0:
191
+ nn.init.trunc_normal_(self.embedding.weight.data, std=eini)
192
+ elif eini < 0:
193
+ base = self.vocab_width ** -0.5
194
+ base /= 36
195
+ self.embedding.weight.data.uniform_(-abs(eini) * base, abs(eini) * base)
196
+
197
+ def extra_repr(self) -> str:
198
+ return f'beta={self.beta:g}'
199
+
200
+ def forward(self, class_tokens, patch_tokens, ret_usages=False):
201
+ class_tokens = class_tokens.float()
202
+ patch_tokens = patch_tokens.float()
203
+
204
+ B, L, C = class_tokens.shape
205
+ B, C, H, W = patch_tokens.shape
206
+ patch_tokens = patch_tokens.flatten(start_dim=2).permute(0, 2, 1)
207
+ NxC = torch.cat((class_tokens, patch_tokens), dim=1).reshape(-1, C)
208
+ if self.vocab_norm:
209
+ if self.using_entropy_loss:
210
+ # find the nearest neighbor
211
+ NxC_no_grad = NxC.detach()
212
+ NxC_no_grad = F.normalize(NxC_no_grad, dim=-1)
213
+ idx_N = torch.argmax(NxC_no_grad @ F.normalize(self.embedding.weight.data.T, dim=0), dim=1)
214
+ # get logits
215
+ E_dist = NxC.square().sum(dim=1, keepdim=True) + self.embedding.weight.square().sum(dim=1,
216
+ keepdim=False)
217
+ E_dist.addmm_(NxC, self.embedding.weight.T, alpha=-2, beta=1) # E_dist: (N, vocab_size)
218
+ logits = -E_dist.float().mul_(self.inv_entropy_tau)
219
+ # calc per_sample_entropy
220
+ prob, log_prob = logits.softmax(dim=-1), logits.log_softmax(dim=-1) # both are (N, vocab_size)
221
+ per_sample_entropy = torch.mean((-prob * log_prob).sum(dim=-1))
222
+ # calc codebook_entropy
223
+ avg_prob = prob.mean(dim=0) # (vocab_size,)
224
+ log_avg_prob = torch.log(avg_prob + 1e-7)
225
+ codebook_entropy = (-avg_prob * log_avg_prob).sum()
226
+ # calc entropy_loss
227
+ entropy_loss = per_sample_entropy - codebook_entropy
228
+ else:
229
+ NxC_no_grad = NxC.detach()
230
+ NxC_no_grad = F.normalize(NxC_no_grad, dim=-1)
231
+ idx_N = torch.argmax(NxC_no_grad @ F.normalize(self.embedding.weight.data.T, dim=0), dim=1)
232
+ entropy_loss = 0
233
+ else: # not self.vocab_norm
234
+ NxC_no_grad = NxC.detach()
235
+ E_dist = NxC_no_grad.square().sum(dim=1, keepdim=True) + self.embedding.weight.data.square().sum(dim=1,
236
+ keepdim=False)
237
+ E_dist.addmm_(NxC_no_grad, self.embedding.weight.data.T, alpha=-2, beta=1) # E_dist: N x vocab_size
238
+ idx_N = torch.argmin(E_dist, dim=1)
239
+ entropy_loss = 0
240
+
241
+ prob_per_class_is_chosen = idx_N.bincount(minlength=self.vocab_size).float()
242
+ handler = tdist.all_reduce(prob_per_class_is_chosen, async_op=True) if (
243
+ self.training and dist.initialized()) else None
244
+
245
+ # look up
246
+ fhat = self.embedding(idx_N)
247
+
248
+ # calc loss
249
+ vq_loss = F.mse_loss(fhat.detach(), NxC).mul_(self.beta) + F.mse_loss(fhat, NxC.detach())
250
+ fhat = (fhat.detach() - NxC.detach()).add_(NxC)
251
+
252
+ # update vocab_usage
253
+ if handler is not None:
254
+ handler.wait()
255
+ prob_per_class_is_chosen /= prob_per_class_is_chosen.sum()
256
+ vocab_usage = (prob_per_class_is_chosen > 0.01 / self.vocab_size).float().mean().mul_(100)
257
+
258
+ if self.vocab_usage_record_times == 0:
259
+ self.vocab_usage.copy_(prob_per_class_is_chosen)
260
+ elif self.vocab_usage_record_times < 100:
261
+ self.vocab_usage.mul_(0.9).add_(prob_per_class_is_chosen, alpha=0.1)
262
+ else:
263
+ self.vocab_usage.mul_(0.99).add_(prob_per_class_is_chosen, alpha=0.01)
264
+ self.vocab_usage_record_times += 1
265
+
266
+ fhat = fhat.view(B, -1, C)
267
+ fhat_class = fhat[:, :L, :]
268
+ fhat_patch = fhat[:, L:, :].view(B, H, W, C).permute(0, 3, 1, 2)
269
+
270
+ return fhat_class, fhat_patch, vq_loss, entropy_loss, (vocab_usage if ret_usages else None)
271
+
272
+ def f_to_idx(self, class_tokens, patch_tokens) -> torch.LongTensor:
273
+ B, L, C = class_tokens.shape
274
+ B, C, H, W = patch_tokens.shape
275
+ class_tokens = class_tokens.float()
276
+ patch_tokens = patch_tokens.float()
277
+ patch_tokens = patch_tokens.flatten(start_dim=2).permute(0, 2, 1)
278
+ NxC = torch.cat((class_tokens, patch_tokens), dim=1).reshape(-1, C)
279
+ with torch.cuda.amp.autocast(enabled=False):
280
+ # find the nearest embedding
281
+ if self.vocab_norm:
282
+ NxC = F.normalize(NxC, dim=-1)
283
+ idx_N = torch.argmax(NxC @ F.normalize(self.embedding.weight.data.T, dim=0), dim=1)
284
+ else:
285
+ E_dist = torch.sum(NxC.square(), dim=1, keepdim=True) + torch.sum(self.embedding.weight.data.square(),
286
+ dim=1, keepdim=False)
287
+ E_dist.addmm_(NxC, self.embedding.weight.data.T, alpha=-2, beta=1) # (B*h*w, vocab_size)
288
+ idx_N = torch.argmin(E_dist, dim=1)
289
+ return idx_N
290
+
291
+
292
+ class VectorQuantizerX(nn.Module):
293
+ def __init__(
294
+ self,
295
+ vocab_size: int,
296
+ vocab_width: int,
297
+ beta: float = 0.25,
298
+ use_entropy_loss=False,
299
+ entropy_temp=0.01,
300
+ ):
301
+ super().__init__()
302
+ self.beta = beta
303
+ self.vocab_size = vocab_size
304
+ self.vocab_width = vocab_width
305
+ self.vocab_usage_record_times: int = 0
306
+ self.register_buffer('vocab_usage', torch.zeros(self.vocab_size))
307
+
308
+ self.codebook = NormalizedEmbedding(self.vocab_size, self.vocab_width)
309
+
310
+ self.use_entropy_loss = use_entropy_loss
311
+ self.inv_entropy_tau = 1 / entropy_temp
312
+
313
+ def init_vocab(self, eini: float):
314
+ if eini > 0:
315
+ nn.init.trunc_normal_(self.codebook.weight.data, std=eini)
316
+ elif eini < 0:
317
+ base = self.vocab_width ** -0.5
318
+ base /= 36
319
+ self.codebook.weight.data.uniform_(-abs(eini) * base, abs(eini) * base)
320
+
321
+ def extra_repr(self) -> str:
322
+ return f'beta={self.beta:g}'
323
+
324
+ def forward(self, features):
325
+ B, L, C = features.shape
326
+ features = features.reshape(-1, C)
327
+ features = F.normalize(features, dim=-1).float()
328
+ codebook_embed = self.codebook.get_norm_weight()
329
+ indices = torch.argmax(features.detach() @ codebook_embed.T, dim=1)
330
+ entropy_loss = get_entropy_loss(features, codebook_embed, self.inv_entropy_tau) if self.use_entropy_loss else 0
331
+ features_hat = self.codebook(indices)
332
+
333
+ # calc loss
334
+ vq_loss = F.mse_loss(features_hat.detach(), features).mul_(self.beta) + F.mse_loss(features_hat,
335
+ features.detach())
336
+ features_hat = (features_hat.detach() - features.detach()).add_(features)
337
+
338
+ # update vocab_usage
339
+ prob_per_class_is_chosen = indices.bincount(minlength=self.vocab_size).float()
340
+ handler = tdist.all_reduce(prob_per_class_is_chosen, async_op=True) if (
341
+ self.training and dist.initialized()) else None
342
+ if handler is not None:
343
+ handler.wait()
344
+ prob_per_class_is_chosen /= prob_per_class_is_chosen.sum()
345
+ vocab_usage = (prob_per_class_is_chosen > 0.01 / self.vocab_size).float().mean().mul_(100)
346
+ if self.vocab_usage_record_times == 0:
347
+ self.vocab_usage.copy_(prob_per_class_is_chosen)
348
+ elif self.vocab_usage_record_times < 100:
349
+ self.vocab_usage.mul_(0.9).add_(prob_per_class_is_chosen, alpha=0.1)
350
+ else:
351
+ self.vocab_usage.mul_(0.99).add_(prob_per_class_is_chosen, alpha=0.01)
352
+ self.vocab_usage_record_times += 1
353
+
354
+ return features_hat.view(B, L, C), vq_loss, entropy_loss, vocab_usage
355
+
356
+ def f_to_idx(self, features):
357
+ B, L, C = features.shape
358
+ features = features.reshape(-1, C)
359
+ features = F.normalize(features, dim=-1).float()
360
+ codebook_embed = self.codebook.get_norm_weight().float()
361
+ indices = torch.argmax(features.detach() @ codebook_embed.T, dim=1)
362
+ return indices.view(B, L)
363
+
364
+
365
+ class VectorQuantizerM(nn.Module):
366
+ def __init__(
367
+ self,
368
+ vocab_size,
369
+ vocab_width,
370
+ beta=0.25,
371
+ use_entropy_loss=False,
372
+ entropy_temp=0.01,
373
+ num_codebooks=16
374
+ ):
375
+ super().__init__()
376
+ self.num_codebooks = num_codebooks
377
+ self.codebooks = nn.ModuleList()
378
+ for _ in range(num_codebooks):
379
+ codebook = VectorQuantizerX(
380
+ vocab_size=vocab_size // num_codebooks,
381
+ vocab_width=vocab_width // num_codebooks,
382
+ beta=beta,
383
+ use_entropy_loss=use_entropy_loss,
384
+ entropy_temp=entropy_temp,
385
+ )
386
+ self.codebooks.append(codebook)
387
+
388
+ def init_vocab(self, eini: float):
389
+ for codebook in self.codebooks:
390
+ codebook.init_vocab(eini)
391
+
392
+ def f_to_idx(self, features):
393
+ indices = []
394
+ chunk_size = features.shape[-1] // self.num_codebooks
395
+ splited_features = features.split(chunk_size, dim=-1)
396
+ for i, codebook in enumerate(self.codebooks):
397
+ indices.append(codebook.f_to_idx(splited_features[i]))
398
+ indices = torch.stack(indices, dim=1)
399
+ return indices
400
+
401
+ def idx_to_f(self, indices):
402
+ assert indices.shape[1] == self.num_codebooks
403
+ latent_features = []
404
+ for i, codebook in enumerate(self.codebooks):
405
+ sub_indices = indices[:, i].flatten(start_dim=1)
406
+ latent_feature = codebook.codebook(sub_indices)
407
+ latent_features.append(latent_feature)
408
+ latent_features = torch.cat(latent_features, dim=-1)
409
+ return latent_features
410
+
411
+ def forward(self, features):
412
+ latent_features = []
413
+ global_vq_loss = 0.
414
+ global_entropy_loss = 0.
415
+ global_vocab_usage = 0.
416
+ chunk_size = features.shape[-1] // self.num_codebooks
417
+ splited_features = features.split(chunk_size, dim=-1)
418
+ for i, codebook in enumerate(self.codebooks):
419
+ latent_feature, vq_loss, entropy_loss, vocab_usage = codebook(splited_features[i])
420
+ latent_features.append(latent_feature)
421
+ global_vq_loss += vq_loss
422
+ global_entropy_loss += entropy_loss
423
+ global_vocab_usage += vocab_usage
424
+ latent_features = torch.cat(latent_features, dim=-1)
425
+ global_entropy_loss /= self.num_codebooks
426
+ global_vq_loss /= self.num_codebooks
427
+ global_vocab_usage /= self.num_codebooks
428
+ return latent_features, global_vq_loss, global_entropy_loss, global_vocab_usage
429
+
430
+
431
+ class CausalAttention(nn.Module):
432
+ def __init__(self, in_dim, out_dim, num_heads):
433
+ super().__init__()
434
+ if in_dim > out_dim:
435
+ # assert in_dim // num_heads == out_dim
436
+ self.head_dim = in_dim // num_heads
437
+ self.qkv = nn.Linear(in_dim, in_dim * 3, bias=False)
438
+ self.q_bias = nn.Parameter(torch.zeros(in_dim))
439
+ self.v_bias = nn.Parameter(torch.zeros(in_dim))
440
+ self.register_buffer('zero_k_bias', torch.zeros(in_dim))
441
+ else:
442
+ # assert out_dim // num_heads == in_dim
443
+ self.head_dim = out_dim // num_heads
444
+ self.qkv = nn.Linear(in_dim, out_dim * 3, bias=False)
445
+ self.q_bias = nn.Parameter(torch.zeros(out_dim))
446
+ self.v_bias = nn.Parameter(torch.zeros(out_dim))
447
+ self.register_buffer('zero_k_bias', torch.zeros(out_dim))
448
+
449
+ self.in_dim = in_dim
450
+ self.out_dim = out_dim
451
+ self.num_heads = num_heads
452
+ self.scale = self.head_dim ** -0.5
453
+ self.proj = nn.Linear(out_dim, out_dim)
454
+
455
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
456
+ B, N, C = x.shape
457
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=torch.cat((self.q_bias, self.zero_k_bias, self.v_bias)))
458
+ q, k, v = qkv.reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4).unbind(0)
459
+
460
+ x = scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0., is_causal=True)
461
+
462
+ if self.in_dim > self.out_dim:
463
+ x = torch.mean(x, dim=1)
464
+ if self.in_dim // self.num_heads != self.out_dim:
465
+ x = nn.functional.adaptive_avg_pool1d(x, self.out_dim)
466
+ else:
467
+ x = x.transpose(1, 2).reshape(B, N, -1)
468
+ x = self.proj(x)
469
+ return x
470
+
471
+
472
+ class AttnProjection(nn.Module):
473
+ def __init__(self, in_dim, out_dim, num_heads, norm_layer=nn.LayerNorm, mlp_ratio=2):
474
+ super().__init__()
475
+ assert out_dim % in_dim == 0 or in_dim % out_dim == 0
476
+ self.in_dim = in_dim
477
+ self.out_dim = out_dim
478
+ self.norm1 = norm_layer(in_dim)
479
+ self.attn = CausalAttention(in_dim, out_dim, num_heads)
480
+ self.proj = nn.Linear(in_dim, out_dim)
481
+ self.norm3 = norm_layer(in_dim)
482
+
483
+ self.norm2 = norm_layer(out_dim)
484
+ hidden_dim = int(out_dim * mlp_ratio)
485
+ self.mlp = GeGluMlp(
486
+ in_features=out_dim,
487
+ hidden_features=hidden_dim
488
+ )
489
+
490
+ def forward(self, x):
491
+ x = self.proj(self.norm3(x)) + self.attn(self.norm1(x))
492
+ x = x + self.mlp(self.norm2(x))
493
+ return x
494
+
495
+
496
+ from functools import partial
497
+ from timm.models.layers import create_conv2d, get_norm_act_layer, get_norm_layer, make_divisible
498
+
499
+ class GeGluMlp(nn.Module):
500
+ def __init__(
501
+ self,
502
+ in_features,
503
+ hidden_features,
504
+ act_layer = None,
505
+ drop = 0.0,
506
+ ):
507
+ super().__init__()
508
+ norm_layer = partial(get_norm_layer('layernorm'), eps=1e-6)
509
+ self.norm = norm_layer(in_features)
510
+ self.act = nn.GELU(approximate='tanh')
511
+ self.w0 = nn.Linear(in_features, hidden_features)
512
+ self.w1 = nn.Linear(in_features, hidden_features)
513
+ self.w2 = nn.Linear(hidden_features, in_features)
514
+
515
+ def forward(self, x):
516
+ x = self.norm(x)
517
+ x = self.act(self.w0(x)) * self.w1(x)
518
+ x = self.w2(x)
519
+ return x
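Note: the multi-codebook quantizer above (VectorQuantizerM) splits the channel dimension across num_codebooks sub-codebooks and quantizes each chunk independently. A toy round-trip sketch, assuming the repo root is importable and torch/timm are installed; the sizes are made up for illustration:

import torch

from model.quant import VectorQuantizerM

quantizer = VectorQuantizerM(vocab_size=32768, vocab_width=1024, num_codebooks=16)
feats = torch.randn(2, 256, 1024)          # (B, L, C) continuous features
idx = quantizer.f_to_idx(feats)            # (B, num_codebooks, L) sub-codebook indices
rec = quantizer.idx_to_f(idx)              # (B, L, C) concatenation of the 16 looked-up chunks
print(idx.shape, rec.shape)                # torch.Size([2, 16, 256]) torch.Size([2, 256, 1024])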
t2i.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+ import argparse
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+ from torchvision import transforms
8
+ from torch.nn import functional as F
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM
10
+
11
+ from model import *
12
+ from unitok.config import Args
13
+ from unitok.model import UniTok
14
+
15
+
16
+ PILtransform = transforms.ToPILImage()
17
+
18
+
19
+ def top_k_top_p_filtering(
20
+ logits,
21
+ top_k: int = 0,
22
+ top_p: float = 1.0,
23
+ filter_value: float = -float("Inf"),
24
+ min_tokens_to_keep: int = 1,
25
+ ):
26
+ """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
27
+ Args:
28
+ logits: logits distribution shape (batch size, vocabulary size)
29
+ if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
30
+ if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
31
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
32
+ Make sure we keep at least min_tokens_to_keep per batch example in the output
33
+ From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
34
+ """
35
+
36
+ if top_k > 0:
37
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check
38
+ # Remove all tokens with a probability less than the last token of the top-k
39
+
40
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
41
+ logits[indices_to_remove] = filter_value
42
+
43
+ if top_p < 1.0:
44
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
45
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
46
+
47
+ # Remove tokens with cumulative probability above the threshold (tokens with 0 are kept)
48
+ sorted_indices_to_remove = cumulative_probs > top_p
49
+ if min_tokens_to_keep > 1:
50
+ # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
51
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
52
+ # Shift the indices to the right to keep also the first token above the threshold
53
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
54
+ sorted_indices_to_remove[..., 0] = 0
55
+
56
+ # scatter sorted tensors to original indexing
57
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
58
+ logits[indices_to_remove] = filter_value
59
+ # import pdb;pdb.set_trace()
60
+ return logits
61
+
62
+
63
+ def sample(logits, temperature: float = 1.0, top_k: int = 0, top_p: float = 1.0, sample_logits=True):
64
+ logits = logits[:, -1, :] / max(temperature, 1e-5)
65
+ if top_k > 0 or top_p < 1.0:
66
+ logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
67
+ probs = F.softmax(logits, dim=-1)
68
+ if sample_logits:
69
+ idx = torch.multinomial(probs, num_samples=1)
70
+ else:
71
+ _, idx = torch.topk(probs, k=1, dim=-1)
72
+ return idx, probs
73
+
74
+
75
+ def split_list(input_list, chunk_size):
76
+ return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]
77
+
78
+
79
+ def get_args_parser():
80
+ parser = argparse.ArgumentParser('UniTok text-to-image inference', add_help=False)
81
+ parser.add_argument('--unitok_path', type=str, required=True)
82
+ parser.add_argument('--mllm_path', type=str, required=True)
83
+ parser.add_argument('--prompt_file', type=str, required=True)
84
+ parser.add_argument('--result_dir', type=str, required=True)
85
+ parser.add_argument('--idx', type=int, default=0)
86
+ parser.add_argument('--tau', type=float, default=0.9)
87
+ parser.add_argument('--topk', type=int, default=2048)
88
+ parser.add_argument('--topp', type=float, default=0.96)
89
+ parser.add_argument('--cfg_scale', type=float, default=5.0)
90
+ return parser
91
+
92
+
93
+ def main(args):
94
+ text_set_id = args.idx
95
+ tau = args.tau
96
+ topk = args.topk
97
+ topp = args.topp
98
+ cfg_scale = args.cfg_scale
99
+
100
+ print('loading vq model ...')
101
+ ckpt = torch.load(args.unitok_path, map_location='cpu')
102
+ vae_cfg = Args()
103
+ vae_cfg.load_state_dict(ckpt['args'])
104
+ vq_model = UniTok(vae_cfg)
105
+ vq_model.load_state_dict(ckpt['trainer']['unitok'])
106
+ vq_model.to('cuda')
107
+ vq_model.eval()
108
+
109
+ image_save_pth = '{}/GenAI-cfg_{}-topk_{}-topp_{}-tau_{}'.format(args.result_dir, str(cfg_scale), str(topk), str(topp), str(tau))
110
+
111
+ tokenizer = AutoTokenizer.from_pretrained(args.mllm_path, padding_side='left')
112
+ vqllm = AutoModelForCausalLM.from_pretrained(
113
+ args.mllm_path,
114
+ attn_implementation='flash_attention_2',
115
+ torch_dtype=torch.bfloat16
116
+ ).to('cuda')
117
+
118
+ num_processes = 8
119
+ chunk_size = 8 # batchsize
120
+ num_codebooks = vae_cfg.num_codebooks
121
+
122
+ with open(args.prompt_file, 'r') as f:
123
+ lines = f.readlines()
124
+ all_prompts = []
125
+ for index, line in enumerate(lines):
126
+ all_prompts.append({'Index': str(index + 1).zfill(5), 'Prompt': line.strip()})
127
+
128
+ chunked_filenames = np.array_split(all_prompts, num_processes)
129
+ subset = chunked_filenames[text_set_id].tolist()
130
+ chunk_inputs = split_list(subset, chunk_size)
131
+ for chunk in tqdm(chunk_inputs):
132
+ text_inputs = [v['Prompt'] for v in chunk]
133
+ uncondition_text_inputs = ['<unconditional>'] * len(text_inputs)
134
+ for i in range(len(text_inputs)):
135
+ text_inputs[i] = text_inputs[i] + ' Generate an image based on this description.'
136
+ ori_batchsize = len(text_inputs)
137
+
138
+ save_list = []
139
+ if cfg_scale > 1:
140
+ model_inputs = tokenizer(text_inputs + uncondition_text_inputs, return_tensors="pt", padding=True).to('cuda')
141
+ total_batchsize = len(text_inputs + uncondition_text_inputs)
142
+ model_inputs['input_ids'] = torch.cat([
143
+ model_inputs['input_ids'],
144
+ torch.empty(total_batchsize, 1).fill_(3).to(model_inputs['input_ids'])
145
+ ], dim=1)
146
+ model_inputs['attention_mask'] = torch.cat([
147
+ model_inputs['attention_mask'],
148
+ torch.empty(total_batchsize, 1).fill_(1).to(model_inputs['attention_mask'])
149
+ ], dim=1)
150
+ else:
151
+ model_inputs = tokenizer(text_inputs, return_tensors="pt", padding=True).to('cuda')
152
+ total_batchsize = len(text_inputs)
153
+ model_inputs['input_ids'] = torch.cat([
154
+ model_inputs['input_ids'],
155
+ torch.empty(total_batchsize, 1).fill_(3).to(model_inputs['input_ids'])
156
+ ], dim=1)
157
+ model_inputs['attention_mask'] = torch.cat([
158
+ model_inputs['attention_mask'],
159
+ torch.empty(total_batchsize, 1).fill_(1).to(model_inputs['attention_mask'])
160
+ ], dim=1)
161
+ with torch.no_grad():
162
+ sampling_kwargs = {'temperature': tau, 'top_k': topk, 'top_p': topp, 'sample_logits': True}
163
+ pred_tokens = []
164
+ input_multi_ids = None
165
+ for _ in range(256):
166
+ outputs = vqllm.T2I_forward_nocache(
167
+ **model_inputs,
168
+ input_multi_ids=input_multi_ids,
169
+ use_cache=None,
170
+ return_dict=True,
171
+ output_attentions=False,
172
+ output_hidden_states=False,
173
+ )
174
+ next_embed = outputs['last_hidden_state'][:, -1:, :]
175
+
176
+ indices_arhead = []
177
+ for i_head in range(num_codebooks):
178
+ ar_next_embed = vqllm.ar_head(
179
+ inputs_embeds=next_embed,
180
+ use_cache=False,
181
+ output_attentions=False,
182
+ output_hidden_states=False,
183
+ return_dict=False,
184
+ )
185
+ next_token_logits = vqllm.ar_head.linear_head(ar_next_embed)
186
+ if cfg_scale > 1:
187
+ cond_logits, uncond_logits = torch.split(next_token_logits, len(next_token_logits) // 2, dim=0)
188
+ cfg_logits = uncond_logits + (cond_logits - uncond_logits) * cfg_scale
189
+ half_next_token, _ = sample(cfg_logits, **sampling_kwargs)
190
+ next_token = torch.cat([half_next_token, half_next_token]) # [bz*2, 1]
191
+ else:
192
+ next_token, next_prob = sample(next_token_logits, **sampling_kwargs)
193
+
194
+ indices_arhead.append(next_token)
195
+ if i_head < num_codebooks - 1:
196
+ predicted_embed = vqllm.ar_head.codebooks[i_head](next_token)
197
+ next_embed = torch.cat([next_embed, predicted_embed], dim=1)
198
+
199
+ # update generated ids, model inputs, and length for next step
200
+ pred_tokens.append(torch.cat(indices_arhead, dim=1)) # [bz*2, num_codebooks]
201
+ input_multi_ids = torch.stack(pred_tokens, dim=-1)
202
+
203
+ del sampling_kwargs, model_inputs, outputs
204
+
205
+ image_vq_id = torch.stack(pred_tokens, dim=-1)[:ori_batchsize]
206
+ save_list.append(image_vq_id)
207
+
208
+ torch.cuda.empty_cache()
209
+
210
+ print('decoding images ...')
211
+ if not os.path.exists(image_save_pth):
212
+ os.makedirs(image_save_pth)
213
+ for datainfo, vq_code in zip(chunk, save_list[0]):
214
+ idx = datainfo['Index']
215
+ new_gen_ids = vq_code.unsqueeze(0).to('cuda')
216
+ rec_image = vq_model.idx_to_img(new_gen_ids)
217
+ rec_img = PILtransform(rec_image.squeeze(0).add(1).mul_(0.5).clamp_(0, 1))
218
+ rec_img.save('{}/{}.jpg'.format(image_save_pth, str(idx)))
219
+
220
+
221
+ if __name__ == '__main__':
222
+ parser = argparse.ArgumentParser('genai inference script', parents=[get_args_parser()])
223
+ args = parser.parse_args()
224
+ main(args)
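For reference, a toy run of the classifier-free-guidance mixing and the sampling helpers above; the vocabulary size, guidance scale, and sampling settings are made up for illustration:

import torch

logits = torch.randn(4, 1, 1000)                       # conditional (2) + unconditional (2) logits stacked on the batch dim
cond, uncond = torch.split(logits, logits.shape[0] // 2, dim=0)
cfg_logits = uncond + (cond - uncond) * 5.0            # same mixing rule as in the generation loop above
next_token, probs = sample(cfg_logits, temperature=0.9, top_k=50, top_p=0.96)
print(next_token.shape)                                # (2, 1): one sampled token id per conditional prompt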
tools.py ADDED
@@ -0,0 +1,126 @@
1
+ import datetime
2
+ import logging
3
+ import logging.handlers
4
+ import os
5
+ import sys
6
+
7
+ import requests
8
+
9
+ from constants import LOGDIR
10
+
11
+ server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
12
+ moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
13
+
14
+ handler = None
15
+
16
+
17
+ def build_logger(logger_name, logger_filename):
18
+ global handler
19
+
20
+ formatter = logging.Formatter(
21
+ fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
22
+ datefmt="%Y-%m-%d %H:%M:%S",
23
+ )
24
+
25
+ # Set the format of root handlers
26
+ if not logging.getLogger().handlers:
27
+ logging.basicConfig(level=logging.INFO)
28
+ logging.getLogger().handlers[0].setFormatter(formatter)
29
+
30
+ # Redirect stdout and stderr to loggers
31
+ stdout_logger = logging.getLogger("stdout")
32
+ stdout_logger.setLevel(logging.INFO)
33
+ sl = StreamToLogger(stdout_logger, logging.INFO)
34
+ sys.stdout = sl
35
+
36
+ stderr_logger = logging.getLogger("stderr")
37
+ stderr_logger.setLevel(logging.ERROR)
38
+ sl = StreamToLogger(stderr_logger, logging.ERROR)
39
+ sys.stderr = sl
40
+
41
+ # Get logger
42
+ logger = logging.getLogger(logger_name)
43
+ logger.setLevel(logging.INFO)
44
+
45
+ # Add a file handler for all loggers
46
+ if handler is None:
47
+ os.makedirs(LOGDIR, exist_ok=True)
48
+ filename = os.path.join(LOGDIR, logger_filename)
49
+ handler = logging.handlers.TimedRotatingFileHandler(
50
+ filename, when='D', utc=True, encoding='UTF-8')
51
+ handler.setFormatter(formatter)
52
+
53
+ for name, item in logging.root.manager.loggerDict.items():
54
+ if isinstance(item, logging.Logger):
55
+ item.addHandler(handler)
56
+
57
+ return logger
58
+
59
+
60
+ class StreamToLogger(object):
61
+ """
62
+ Fake file-like stream object that redirects writes to a logger instance.
63
+ """
64
+ def __init__(self, logger, log_level=logging.INFO):
65
+ self.terminal = sys.stdout
66
+ self.logger = logger
67
+ self.log_level = log_level
68
+ self.linebuf = ''
69
+
70
+ def __getattr__(self, attr):
71
+ return getattr(self.terminal, attr)
72
+
73
+ def write(self, buf):
74
+ temp_linebuf = self.linebuf + buf
75
+ self.linebuf = ''
76
+ for line in temp_linebuf.splitlines(True):
77
+ # From the io.TextIOWrapper docs:
78
+ # On output, if newline is None, any '\n' characters written
79
+ # are translated to the system default line separator.
80
+ # By default sys.stdout.write() expects '\n' newlines and then
81
+ # translates them so this is still cross platform.
82
+ if line[-1] == '\n':
83
+ self.logger.log(self.log_level, line.rstrip())
84
+ else:
85
+ self.linebuf += line
86
+
87
+ def flush(self):
88
+ if self.linebuf != '':
89
+ self.logger.log(self.log_level, self.linebuf.rstrip())
90
+ self.linebuf = ''
91
+
92
+
93
+ def disable_torch_init():
94
+ """
95
+ Disable the redundant torch default initialization to accelerate model creation.
96
+ """
97
+ import torch
98
+ setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
99
+ setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
100
+
101
+
102
+ def violates_moderation(text):
103
+ """
104
+ Check whether the text violates OpenAI moderation API.
105
+ """
106
+ url = "https://api.openai.com/v1/moderations"
107
+ headers = {"Content-Type": "application/json",
108
+ "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
109
+ text = text.replace("\n", "")
110
+ data = "{" + '"input": ' + f'"{text}"' + "}"
111
+ data = data.encode("utf-8")
112
+ try:
113
+ ret = requests.post(url, headers=headers, data=data, timeout=5)
114
+ flagged = ret.json()["results"][0]["flagged"]
115
+ except requests.exceptions.RequestException as e:
116
+ flagged = False
117
+ except KeyError as e:
118
+ flagged = False
119
+
120
+ return flagged
121
+
122
+
123
+ def pretty_print_semaphore(semaphore):
124
+ if semaphore is None:
125
+ return "None"
126
+ return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
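A hypothetical usage of the helpers above; the logger name and log filename are placeholders, and the rotating log file is written under constants.LOGDIR:

from tools import build_logger, disable_torch_init

disable_torch_init()                        # skip torch's default weight init when a checkpoint is loaded anyway
logger = build_logger("demo", "demo.log")   # also redirects stdout/stderr into the rotating log file
logger.info("model loaded")
print("this line is captured by the 'stdout' logger as well")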
unitok/config.py ADDED
@@ -0,0 +1,243 @@
1
+ import os
2
+ import sys
3
+ import torch
4
+ import random
5
+ import numpy as np
6
+ from tap import Tap
7
+ from typing import Optional, Union
8
+ from collections import OrderedDict
9
+
10
+ from unitok import dist
11
+
12
+
13
+ class Args(Tap):
14
+ model: str = 'vitamin_large' # 'vitamin_base', 'vitamin_large', xxx
15
+ exp_name: str = 'unitok_large'
16
+ output_dir: str = 'local_output'
17
+ resume_from: str = '' # if specified, load this checkpoint; if not, load the latest checkpoint in output_dir (if exists)
18
+ lpips_path: str = 'external/lpips_with_vgg.pth'
19
+ dino_path: str = 'external/dinov2_vits14_pretrain.pth'
20
+ fid_eval_src: str = ''
21
+ fid_eval_dst: str = ''
22
+ vis_img_dir: str = 'asset/vis_imgs/'
23
+ fid_feature_extractor: str = 'external/weights-inception-2015-12-05-6726825d.pth'
24
+ clip_pretrain_path: str = ''
25
+
26
+ # speed-up
27
+ fp16: bool = False # whether to use FP16
28
+ bf16: bool = True # whether to use BF16
29
+ tf32: bool = True # whether to use TensorFloat32
30
+ compile_model: bool = False # whether to use torch.compile()
31
+ ddp_static: bool = False # whether to use static graph in DDP
32
+ grad_ckpt: bool = True # gradient checkpointing
33
+ grad_accu: int = 1 # gradient accumulation
34
+ device: str = 'cpu' # will be set automatically
35
+ dtype: torch.dtype = torch.float32 # will be set automatically
36
+
37
+ # data
38
+ train_data: str = None
39
+ val_data: str = None
40
+ dataset_type: str = 'webdataset'
41
+ imagenet_val: str = None
42
+ imagenet_v2: str = None
43
+ subset_ratio: float = 1.0
44
+ img_size: int = 256
45
+ resize_ratio: float = 1.125 # only applicable to 'img' dataset_type
46
+ hflip: bool = False
47
+ workers: int = 8 # num workers; 0: auto, -1: don't use multiprocessing in DataLoader
48
+ train_num_samples: int = 1280_000_000
49
+ train_data_upsampling_factors: str = None
50
+ dataset_resampled: bool = False
51
+ use_aug: bool = False
52
+
53
+ # quantizer
54
+ vocab_size: int = 32768
55
+ vocab_width: int = 64
56
+ vocab_norm: bool = True
57
+ vq_beta: float = 0.25 # commitment loss weight
58
+ num_codebooks: int = 8
59
+ quant_proj: str = 'attn'
60
+
61
+ # model
62
+ embed_dim: int = 768
63
+ num_query: int = 0
64
+ use_clip_pretrain: bool = False
65
+ patch_size: int = 16
66
+ drop_path: float = 0.1
67
+ text_width: int = 768
68
+ text_heads: int = 12
69
+ text_layers: int = 12
70
+ text_vocab_size: int = 49408
71
+ text_context_length: int = 77
72
+
73
+ # CLIP
74
+ local_loss: bool = True
75
+ gather_with_grad: bool = True
76
+ pretrained_clip: str = None
77
+ pretrained_clip_text: str = None
78
+ lock_text: bool = False
79
+ lock_text_unlocked_layers: int = 0
80
+ lock_text_freeze_layer_norm: bool = False
81
+ force_custom_text: bool = False
82
+ force_custom_vision: bool = False
83
+ zeroshot_eval_freq: int = 1
84
+
85
+ # discriminator
86
+ dino_depth: int = 12
87
+ dino_kernel_size: int = 9
88
+ disc_norm: str = 'gn' # gn: group norm, bn: batch norm, sbn: sync batch norm, hbn: hybrid sync batch norm
89
+ disc_aug_prob: float = 1.0
90
+ disc_specnorm: bool = False
91
+ step_disc_every: int = 1
92
+
93
+ # initialization
94
+ vae_init: float = -0.5 # <0: xavier_normal_(gain=abs(init)); >0: trunc_normal_(std=init)
95
+ vocab_init: float = -1 # <0: uniform(-abs(init)*base, abs(init)*base), where base = 20/vocab_size; >0: trunc_normal_(std=init)
96
+ disc_init: float = -0.5 # <0: xavier_normal_(gain=abs(init)); >0: trunc_normal_(std=init)
97
+
98
+ # optimization
99
+ epoch: int = 1 # number of epochs
100
+ local_bs: int = 64 # batch size per device; if this is specified, --global_bs will be ignored
101
+ vae_local_bs: int = 64 # sub-batch size for vae loss calculation
102
+ global_bs: int = 0 # global batch size (exclusive to --local_bs)
103
+ lr: float = 5e-4 # learning rate
104
+ wd: float = 0.02 # weight decay
105
+ disc_lr: float = 2e-5 # disc lr
106
+ disc_wd: float = 0.2
107
+ grad_clip: float = 10 # <=0 for not using grad clip
108
+ ema: float = 0.9999 # ema ratio
109
+ warmup_iter: int = None
110
+ warmup_ep: float = 0.01 # lr warmup: epochs
111
+ disc_start_ep: float = 0.375 # start using disc loss for VAE after xxx epochs;
112
+ disc_warmup_ep: float = 0.03 # disc loss warm up epochs;
113
+ schedule: str = 'cos' # lr schedule type
114
+ lr_start_ratio: float = 0. # lr warmup: initial lr ratio
115
+ lr_end_ratio: float = 0.1 # lr schedule: final lr ratio
116
+ disc_lr_end_ratio: float = 0.1
117
+ custom_lr_multiplier: float = None
118
+ optimizer: str = 'adamw'
119
+ optim_eps: float = 1e-6
120
+ fuse_opt: bool = False # whether to use fused optimizer
121
+ optim_beta: str = '0.9_0.95' # beta1, beta2 of optimizer
122
+ disc_optim_beta: str = '0.5_0.9' # beta1, beta2 of disc optimizer
123
+
124
+ # loss
125
+ l1: float = 0.2 # L1 rec loss weight
126
+ l2: float = 1.0 # L2 rec loss weight
127
+ lp: float = 1.0 # lpips loss weight
128
+ lpr: int = 48 # only calculate lpips >= this image resolution
129
+ ld: float = 0.4 # discriminator loss weight; if <0: NO ADAPTIVE WEIGHT
130
+ le: float = 0.0 # VQ entropy loss weight
131
+ lq: float = 1.0
132
+ lc: float = 1.0 # CLIP loss weight
133
+ e_temp: float = 0.01
134
+ gada: int = 1
135
+ bcr: float = 4. # balanced Consistency Regularization, used on small dataset with low reso, StyleSwin: 10.0
136
+ bcr_cut: float = 0.2 # cutout ratio (0.5: 50% width)
137
+ dcrit: str = 'hg' # hg hinge, sp softplus, ln linear
138
+
139
+ # wandb log
140
+ report_wandb: bool = True
141
+ wandb_notes: str = None
142
+ run_id: str = None
143
+
144
+ # debug
145
+ eval_per_epoch: int = 8
146
+ dbg_unused_param: bool = False
147
+ dbg_nan: bool = False # 'KEVIN_LOCAL' in os.environ
148
+ seed: int = None
149
+ deterministic: bool = False
150
+ same_seed_for_all_ranks: int = 0 # this is only for distributed sampler
151
+
152
+
153
+ def seed_everything(self):
154
+ torch.backends.cudnn.enabled = True
155
+ torch.backends.cudnn.benchmark = True
156
+ torch.backends.cudnn.deterministic = False
157
+ if self.seed is not None:
158
+ if self.deterministic:
159
+ torch.backends.cudnn.benchmark = False
160
+ torch.backends.cudnn.deterministic = True
161
+ torch.use_deterministic_algorithms(True)
162
+ os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'
163
+ seed = self.seed + dist.get_rank() * 10000
164
+ os.environ['PYTHONHASHSEED'] = str(seed)
165
+ random.seed(seed)
166
+ np.random.seed(seed)
167
+ torch.manual_seed(seed)
168
+ torch.cuda.manual_seed(seed)
169
+ torch.cuda.manual_seed_all(seed)
170
+
171
+ def get_different_generator_for_each_rank(self) -> Optional[torch.Generator]: # for random augmentation
172
+ if self.seed is None:
173
+ return None
174
+ g = torch.Generator()
175
+ g.manual_seed(self.seed * dist.get_world_size() + dist.get_rank())
176
+ return g
177
+
178
+ def state_dict(self, key_ordered=True) -> Union[OrderedDict, dict]:
179
+ d = (OrderedDict if key_ordered else dict)()
180
+ for k in self.class_variables.keys():
181
+ if k not in {'device'}: # these are not serializable
182
+ d[k] = getattr(self, k)
183
+ return d
184
+
185
+ def load_state_dict(self, state_dict):
186
+ for k, v in state_dict.items():
187
+ try:
188
+ setattr(self, k, v)
189
+ except Exception as e:
190
+ print(f'k={k}, v={v}')
191
+ raise e
192
+
193
+ @staticmethod
194
+ def set_tf32(tf32: bool):
195
+ if torch.cuda.is_available():
196
+ torch.backends.cudnn.allow_tf32 = bool(tf32)
197
+ torch.backends.cuda.matmul.allow_tf32 = bool(tf32)
198
+ if hasattr(torch, 'set_float32_matmul_precision'):
199
+ torch.set_float32_matmul_precision('high' if tf32 else 'highest')
200
+ print(f'[tf32] [precis] torch.get_float32_matmul_precision(): {torch.get_float32_matmul_precision()}')
201
+ print(f'[tf32] [ conv ] torch.backends.cudnn.allow_tf32: {torch.backends.cudnn.allow_tf32}')
202
+ print(f'[tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: {torch.backends.cuda.matmul.allow_tf32}')
203
+
204
+ def __str__(self):
205
+ s = []
206
+ for k in self.class_variables.keys():
207
+ if k not in {'device', 'dbg_ks_fp'}: # these are not serializable
208
+ s.append(f' {k:20s}: {getattr(self, k)}')
209
+ s = '\n'.join(s)
210
+ return f'{{\n{s}\n}}\n'
211
+
212
+
213
+ def init_dist_and_get_args():
214
+ for i in range(len(sys.argv)):
215
+ if sys.argv[i].startswith('--local-rank=') or sys.argv[i].startswith('--local_rank='):
216
+ del sys.argv[i]
217
+ break
218
+
219
+ args = Args(explicit_bool=True).parse_args(known_only=True)
220
+ # warn args.extra_args
221
+ if len(args.extra_args) > 0:
222
+ print(f'======================================================================================')
223
+ print(f'=========================== WARNING: UNEXPECTED EXTRA ARGS ===========================\n{args.extra_args}')
224
+ print(f'=========================== WARNING: UNEXPECTED EXTRA ARGS ===========================')
225
+ print(f'======================================================================================\n\n')
226
+
227
+ # init torch distributed
228
+ os.makedirs(args.output_dir, exist_ok=True)
229
+ dist.init_distributed_mode(local_out_path=args.output_dir, timeout_minutes=30)
230
+
231
+ # set env
232
+ args.set_tf32(args.tf32)
233
+ args.seed_everything()
234
+ args.device = dist.get_device()
235
+
236
+ # update args
237
+ if args.local_bs == 0:
238
+ args.local_bs = max(1, round(args.global_bs / args.grad_accu / dist.get_world_size()))
239
+ args.global_bs = args.local_bs * dist.get_world_size()
240
+ if args.fp16 or args.bf16:
241
+ args.dtype = torch.float16 if args.fp16 else torch.bfloat16
242
+
243
+ return args
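Restoring a serialized Args from a UniTok checkpoint follows the same pattern t2i.py uses; the checkpoint path below is a placeholder:

import torch
from unitok.config import Args

ckpt = torch.load('unitok_checkpoint.pth', map_location='cpu')
vae_cfg = Args()
vae_cfg.load_state_dict(ckpt['args'])        # replay the saved class variables onto a fresh Args
print(vae_cfg.num_codebooks, vae_cfg.vocab_size, vae_cfg.img_size)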
unitok/dist.py ADDED
@@ -0,0 +1,302 @@
1
+ import datetime
2
+ import functools
3
+ import os
4
+ import sys
5
+ from typing import List
6
+ from typing import Union
7
+
8
+ import pytz
9
+ import torch
10
+ import torch.distributed as tdist
11
+ import torch.multiprocessing as mp
12
+
13
+ __rank, __local_rank, __world_size, __device = 0, 0, 1, 'cuda' if torch.cuda.is_available() else 'cpu'
14
+ __rank_str_zfill = '0'
15
+ __initialized = False
16
+
17
+
18
+ def initialized():
19
+ return __initialized
20
+
21
+
22
+ def __initialize(fork=False, backend='nccl', gpu_id_if_not_distibuted=0, timeout_minutes=30):
23
+ global __device
24
+ if not torch.cuda.is_available():
25
+ print(f'[dist initialize] cuda is not available, use cpu instead', file=sys.stderr)
26
+ return
27
+ elif 'RANK' not in os.environ:
28
+ torch.cuda.set_device(gpu_id_if_not_distibuted)
29
+ __device = torch.empty(1).cuda().device
30
+ print(f'[dist initialize] env variable "RANK" is not set, use {__device} as the device', file=sys.stderr)
31
+ return
32
+ # then 'RANK' must exist
33
+ global_rank, num_gpus = int(os.environ['RANK']), torch.cuda.device_count()
34
+ local_rank = global_rank % num_gpus
35
+ torch.cuda.set_device(local_rank)
36
+
37
+ # ref: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py#L29
38
+ if mp.get_start_method(allow_none=True) is None:
39
+ method = 'fork' if fork else 'spawn'
40
+ print(f'[dist initialize] mp method={method}')
41
+ mp.set_start_method(method)
42
+ tdist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=timeout_minutes * 60))
43
+
44
+ global __rank, __local_rank, __world_size, __initialized, __rank_str_zfill
45
+ __local_rank = local_rank
46
+ __rank, __world_size = tdist.get_rank(), tdist.get_world_size()
47
+ __rank_str_zfill = str(__rank).zfill(len(str(__world_size)))
48
+ __device = torch.empty(1).cuda().device
49
+ __initialized = True
50
+
51
+ assert tdist.is_initialized(), 'torch.distributed is not initialized!'
52
+ print(f'[lrk={get_local_rank()}, rk={get_rank()}]')
53
+
54
+
55
+ def get_rank():
56
+ return __rank
57
+
58
+
59
+ def get_rank_str_zfill():
60
+ return __rank_str_zfill
61
+
62
+
63
+ def get_local_rank():
64
+ return __local_rank
65
+
66
+
67
+ def get_world_size():
68
+ return __world_size
69
+
70
+
71
+ def get_device():
72
+ return __device
73
+
74
+
75
+ def set_gpu_id(gpu_id: int):
76
+ if gpu_id is None: return
77
+ global __device
78
+ if isinstance(gpu_id, (str, int)):
79
+ torch.cuda.set_device(int(gpu_id))
80
+ __device = torch.empty(1).cuda().device
81
+ else:
82
+ raise NotImplementedError
83
+
84
+
85
+ def is_master():
86
+ return __rank == 0
87
+
88
+
89
+ def is_local_master():
90
+ return __local_rank == 0
91
+
92
+
93
+ def new_group(ranks: List[int]):
94
+ if __initialized:
95
+ return tdist.new_group(ranks=ranks)
96
+ return None
97
+
98
+
99
+ def new_local_machine_group():
100
+ if __initialized:
101
+ cur_subgroup, subgroups = tdist.new_subgroups()
102
+ return cur_subgroup
103
+ return None
104
+
105
+
106
+ def barrier():
107
+ if __initialized:
108
+ tdist.barrier()
109
+
110
+
111
+ def allreduce(t: torch.Tensor, async_op=False):
112
+ if __initialized:
113
+ if not t.is_cuda:
114
+ cu = t.detach().cuda()
115
+ ret = tdist.all_reduce(cu, async_op=async_op)
116
+ t.copy_(cu.cpu())
117
+ else:
118
+ ret = tdist.all_reduce(t, async_op=async_op)
119
+ return ret
120
+ return None
121
+
122
+
123
+ def allgather(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]:
124
+ if __initialized:
125
+ if not t.is_cuda:
126
+ t = t.cuda()
127
+ ls = [torch.empty_like(t) for _ in range(__world_size)]
128
+ tdist.all_gather(ls, t)
129
+ else:
130
+ ls = [t]
131
+ if cat:
132
+ ls = torch.cat(ls, dim=0)
133
+ return ls
134
+
135
+
136
+ def allgather_diff_shape(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]:
137
+ if __initialized:
138
+ if not t.is_cuda:
139
+ t = t.cuda()
140
+
141
+ t_size = torch.tensor(t.size(), device=t.device)
142
+ ls_size = [torch.empty_like(t_size) for _ in range(__world_size)]
143
+ tdist.all_gather(ls_size, t_size)
144
+
145
+ max_B = max(size[0].item() for size in ls_size)
146
+ pad = max_B - t_size[0].item()
147
+ if pad:
148
+ pad_size = (pad, *t.size()[1:])
149
+ t = torch.cat((t, t.new_empty(pad_size)), dim=0)
150
+
151
+ ls_padded = [torch.empty_like(t) for _ in range(__world_size)]
152
+ tdist.all_gather(ls_padded, t)
153
+ ls = []
154
+ for t, size in zip(ls_padded, ls_size):
155
+ ls.append(t[:size[0].item()])
156
+ else:
157
+ ls = [t]
158
+ if cat:
159
+ ls = torch.cat(ls, dim=0)
160
+ return ls
161
+
162
+
163
+ def broadcast(t: torch.Tensor, src_rank) -> None:
164
+ if __initialized:
165
+ if not t.is_cuda:
166
+ cu = t.detach().cuda()
167
+ tdist.broadcast(cu, src=src_rank)
168
+ t.copy_(cu.cpu())
169
+ else:
170
+ tdist.broadcast(t, src=src_rank)
171
+
172
+
173
+ def dist_fmt_vals(val: float, fmt: Union[str, None] = '%.2f') -> Union[torch.Tensor, List]:
174
+ if not initialized():
175
+ return torch.tensor([val]) if fmt is None else [fmt % val]
176
+
177
+ ts = torch.zeros(__world_size)
178
+ ts[__rank] = val
179
+ allreduce(ts)
180
+ if fmt is None:
181
+ return ts
182
+ return [fmt % v for v in ts.cpu().numpy().tolist()]
183
+
184
+
185
+ def master_only(func):
186
+ @functools.wraps(func)
187
+ def wrapper(*args, **kwargs):
188
+ force = kwargs.pop('force', False)
189
+ if force or is_master():
190
+ ret = func(*args, **kwargs)
191
+ else:
192
+ ret = None
193
+ barrier()
194
+ return ret
195
+ return wrapper
196
+
197
+
198
+ def local_master_only(func):
199
+ @functools.wraps(func)
200
+ def wrapper(*args, **kwargs):
201
+ force = kwargs.pop('force', False)
202
+ if force or is_local_master():
203
+ ret = func(*args, **kwargs)
204
+ else:
205
+ ret = None
206
+ barrier()
207
+ return ret
208
+ return wrapper
209
+
210
+
211
+ def for_visualize(func):
212
+ @functools.wraps(func)
213
+ def wrapper(*args, **kwargs):
214
+ if is_master():
215
+ # with torch.no_grad():
216
+ ret = func(*args, **kwargs)
217
+ else:
218
+ ret = None
219
+ return ret
220
+ return wrapper
221
+
222
+
223
+ def finalize():
224
+ if __initialized:
225
+ tdist.destroy_process_group()
226
+
227
+
228
+ def init_distributed_mode(local_out_path, only_sync_master=False, timeout_minutes=30):
229
+ try:
230
+ __initialize(fork=False, timeout_minutes=timeout_minutes)
231
+ barrier()
232
+ except RuntimeError as e:
233
+ print(f'{"!"*80} dist init error (NCCL Error?), stopping training! {"!"*80}', flush=True)
234
+ raise e
235
+
236
+ if local_out_path is not None: os.makedirs(local_out_path, exist_ok=True)
237
+ _change_builtin_print(is_local_master())
238
+ if (is_master() if only_sync_master else is_local_master()) and local_out_path is not None and len(local_out_path):
239
+ sys.stdout, sys.stderr = BackupStreamToFile(local_out_path, for_stdout=True), BackupStreamToFile(local_out_path, for_stdout=False)
240
+
241
+
242
+ def _change_builtin_print(is_master):
243
+ import builtins as __builtin__
244
+
245
+ builtin_print = __builtin__.print
246
+ if type(builtin_print) != type(open):
247
+ return
248
+
249
+ def prt(*args, **kwargs):
250
+ force = kwargs.pop('force', False)
251
+ clean = kwargs.pop('clean', False)
252
+ deeper = kwargs.pop('deeper', False)
253
+ if is_master or force:
254
+ if not clean:
255
+ f_back = sys._getframe().f_back
256
+ if deeper and f_back.f_back is not None:
257
+ f_back = f_back.f_back
258
+ file_desc = f'{f_back.f_code.co_filename:24s}'[-24:]
259
+ time_str = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime('[%m-%d %H:%M:%S]')
260
+ builtin_print(f'{time_str} ({file_desc}, line{f_back.f_lineno:-4d})=>', *args, **kwargs)
261
+ else:
262
+ builtin_print(*args, **kwargs)
263
+
264
+ __builtin__.print = prt
265
+
266
+
267
+ class BackupStreamToFile(object):
268
+ def __init__(self, local_output_dir, for_stdout=True):
269
+ self.for_stdout = for_stdout
270
+ self.terminal_stream = sys.stdout if for_stdout else sys.stderr
271
+ fname = os.path.join(local_output_dir, 'backup1_stdout.txt' if for_stdout else 'backup2_stderr.txt')
272
+ existing = os.path.exists(fname)
273
+ self.file_stream = open(fname, 'a')
274
+ if existing:
275
+ time_str = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime('[%m-%d %H:%M:%S]')
276
+ self.file_stream.write('\n'*7 + '='*55 + f' RESTART {time_str} ' + '='*55 + '\n')
277
+ self.file_stream.flush()
278
+ self.enabled = True
279
+
280
+ def write(self, message):
281
+ self.terminal_stream.write(message)
282
+ self.file_stream.write(message)
283
+
284
+ def flush(self):
285
+ self.terminal_stream.flush()
286
+ self.file_stream.flush()
287
+
288
+ def close(self):
289
+ if not self.enabled:
290
+ return
291
+ self.enabled = False
292
+ self.file_stream.flush()
293
+ self.file_stream.close()
294
+ if self.for_stdout:
295
+ sys.stdout = self.terminal_stream
296
+ sys.stdout.flush()
297
+ else:
298
+ sys.stderr = self.terminal_stream
299
+ sys.stderr.flush()
300
+
301
+ def __del__(self):
302
+ self.close()
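These helpers degrade gracefully when torch.distributed is never initialized, which makes single-GPU debugging easy; a single-process sketch:

from unitok import dist

@dist.master_only
def save_checkpoint(path):
    print(f'saving to {path}')           # runs on rank 0 only; other ranks just hit the barrier and get None

save_checkpoint('ckpt.pth')              # without distributed init the rank is 0, so this prints
print(dist.dist_fmt_vals(0.1234))        # ['0.12'] -- one formatted value per rank (a single rank here)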
unitok/model.py ADDED
@@ -0,0 +1,184 @@
1
+ import timm
2
+ import torch
3
+ import numpy as np
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from contextlib import nullcontext
7
+
8
+ from unitok.vitamin import GeGluMlp, ViTaminDecoder
9
+ from unitok.quant import VectorQuantizerM
10
+ from unitok.vqvae import AttnProjection
11
+
12
+
13
+ class UniTok(nn.Module):
14
+ def __init__(self, args):
15
+ super().__init__()
16
+
17
+ self.num_query = args.num_query
18
+
19
+ self.encoder = timm.create_model(
20
+ args.model,
21
+ patch_size=1,
22
+ fc_norm=False,
23
+ drop_rate=0.0,
24
+ num_classes=0,
25
+ global_pool='',
26
+ pos_embed='none',
27
+ class_token=False,
28
+ mlp_layer=GeGluMlp,
29
+ reg_tokens=args.num_query,
30
+ img_size=args.img_size,
31
+ drop_path_rate=args.drop_path,
32
+ )
33
+ self.encoder.pos_embed = nn.Parameter(torch.zeros(1, 1, self.encoder.embed_dim), requires_grad=False)
34
+
35
+ if args.quant_proj == 'linear':
36
+ self.quant_proj = nn.Linear(self.encoder.embed_dim, args.vocab_width)
37
+ elif args.quant_proj == 'attn':
38
+ self.quant_proj = AttnProjection(self.encoder.embed_dim, args.vocab_width, self.encoder.embed_dim // args.vocab_width)
39
+ else:
40
+ raise NotImplementedError
41
+
42
+ self.quantizer = VectorQuantizerM(
43
+ vocab_size=args.vocab_size,
44
+ vocab_width=args.vocab_width,
45
+ beta=args.vq_beta,
46
+ use_entropy_loss=args.le > 0,
47
+ entropy_temp=args.e_temp,
48
+ num_codebooks=args.num_codebooks,
49
+ )
50
+
51
+ if args.quant_proj == 'linear':
52
+ self.post_quant_proj = nn.Linear(args.vocab_width, self.encoder.embed_dim)
53
+ elif args.quant_proj == 'attn':
54
+ self.post_quant_proj = AttnProjection(args.vocab_width, self.encoder.embed_dim, self.encoder.embed_dim // args.vocab_width)
55
+ else:
56
+ raise NotImplementedError
57
+
58
+ self.decoder = ViTaminDecoder(
59
+ args.model,
60
+ num_query=args.num_query,
61
+ img_size=args.img_size,
62
+ drop_path=args.drop_path,
63
+ grad_ckpt=args.grad_ckpt,
64
+ )
65
+
66
+ text_cfg = {
67
+ "width": args.text_width,
68
+ "heads": args.text_heads,
69
+ "layers": args.text_layers,
70
+ "vocab_size": args.text_vocab_size,
71
+ "context_length": args.text_context_length,
72
+ }
73
+ from open_clip.model import _build_text_tower
74
+ self.text_encoder = _build_text_tower(args.embed_dim, text_cfg)
75
+
76
+ self.fc_norm = nn.LayerNorm(self.encoder.embed_dim, eps=1e-6)
77
+ self.projection = nn.Linear(self.encoder.embed_dim, args.embed_dim)
78
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
79
+
80
+ self.context_length = self.text_encoder.context_length
81
+ self.vocab_size = self.text_encoder.vocab_size
82
+ self.maybe_record_function = nullcontext
83
+
84
+ self.text_no_grad = False
85
+ self.encoder.set_grad_checkpointing(args.grad_ckpt)
86
+ self.text_encoder.set_grad_checkpointing(args.grad_ckpt)
87
+
88
+ def forward(self, img, vae_bs, text=None, ret_usages=False):
89
+ img_tokens = self.encoder(img).float()
90
+ with torch.cuda.amp.autocast(enabled=False):
91
+ img_tokens = torch.utils.checkpoint.checkpoint(self.quant_proj, img_tokens, use_reentrant=False)
92
+ img_tokens, vq_loss, entropy_loss, usages = self.quantizer(img_tokens)
93
+ img_tokens = torch.utils.checkpoint.checkpoint(self.post_quant_proj, img_tokens, use_reentrant=False)
94
+ img_rec = self.decoder(img_tokens[:vae_bs]).float()
95
+
96
+ clip_visual = img_tokens.mean(dim=1)
97
+ clip_visual = self.projection(self.fc_norm(clip_visual))
98
+ clip_visual = F.normalize(clip_visual, dim=-1)
99
+ if text is not None:
100
+ clip_text = self.text_encoder(text)
101
+ clip_text = F.normalize(clip_text, dim=-1)
102
+ else:
103
+ clip_text = None
104
+
105
+ output_dict = {
106
+ "img_rec": img_rec,
107
+ "vq_loss": vq_loss,
108
+ "entropy_loss": entropy_loss,
109
+ "codebook_usages": usages,
110
+ "clip_image_features": clip_visual,
111
+ "clip_text_features": clip_text,
112
+ "logit_scale": self.logit_scale.exp()
113
+ }
114
+ return output_dict
115
+
116
+ def encode_image(self, image, normalize: bool = False):
117
+ img_tokens = self.encoder(image)
118
+ img_tokens = self.quant_proj(img_tokens)
119
+ img_indices = self.quantizer.f_to_idx(img_tokens)
120
+ img_tokens = self.quantizer.idx_to_f(img_indices)
121
+ img_tokens = self.post_quant_proj(img_tokens)
122
+ features = img_tokens.mean(dim=1)
123
+ features = self.projection(self.fc_norm(features))
124
+ return F.normalize(features, dim=-1) if normalize else features
125
+
126
+ def encode_text(self, text, normalize: bool = False):
127
+ features = self.text_encoder(text)
128
+ return F.normalize(features, dim=-1) if normalize else features
129
+
130
+ def img_to_idx(self, img):
131
+ features = self.encoder(img).float()
132
+ features = self.quant_proj(features)
133
+ return self.quantizer.f_to_idx(features)
134
+
135
+ def idx_to_img(self, indices):
136
+ features = self.quantizer.idx_to_f(indices)
137
+ features = self.post_quant_proj(features)
138
+ img = self.decoder(features).clamp_(-1, 1)
139
+ return img
140
+
141
+ def img_to_reconstructed_img(self, image) -> torch.Tensor:
142
+ img_tokens = self.encoder(image)
143
+ img_tokens = self.quant_proj(img_tokens)
144
+ img_tokens, _, _, _ = self.quantizer(img_tokens)
145
+ img_tokens = self.post_quant_proj(img_tokens)
146
+ img_rec = self.decoder(img_tokens).clamp_(-1, 1)
147
+ return img_rec
148
+
149
+ def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True, unlock_text_proj=False):
150
+ self.text.lock(unlocked_layers, freeze_layer_norm, unlock_text_proj)
151
+ self.text_no_grad = True
152
+
153
+
154
+ if __name__ == '__main__':
155
+ model = timm.create_model(
156
+ 'vitamin_base',
157
+ patch_size=1,
158
+ fc_norm=True,
159
+ drop_rate=0.0,
160
+ num_classes=0,
161
+ global_pool='',
162
+ pos_embed='none',
163
+ class_token=False,
164
+ mlp_layer=GeGluMlp,
165
+ reg_tokens=0,
166
+ img_size=256,
167
+ drop_path_rate=0.1,
168
+ )
169
+ model.pos_embed = nn.Parameter(torch.zeros(1, 1, model.embed_dim), requires_grad=False)
170
+
171
+ model_dict = model.state_dict()
172
+ ckpt_dict = torch.load('ViTamin-B/pytorch_model.bin')
173
+ visual_dict = dict()
174
+ for k, v in ckpt_dict.items():
175
+ if k.startswith('visual.'):
176
+ if 'head' in k or 'pos_embed' in k:
177
+ continue
178
+ new_k = k.replace('visual.trunk.', '')
179
+ visual_dict[new_k] = v
180
+
181
+ model.load_state_dict(visual_dict, strict=False)
182
+ print(set(model_dict.keys()) - set(visual_dict.keys()))
183
+ print(set(visual_dict.keys() - set(model_dict.keys())))
184
+
unitok/quant.py ADDED
@@ -0,0 +1,185 @@
1
+ import torch
2
+ from typing import List, Tuple
3
+ from torch.nn import functional as F
4
+ from torch import distributed as tdist, nn as nn
5
+
6
+ from unitok import dist
7
+
8
+
9
+ def get_entropy_loss(latent_embed, codebook_embed, inv_entropy_tau):
10
+ E_dist = latent_embed.square().sum(dim=1, keepdim=True) + codebook_embed.square().sum(dim=1, keepdim=False)
11
+ E_dist.addmm_(latent_embed, codebook_embed.T, alpha=-2, beta=1) # E_dist: (N, vocab_size)
12
+ logits = -E_dist.float().mul_(inv_entropy_tau)
13
+ # calc per_sample_entropy
14
+ prob, log_prob = logits.softmax(dim=-1), logits.log_softmax(dim=-1) # both are (N, vocab_size)
15
+ per_sample_entropy = torch.mean((-prob * log_prob).sum(dim=-1))
16
+ # calc codebook_entropy
17
+ avg_prob = prob.mean(dim=0) # (vocab_size,)
18
+ log_avg_prob = torch.log(avg_prob + 1e-7)
19
+ codebook_entropy = (-avg_prob * log_avg_prob).sum()
20
+ # calc entropy_loss
21
+ entropy_loss = per_sample_entropy - codebook_entropy
22
+ return entropy_loss
23
+
24
+
25
+ class NormalizedEmbedding(nn.Embedding):
26
+ def __init__(self, num_embeddings: int, embedding_dim: int):
27
+ super().__init__(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
28
+ # self.norm_scale = nn.Parameter(torch.tensor(0.0, dtype=torch.float32))
29
+
30
+ def forward(self, idx):
31
+ return F.embedding(
32
+ idx, F.normalize(self.weight, dim=1), self.padding_idx, self.max_norm,
33
+ self.norm_type, self.scale_grad_by_freq, self.sparse
34
+ )
35
+
36
+ def get_norm_weight(self):
37
+ return F.normalize(self.weight, dim=1)
38
+
39
+
40
+ class ResConv(nn.Conv2d):
41
+ def __init__(self, embed_dim, quant_resi):
42
+ ks = 3 if quant_resi < 0 else 1
43
+ super().__init__(in_channels=embed_dim, out_channels=embed_dim, kernel_size=ks, stride=1, padding=ks // 2)
44
+ self.resi_ratio = abs(quant_resi)
45
+
46
+ def forward(self, h_BChw):
47
+ return h_BChw.mul(1 - self.resi_ratio) + super().forward(h_BChw).mul_(self.resi_ratio)
48
+
49
+
50
+ class VectorQuantizer(nn.Module):
51
+ def __init__(
52
+ self,
53
+ vocab_size: int,
54
+ vocab_width: int,
55
+ beta: float = 0.25,
56
+ use_entropy_loss=False,
57
+ entropy_temp=0.01,
58
+ ):
59
+ super().__init__()
60
+ self.beta = beta
61
+ self.vocab_size = vocab_size
62
+ self.vocab_width = vocab_width
63
+ self.vocab_usage_record_times: int = 0
64
+ self.register_buffer('vocab_usage', torch.zeros(self.vocab_size))
65
+ self.codebook = NormalizedEmbedding(self.vocab_size, self.vocab_width)
66
+
67
+ self.use_entropy_loss = use_entropy_loss
68
+ self.inv_entropy_tau = 1 / entropy_temp
69
+
70
+ def init_vocab(self, eini: float):
71
+ if eini > 0:
72
+ nn.init.trunc_normal_(self.codebook.weight.data, std=eini)
73
+ elif eini < 0:
74
+ base = self.vocab_width ** -0.5
75
+ base /= 36
76
+ self.codebook.weight.data.uniform_(-abs(eini) * base, abs(eini) * base)
77
+
78
+ def extra_repr(self) -> str:
79
+ return f'beta={self.beta:g}'
80
+
81
+ def forward(self, features):
82
+ B, L, C = features.shape
83
+ features = features.reshape(-1, C)
84
+ features = F.normalize(features, dim=-1).float()
85
+ codebook_embed = self.codebook.get_norm_weight()
86
+ indices = torch.argmax(features.detach() @ codebook_embed.T, dim=1)
87
+ entropy_loss = get_entropy_loss(features, codebook_embed, self.inv_entropy_tau) if self.use_entropy_loss else 0
88
+ features_hat = self.codebook(indices)
89
+
90
+ # calc loss
91
+ vq_loss = F.mse_loss(features_hat.detach(), features).mul_(self.beta) + F.mse_loss(features_hat,
92
+ features.detach())
93
+ features_hat = (features_hat.detach() - features.detach()).add_(features)
94
+
95
+ # update vocab_usage
96
+ prob_per_class_is_chosen = indices.bincount(minlength=self.vocab_size).float()
97
+ handler = tdist.all_reduce(prob_per_class_is_chosen, async_op=True) if (
98
+ self.training and dist.initialized()) else None
99
+ if handler is not None:
100
+ handler.wait()
101
+ prob_per_class_is_chosen /= prob_per_class_is_chosen.sum()
102
+ vocab_usage = (prob_per_class_is_chosen > 0.01 / self.vocab_size).float().mean().mul_(100)
103
+ if self.vocab_usage_record_times == 0:
104
+ self.vocab_usage.copy_(prob_per_class_is_chosen)
105
+ elif self.vocab_usage_record_times < 100:
106
+ self.vocab_usage.mul_(0.9).add_(prob_per_class_is_chosen, alpha=0.1)
107
+ else:
108
+ self.vocab_usage.mul_(0.99).add_(prob_per_class_is_chosen, alpha=0.01)
109
+ self.vocab_usage_record_times += 1
110
+
111
+ return features_hat.view(B, L, C), vq_loss, entropy_loss, vocab_usage
112
+
113
+ def f_to_idx(self, features):
114
+ B, L, C = features.shape
115
+ features = features.reshape(-1, C)
116
+ features = F.normalize(features, dim=-1).float()
117
+ codebook_embed = self.codebook.get_norm_weight().float()
118
+ indices = torch.argmax(features.detach() @ codebook_embed.T, dim=1)
119
+ return indices.view(B, L)
120
+
121
+
122
+ class VectorQuantizerM(nn.Module):
123
+ def __init__(
124
+ self,
125
+ vocab_size,
126
+ vocab_width,
127
+ beta=0.25,
128
+ use_entropy_loss=False,
129
+ entropy_temp=0.01,
130
+ num_codebooks=16
131
+ ):
132
+ super().__init__()
133
+ self.num_codebooks = num_codebooks
134
+ self.codebooks = nn.ModuleList()
135
+ for _ in range(num_codebooks):
136
+ codebook = VectorQuantizer(
137
+ vocab_size=vocab_size // num_codebooks,
138
+ vocab_width=vocab_width // num_codebooks,
139
+ beta=beta,
140
+ use_entropy_loss=use_entropy_loss,
141
+ entropy_temp=entropy_temp,
142
+ )
143
+ self.codebooks.append(codebook)
144
+
145
+ def init_vocab(self, eini: float):
146
+ for codebook in self.codebooks:
147
+ codebook.init_vocab(eini)
148
+
149
+ def f_to_idx(self, features):
150
+ indices = []
151
+ chunk_size = features.shape[-1] // self.num_codebooks
152
+ splited_features = features.split(chunk_size, dim=-1)
153
+ for i, codebook in enumerate(self.codebooks):
154
+ indices.append(codebook.f_to_idx(splited_features[i]))
155
+ indices = torch.stack(indices, dim=1)
156
+ return indices
157
+
158
+ def idx_to_f(self, indices):
159
+ assert indices.shape[1] == self.num_codebooks
160
+ latent_features = []
161
+ for i, codebook in enumerate(self.codebooks):
162
+ sub_indices = indices[:, i].flatten(start_dim=1)
163
+ latent_feature = codebook.codebook(sub_indices)
164
+ latent_features.append(latent_feature)
165
+ latent_features = torch.cat(latent_features, dim=-1)
166
+ return latent_features
167
+
168
+ def forward(self, features):
169
+ latent_features = []
170
+ global_vq_loss = 0.
171
+ global_entropy_loss = 0.
172
+ global_vocab_usage = 0.
173
+ chunk_size = features.shape[-1] // self.num_codebooks
174
+ splited_features = features.split(chunk_size, dim=-1)
175
+ for i, codebook in enumerate(self.codebooks):
176
+ latent_feature, vq_loss, entropy_loss, vocab_usage = codebook(splited_features[i])
177
+ latent_features.append(latent_feature)
178
+ global_vq_loss += vq_loss
179
+ global_entropy_loss += entropy_loss
180
+ global_vocab_usage += vocab_usage
181
+ latent_features = torch.cat(latent_features, dim=-1)
182
+ global_entropy_loss /= self.num_codebooks
183
+ global_vq_loss /= self.num_codebooks
184
+ global_vocab_usage /= self.num_codebooks
185
+ return latent_features, global_vq_loss, global_entropy_loss, global_vocab_usage
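A shape sketch for the multi-codebook quantizer above; the sizes match the Args defaults (a 32768-entry vocabulary split across 8 codebooks, each holding 4096 entries of width 8), while the batch and token counts are arbitrary:

import torch
from unitok.quant import VectorQuantizerM

quant = VectorQuantizerM(vocab_size=32768, vocab_width=64, num_codebooks=8)
feats = torch.randn(2, 256, 64)                 # (batch, tokens, vocab_width)
out, vq_loss, ent_loss, usage = quant(feats)    # each 8-dim chunk is matched against its own 4096-entry codebook
idx = quant.f_to_idx(feats)                     # (2, 8, 256): one index row per codebook
rec = quant.idx_to_f(idx)                       # (2, 256, 64): concatenated codebook embeddings
print(out.shape, idx.shape, rec.shape)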
unitok/vitamin.py ADDED
@@ -0,0 +1,792 @@
1
+ """
2
+ TODO: FIXME:
3
+ /usr/local/lib/python3.9/dist-packages/torch/autograd/__init__.py:251: UserWarning: Grad strides do not match bucket view strides. This may indicate grad was not created according to the gradient layout contract, or that the param's strides changed since DDP was constructed. This is not an error, but may impair performance.
4
+ grad.sizes() = [256, 1024, 1, 1], strides() = [1024, 1, 1024, 1024]
5
+ bucket_view.sizes() = [256, 1024, 1, 1], strides() = [1024, 1, 1, 1] (Triggered internally at ../torch/csrc/distributed/c10d/reducer.cpp:334.) Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
6
+ /usr/local/lib/python3.9/dist-packages/torch/autograd/__init__.py:251: UserWarning: Grad strides do not match bucket view strides. This may indicate grad was not created according to the gradient layout contract, or that the param's strides changed since DDP was constructed. This is not an error, but may impair performance.
7
+ grad.sizes() = [256, 1024, 1, 1], strides() = [1024, 1, 1024, 1024]
8
+
9
+ """
10
+
11
+ """ ViTamin
12
+
13
+ Paper: Designing Scalable Vision Models in the Vision-Language Era
14
+
15
+ @misc{chen2023designing,
16
+ title={Designing Scalable Vision Models in the Vision-Language Era},
17
+ author={Jieneng Chen and Qihang Yu and Xiaohui Shen and Alan Yuille and Liang-Chieh Chen},
18
+ year={2023},
19
+ archivePrefix={arXiv},
20
+ primaryClass={cs.CV}
21
+ }
22
+
23
+ Based on Apache 2.0 licensed code at https://github.com/ViTamin/ViTamin
24
+
25
+ Modifications and timm support by Jieneng Chen 2023
26
+
27
+ Reference:
28
+ https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
29
+ https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer_hybrid.py
30
+ """
31
+
32
+ import math
33
+ from dataclasses import dataclass
34
+ from functools import partial
35
+ import torch.nn.functional as F
36
+ from typing import Optional, Tuple, Union
37
+
38
+ import timm
39
+ import torch
40
+ import torch.nn as nn
41
+ from timm.layers import to_2tuple
42
+ from timm.layers.norm_act import _create_act
43
+ from timm.models._builder import build_model_with_cfg
44
+ from timm.models._manipulate import checkpoint_seq, named_apply
45
+ from timm.models._registry import register_model
46
+ from timm.models.layers import DropPath
47
+ from timm.models.layers import create_conv2d, get_norm_act_layer, get_norm_layer, make_divisible
48
+ from timm.models.vision_transformer import VisionTransformer, checkpoint_filter_fn
49
+ from timm.models.vision_transformer_hybrid import HybridEmbed
50
+ from torch.utils.checkpoint import checkpoint
51
+
52
+ DropPath.__repr__ = lambda self: f'{type(self).__name__}(...)'
53
+
54
+
55
+ @dataclass
56
+ class VitConvCfg:
57
+ expand_ratio: float = 4.0
58
+ expand_output: bool = True # calculate expansion channels from output (vs input chs)
59
+ kernel_size: int = 3
60
+ group_size: int = 1 # 1 == depthwise
61
+ pre_norm_act: bool = False # activation after pre-norm
62
+ stride_mode: str = 'dw' # stride done via one of 'pool', '1x1', 'dw'
63
+ pool_type: str = 'avg2'
64
+ downsample_pool_type: str = 'avg2'
65
+ act_layer: str = 'gelu' # stem & stage 1234
66
+ act_layer1: str = 'gelu' # stage 1234
67
+ act_layer2: str = 'gelu' # stage 1234
68
+ norm_layer: str = ''
69
+ norm_layer_cl: str = ''
70
+ norm_eps: Optional[float] = None
71
+ down_shortcut: Optional[bool] = True
72
+ mlp: str = 'mlp'
73
+
74
+ def __post_init__(self):
75
+ # mbconv vs convnext blocks have different defaults, set in post_init to avoid explicit config args
76
+ use_mbconv = True
77
+ if not self.norm_layer:
78
+ self.norm_layer = 'batchnorm2d' if use_mbconv else 'layernorm2d'
79
+ if not self.norm_layer_cl and not use_mbconv:
80
+ self.norm_layer_cl = 'layernorm'
81
+ if self.norm_eps is None:
82
+ self.norm_eps = 1e-5 if use_mbconv else 1e-6
83
+ self.downsample_pool_type = self.downsample_pool_type or self.pool_type
84
+
85
+
86
+ @dataclass
87
+ class VitCfg:
88
+ # embed_dim: Tuple[int, ...] = (96, 192, 384, 768)
89
+ embed_dim: Tuple[Union[int, Tuple[int, ...]], ...] = (96, 192, 384, 768)
90
+ depths: Tuple[Union[int, Tuple[int, ...]], ...] = (2, 3, 5, 2)
91
+ stem_width: int = 64
92
+ conv_cfg: VitConvCfg = None
93
+ weight_init: str = 'vit_eff'
94
+ head_type: str = ""
95
+ stem_type: str = "stem"
96
+ ln2d_permute: bool = True
97
+ # memory_format: str=""
98
+
99
+
100
+ def _init_conv(module, name, scheme=''):
101
+ if isinstance(module, nn.Conv2d):
102
+ fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
103
+ fan_out //= module.groups
104
+ nn.init.normal_(module.weight, 0, math.sqrt(2.0 / fan_out))
105
+ if module.bias is not None:
106
+ nn.init.zeros_(module.bias)
107
+
108
+
109
+ class Stem(nn.Module):
110
+ def __init__(
111
+ self,
112
+ in_chs: int,
113
+ out_chs: int,
114
+ act_layer: str = 'gelu',
115
+ norm_layer: str = 'layernorm2d',
116
+ norm_eps: float = 1e-6,
117
+ bias: bool = True,
118
+ ):
119
+ super().__init__()
120
+ self.grad_checkpointing=False
121
+ norm_act_layer = partial(get_norm_act_layer(norm_layer, act_layer), eps=norm_eps)
122
+ self.out_chs = out_chs
123
+ self.conv1 = create_conv2d(in_chs, out_chs, 3, stride=2, bias=bias)
124
+ self.norm1 = norm_act_layer(out_chs)
125
+ self.conv2 = create_conv2d(out_chs, out_chs, 3, stride=1, bias=bias)
126
+ named_apply(_init_conv, self)
127
+
128
+ def forward(self, x):
129
+ if self.grad_checkpointing:
130
+ x = checkpoint(self.conv1, x)
131
+ x = self.norm1(x)
132
+ x = checkpoint(self.conv2, x)
133
+ else:
134
+ x = self.conv1(x)
135
+ x = self.norm1(x)
136
+ x = self.conv2(x)
137
+
138
+ return x
139
+
140
+
141
+ class Downsample2d(nn.Module):
142
+ def __init__(
143
+ self,
144
+ dim: int,
145
+ dim_out: int,
146
+ pool_type: str = 'avg2',
147
+ bias: bool = True,
148
+ ):
149
+ super().__init__()
150
+ self.pool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1, count_include_pad=False)
151
+
152
+ if dim != dim_out:
153
+ self.expand = nn.Conv2d(dim, dim_out, 1, bias=bias) # 1x1 conv
154
+ else:
155
+ self.expand = nn.Identity()
156
+
157
+ def forward(self, x):
158
+ x = self.pool(x) # spatial downsample
159
+ x = self.expand(x) # expand chs
160
+ return x
161
+
162
+
163
+ class StridedConv(nn.Module):
164
+ """ downsample 2d as well
165
+ """
166
+ def __init__(
167
+ self,
168
+ kernel_size=3,
169
+ stride=2,
170
+ padding=1,
171
+ in_chans=3,
172
+ embed_dim=768,
173
+ ln2d_permute=True
174
+ ):
175
+ super().__init__()
176
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding)
177
+ self.permute = ln2d_permute # TODO: disable
178
+ norm_layer = partial(get_norm_layer('layernorm2d'), eps=1e-6)
179
+ self.norm = norm_layer(in_chans) # affine over C
180
+
181
+ def forward(self, x):
182
+ x = self.norm(x)
183
+ x = self.proj(x)
184
+ return x
185
+
186
+
187
+ class MbConvLNBlock(nn.Module):
188
+ """ Pre-Norm Conv Block - 1x1 - kxk - 1x1, w/ inverted bottleneck (expand)
189
+ """
190
+ def __init__(
191
+ self,
192
+ in_chs: int,
193
+ out_chs: int,
194
+ stride: int = 1,
195
+ drop_path: float = 0.,
196
+ kernel_size: int = 3,
197
+ norm_layer: str = 'layernorm2d',
198
+ norm_eps: float = 1e-6,
199
+ act_layer: str = 'gelu',
200
+ expand_ratio: float = 4.0,
201
+ ):
202
+ super(MbConvLNBlock, self).__init__()
203
+ self.stride, self.in_chs, self.out_chs = stride, in_chs, out_chs
204
+ mid_chs = make_divisible(out_chs * expand_ratio)
205
+ prenorm_act_layer = partial(get_norm_act_layer(norm_layer, act_layer), eps=norm_eps)
206
+
207
+ if stride == 2:
208
+ self.shortcut = Downsample2d(in_chs, out_chs, pool_type='avg', bias=True)
209
+ elif in_chs != out_chs:
210
+ self.shortcut = nn.Conv2d(in_chs, out_chs, 1, bias=True)
211
+ else:
212
+ self.shortcut = nn.Identity()
213
+
214
+ self.pre_norm = prenorm_act_layer(in_chs, apply_act=False)
215
+ self.down = nn.Identity()
216
+ self.conv1_1x1 = create_conv2d(in_chs, mid_chs, 1, stride=1, bias=True)
217
+ self.act1 = _create_act(act_layer, inplace=True)
218
+ self.act2 = _create_act(act_layer, inplace=True)
219
+
220
+ self.conv2_kxk = create_conv2d(mid_chs, mid_chs, kernel_size, stride=stride, dilation=1, groups=mid_chs, bias=True)
221
+ self.conv3_1x1 = create_conv2d(mid_chs, out_chs, 1, bias=True)
222
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
223
+
224
+ def init_weights(self, scheme=''):
225
+ named_apply(partial(_init_conv, scheme=scheme), self)
226
+
227
+ def forward(self, x):
228
+ shortcut = self.shortcut(x)
229
+
230
+ x = self.pre_norm(x)
231
+ x = self.down(x) # nn.Identity()
232
+
233
+ # 1x1 expansion conv & act
234
+ x = self.conv1_1x1(x)
235
+ x = self.act1(x)
236
+
237
+ # (strided) depthwise 3x3 conv & act
238
+ x = self.conv2_kxk(x)
239
+ x = self.act2(x)
240
+
241
+ # 1x1 linear projection to output width
242
+ x = self.conv3_1x1(x)
243
+ x = self.drop_path(x) + shortcut
244
+
245
+ return x
246
+
247
+
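For orientation, the block above is the standard inverted-bottleneck pattern (1x1 expand, depthwise kxk, 1x1 project) with a pre-norm and a residual shortcut. Below is a stripped-down restatement of just the main branch, with assumed channel counts (128 -> 256, expand_ratio 4, stride 2); the real block additionally applies the pre-norm and adds the Downsample2d / 1x1-conv shortcut:

```python
import torch
import torch.nn as nn

in_chs, out_chs, expand = 128, 256, 4
mid = out_chs * expand                                            # inverted width: 1024
main = nn.Sequential(
    nn.Conv2d(in_chs, mid, 1),                                    # 1x1 expand
    nn.GELU(),
    nn.Conv2d(mid, mid, 3, stride=2, padding=1, groups=mid),      # strided depthwise kxk
    nn.GELU(),
    nn.Conv2d(mid, out_chs, 1),                                   # 1x1 project
)
print(main(torch.randn(1, in_chs, 64, 64)).shape)                 # torch.Size([1, 256, 32, 32])
```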
248
+ class MbConvStages(nn.Module):
249
+ """ MobileConv for stage 1 and stage 2 of ViTamin
250
+ """
251
+ def __init__(
252
+ self,
253
+ cfg: VitCfg,
254
+ img_size: Union[int, Tuple[int, int]] = 224, # placeholder (not used)
255
+ in_chans: int = 3,
256
+ ):
257
+ super().__init__()
258
+ self.grad_checkpointing = False
259
+ self.stem = Stem(
260
+ in_chs=in_chans,
261
+ out_chs=cfg.stem_width,
262
+ )
263
+ stages = []
264
+ self.num_stages = len(cfg.embed_dim)
265
+ for s, dim in enumerate(cfg.embed_dim[:2]): # stage
266
+ blocks = []
267
+ stage_in_chs = cfg.embed_dim[s-1] if s>0 else cfg.stem_width
268
+ for d in range(cfg.depths[s]):
269
+ blocks += [MbConvLNBlock(
270
+ in_chs = stage_in_chs if d==0 else dim,
271
+ out_chs = dim,
272
+ stride = 2 if d == 0 else 1,
273
+ # cfg = cfg.conv_cfg,
274
+ )]
275
+ blocks = nn.Sequential(*blocks)
276
+ stages += [blocks]
277
+
278
+ self.stages = nn.ModuleList(stages)
279
+ self.pool = StridedConv(
280
+ stride=2,
281
+ in_chans=cfg.embed_dim[1],
282
+ embed_dim=cfg.embed_dim[2]
283
+ )
284
+
285
+ def forward(self, x):
286
+ x = self.stem(x)
287
+ if self.grad_checkpointing and not torch.jit.is_scripting():
288
+ for stage in self.stages:
289
+ x = checkpoint_seq(stage, x)
290
+ x = checkpoint(self.pool, x)
291
+ else:
292
+ for stage in self.stages:
293
+ x = stage(x)
294
+ x = self.pool(x)
295
+
296
+ return x
297
+
298
+
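Putting the pieces together: the stem, the first block of each MbConv stage, and the final StridedConv pool are each stride 2, so this convolutional front end reduces resolution by 16x before the transformer stage. For the vitamin_base configuration registered below (embed_dim=(128, 256, 768)), a 256x256 image therefore becomes a 16x16 grid of 768-d tokens. A quick bookkeeping check:

```python
# Each step corresponds to one stride-2 layer in MbConvStages (vitamin_base cfg assumed).
img = 256
after_stem   = img // 2            # Stem.conv1, stride 2          -> 128
after_stage1 = after_stem // 2     # first MbConvLNBlock, stage 1  -> 64
after_stage2 = after_stage1 // 2   # first MbConvLNBlock, stage 2  -> 32
after_pool   = after_stage2 // 2   # StridedConv pool, stride 2    -> 16
print(after_pool, after_pool ** 2) # 16 256 -> 256 tokens of width 768 for the ViT stage
```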
299
+ class GeGluMlp(nn.Module):
300
+ def __init__(
301
+ self,
302
+ in_features,
303
+ hidden_features,
304
+ act_layer = None,
305
+ drop = 0.0,
306
+ ):
307
+ super().__init__()
308
+ norm_layer = partial(get_norm_layer('layernorm'), eps=1e-6)
309
+ self.norm = norm_layer(in_features)
310
+ self.act = nn.GELU(approximate='tanh')
311
+ self.w0 = nn.Linear(in_features, hidden_features)
312
+ self.w1 = nn.Linear(in_features, hidden_features)
313
+ self.w2 = nn.Linear(hidden_features, in_features)
314
+
315
+ def forward(self, x):
316
+ x = self.norm(x)
317
+ x = self.act(self.w0(x)) * self.w1(x)
318
+ x = self.w2(x)
319
+ return x
320
+
321
+
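GeGluMlp is a gated-GELU MLP: after a LayerNorm, one linear branch is passed through GELU and used to gate a second linear branch, and the product is projected back to the input width, i.e. y = W2(GELU(W0 x) * W1 x). A functional restatement with placeholder dimensions:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

d_in, d_hidden = 768, 1536
norm = nn.LayerNorm(d_in)
w0, w1, w2 = nn.Linear(d_in, d_hidden), nn.Linear(d_in, d_hidden), nn.Linear(d_hidden, d_in)

x = torch.randn(2, 16, d_in)
h = norm(x)
y = w2(F.gelu(w0(h), approximate='tanh') * w1(h))   # gate * value, then project back
print(y.shape)                                      # torch.Size([2, 16, 768])
```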
322
+ class HybridEmbed(nn.Module):
323
+ """ CNN Feature Map Embedding
324
+ Extract feature map from CNN, flatten, project to embedding dim.
325
+ """
326
+ def __init__(
327
+ self,
328
+ backbone,
329
+ img_size=256,
330
+ patch_size=1,
331
+ feature_size=None,
332
+ in_chans=3,
333
+ embed_dim=1024,
334
+ bias=True,
335
+ dynamic_img_pad=False,
336
+ ):
337
+ super().__init__()
338
+ assert isinstance(backbone, nn.Module)
339
+ img_size = to_2tuple(img_size)
340
+ patch_size = to_2tuple(patch_size)
341
+ self.img_size = img_size
342
+ self.patch_size = patch_size
343
+ self.backbone = backbone
344
+ with torch.no_grad():
345
+ training = backbone.training
346
+ if training:
347
+ backbone.eval()
348
+ o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))
349
+ if isinstance(o, (list, tuple)):
350
+ o = o[-1] # last feature if backbone outputs list/tuple of features
351
+ feature_size = o.shape[-2:]
352
+ feature_dim = o.shape[1]
353
+ backbone.train(training)
354
+
355
+ assert feature_size[0] % patch_size[0] == 0 and feature_size[1] % patch_size[1] == 0
356
+ self.grid_size = (feature_size[0] // patch_size[0], feature_size[1] // patch_size[1])
357
+ self.num_patches = self.grid_size[0] * self.grid_size[1]
358
+ self.proj = nn.Identity()
359
+
360
+ def forward(self, x):
361
+ x = self.backbone(x)
362
+ if isinstance(x, (list, tuple)):
363
+ x = x[-1] # last feature if backbone outputs list/tuple of features
364
+ x = self.proj(x)
365
+ x = x.flatten(2).transpose(1, 2)
366
+ return x
367
+
368
+
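With patch_size=1, HybridEmbed does no further patching: every spatial position of the backbone's feature map becomes one token. The flatten/transpose at the end is the usual (B, C, H, W) -> (B, H*W, C) conversion, e.g.:

```python
import torch

feat = torch.randn(2, 768, 16, 16)            # backbone output (assumed vitamin_base at 256 px)
tokens = feat.flatten(2).transpose(1, 2)      # (B, C, H, W) -> (B, H*W, C)
print(tokens.shape)                           # torch.Size([2, 256, 768])
```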
369
+ class Upsample2d(nn.Module):
370
+ def __init__(self, dim, dim_out):
371
+ super().__init__()
372
+ self.conv = torch.nn.Conv2d(dim, dim_out, kernel_size=3, stride=1, padding=1)
373
+
374
+ def forward(self, x):
375
+ x = F.interpolate(x, scale_factor=2, mode='nearest')
376
+ x = self.conv(x)
377
+ return x
378
+
379
+
380
+ class InvMbConvLNBlock(nn.Module):
381
+ """ Pre-Norm Conv Block - 1x1 - kxk - 1x1, w/ inverted bottleneck (expand)
382
+ """
383
+ def __init__(
384
+ self,
385
+ in_chs: int,
386
+ out_chs: int,
387
+ stride: int = 1,
388
+ drop_path: float = 0.,
389
+ kernel_size: int = 3,
390
+ norm_layer: str = 'layernorm2d',
391
+ norm_eps: float = 1e-6,
392
+ act_layer: str = 'gelu',
393
+ expand_ratio: float = 4.0,
394
+ ):
395
+ super().__init__()
396
+ self.stride, self.in_chs, self.out_chs = stride, in_chs, out_chs
397
+ mid_chs = make_divisible(out_chs * expand_ratio)
398
+ prenorm_act_layer = partial(get_norm_act_layer(norm_layer, act_layer), eps=norm_eps)
399
+
400
+ if stride == 2:
401
+ self.shortcut = Upsample2d(in_chs, out_chs)
402
+ elif in_chs != out_chs:
403
+ self.shortcut = nn.Conv2d(in_chs, out_chs, 1, bias=True)
404
+ else:
405
+ self.shortcut = nn.Identity()
406
+
407
+ self.pre_norm = prenorm_act_layer(in_chs, apply_act=False)
408
+
409
+ self.conv1_1x1 = create_conv2d(in_chs, mid_chs, 1, stride=1, bias=True)
410
+ self.act1 = _create_act(act_layer, inplace=True)
411
+ self.act2 = _create_act(act_layer, inplace=True)
412
+
413
+ self.up = Upsample2d(mid_chs, mid_chs) if stride == 2 else nn.Identity()
414
+ self.conv2_kxk = create_conv2d(mid_chs, mid_chs, kernel_size, stride=1, dilation=1, groups=mid_chs, bias=True)
415
+ self.conv3_1x1 = create_conv2d(mid_chs, out_chs, 1, bias=True)
416
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
417
+
418
+ def init_weights(self, scheme=''):
419
+ named_apply(partial(_init_conv, scheme=scheme), self)
420
+
421
+ def forward(self, x):
422
+ shortcut = self.shortcut(x)
423
+ x = self.pre_norm(x)
424
+
425
+ # 1x1 expansion conv & act
426
+ x = self.conv1_1x1(x)
427
+ x = self.act1(x)
428
+ x = self.up(x)
429
+
430
+ # (strided) depthwise 3x3 conv & act
431
+ x = self.conv2_kxk(x)
432
+ x = self.act2(x)
433
+
434
+ # 1x1 linear projection to output width
435
+ x = self.conv3_1x1(x)
436
+ x = self.drop_path(x) + shortcut
437
+
438
+ return x
439
+
440
+
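This is the decoder-side mirror of MbConvLNBlock: a "stride" of 2 now means upsample by 2, implemented as nearest-neighbour interpolation followed by a 3x3 conv (Upsample2d) on both the main branch and the shortcut, rather than a transposed convolution. The resizing step in isolation, with assumed channel counts taken from the vitamin_large decoder (320 -> 160):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

conv = nn.Conv2d(320, 160, 3, padding=1)              # Upsample2d = nearest x2, then 3x3 conv
x = torch.randn(1, 320, 64, 64)
y = conv(F.interpolate(x, scale_factor=2, mode='nearest'))
print(y.shape)                                        # torch.Size([1, 160, 128, 128])
```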
441
+ class InvStem(nn.Module):
442
+ def __init__(
443
+ self,
444
+ in_chs: int,
445
+ out_chs: int,
446
+ act_layer: str = 'gelu',
447
+ norm_layer: str = 'layernorm2d',
448
+ norm_eps: float = 1e-6,
449
+ bias: bool = True,
450
+ ):
451
+ super().__init__()
452
+ self.grad_checkpointing=False
453
+ norm_act_layer = partial(get_norm_act_layer(norm_layer, act_layer), eps=norm_eps)
454
+ self.out_chs = out_chs
455
+ self.conv1 = Upsample2d(in_chs, in_chs)
456
+ self.norm1 = norm_act_layer(in_chs)
457
+ self.conv2 = create_conv2d(in_chs, out_chs, 3, stride=1, bias=bias)
458
+ named_apply(_init_conv, self)
459
+
460
+ def forward(self, x):
461
+ if self.grad_checkpointing:
462
+ x = checkpoint(self.conv1, x)
463
+ x = self.norm1(x)
464
+ x = checkpoint(self.conv2, x)
465
+ else:
466
+ x = self.conv1(x)
467
+ x = self.norm1(x)
468
+ x = self.conv2(x)
469
+
470
+ return x
471
+
472
+
473
+ class ViTaminDecoder(nn.Module):
474
+ def __init__(
475
+ self,
476
+ model,
477
+ num_query=0,
478
+ img_size=256,
479
+ drop_path=0.,
480
+ depths=(4, 2),
481
+ grad_ckpt=False,
482
+ ):
483
+ super().__init__()
484
+
485
+ self.num_query = num_query
486
+ vit = timm.create_model(
487
+ model,
488
+ fc_norm=False,
489
+ patch_size=1,
490
+ drop_rate=0.0,
491
+ num_classes=0,
492
+ global_pool='',
493
+ pos_embed='none',
494
+ mlp_layer=GeGluMlp,
495
+ class_token=False,
496
+ reg_tokens=num_query,
497
+ img_size=img_size,
498
+ drop_path_rate=drop_path,
499
+ )
500
+ self.blocks = vit.blocks
501
+ self.norm_pre = vit.norm_pre
502
+ self.norm = vit.norm
503
+
504
+ embed_dims = {
505
+ 'vitamin_base': (768, 256, 128),
506
+ 'vitamin_large': (1024, 320, 160)
507
+ }[model]
508
+ self.up_conv1 = Upsample2d(embed_dims[0], embed_dims[1])
509
+ self.up_conv2 = nn.Sequential(*[
510
+ InvMbConvLNBlock(
511
+ in_chs=embed_dims[1],
512
+ out_chs=embed_dims[1],
513
+ stride=2 if d == 0 else 1)
514
+ for d in range(depths[0])]
515
+ )
516
+ self.up_conv3 = nn.Sequential(*[
517
+ InvMbConvLNBlock(
518
+ in_chs=embed_dims[1] if d == 0 else embed_dims[2],
519
+ out_chs=embed_dims[2],
520
+ stride=2 if d == 0 else 1)
521
+ for d in range(depths[1])]
522
+ )
523
+ self.up_conv4 = InvStem(in_chs=embed_dims[2], out_chs=3)
524
+
525
+ self.grad_ckpt = grad_ckpt
526
+
527
+ def get_last_param(self):
528
+ return self.up_conv4.conv2.weight
529
+
530
+ def forward(self, x):
531
+ B, L, C = x.shape
532
+ H = W = int((L-self.num_query) ** 0.5)
533
+ x = self.norm_pre(x)
534
+ if self.grad_ckpt:
535
+ x = checkpoint_seq(self.blocks, x)
536
+ x = x[:, self.num_query:, :]
537
+ x = self.norm(x)
538
+ x = x.view(B, H, W, C).permute(0, 3, 1, 2)
539
+ x = checkpoint(self.up_conv1, x)
540
+ x = checkpoint_seq(self.up_conv2, x)
541
+ x = checkpoint_seq(self.up_conv3, x)
542
+ else:
543
+ x = self.blocks(x)
544
+ x = x[:, self.num_query:, :]
545
+ x = self.norm(x)
546
+ x = x.view(B, H, W, C).permute(0, 3, 1, 2)
547
+ x = self.up_conv1(x)
548
+ x = self.up_conv2(x)
549
+ x = self.up_conv3(x)
550
+ x = self.up_conv4(x)
551
+ return x
552
+
553
+
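End to end, the decoder maps a token sequence back to pixels: each of up_conv1..up_conv4 doubles the resolution, so a 16x16 token grid becomes a 256x256 image. A usage sketch, assuming this module is importable as unitok.vitamin and that the installed timm version accepts the keyword arguments used above (shapes correspond to vitamin_base at 256 px):

```python
import torch
from unitok.vitamin import ViTaminDecoder

dec = ViTaminDecoder('vitamin_base', num_query=0, img_size=256)
tokens = torch.randn(1, 256, 768)        # 16x16 grid of 768-d tokens
with torch.no_grad():
    img = dec(tokens)
print(img.shape)                         # expected: torch.Size([1, 3, 256, 256])
```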
554
+ def _create_vision_transformer(variant, pretrained=False, grad_ckpt=False, **kwargs) -> VisionTransformer:
555
+ if kwargs.get('features_only', None):
556
+ raise RuntimeError('features_only not implemented for Vision Transformer models.')
557
+
558
+ if 'flexi' in variant:
559
+ # FIXME Google FlexiViT pretrained models have a strong preference for bilinear patch / embed
560
+ # interpolation, other pretrained models resize better w/ anti-aliased bicubic interpolation.
561
+ _filter_fn = partial(checkpoint_filter_fn, interpolation='bilinear', antialias=False)
562
+ else:
563
+ _filter_fn = checkpoint_filter_fn
564
+
565
+ return build_model_with_cfg(
566
+ VisionTransformer,
567
+ variant,
568
+ pretrained,
569
+ pretrained_filter_fn=_filter_fn,
570
+ **kwargs,
571
+ )
572
+
573
+
574
+ def _create_vision_transformer_hybrid(variant, backbone, pretrained=False, **kwargs):
575
+ embed_layer = partial(HybridEmbed, backbone=backbone)
576
+ kwargs.setdefault('patch_size', 1) # default patch size for hybrid models if not set
577
+ return _create_vision_transformer(variant, pretrained=pretrained, embed_layer=embed_layer, **kwargs)
578
+
579
+
580
+ @register_model
581
+ def vitamin_small(pretrained=False, **kwargs) -> VisionTransformer:
582
+ stage_1_2 = MbConvStages(cfg=VitCfg(
583
+ embed_dim=(64, 128, 384),
584
+ depths=(2, 4, 1),
585
+ stem_width=64,
586
+ conv_cfg = VitConvCfg(
587
+ norm_layer='layernorm2d',
588
+ norm_eps=1e-6,
589
+ ),
590
+ head_type='1d',
591
+ ),
592
+ )
593
+ stage3_args = dict(embed_dim=384, depth=14, num_heads=6, mlp_layer=GeGluMlp, mlp_ratio=2., class_token=False, global_pool='avg')
594
+ model = _create_vision_transformer_hybrid('vitamin_small', backbone=stage_1_2, pretrained=pretrained, **dict(stage3_args, **kwargs))
595
+ return model
596
+
597
+
598
+ @register_model
599
+ def vitamin_base(pretrained=False, **kwargs) -> VisionTransformer:
600
+ stage_1_2 = MbConvStages(cfg=VitCfg(
601
+ embed_dim=(128, 256, 768),
602
+ depths=(2, 4, 1),
603
+ stem_width=128,
604
+ conv_cfg = VitConvCfg(
605
+ norm_layer='layernorm2d',
606
+ norm_eps=1e-6,
607
+ ),
608
+ head_type='1d',
609
+ ),
610
+ )
611
+ stage3_args = dict(embed_dim=768, depth=14, num_heads=12, mlp_layer=GeGluMlp, mlp_ratio=2., class_token=False, global_pool='avg')
612
+ model = _create_vision_transformer_hybrid('vitamin_base', backbone=stage_1_2, pretrained=pretrained, **dict(stage3_args, **kwargs))
613
+ return model
614
+
615
+
616
+ @register_model
617
+ def vitamin_base_256(pretrained=False, **kwargs) -> VisionTransformer:
618
+ stage_1_2 = MbConvStages(cfg=VitCfg(
619
+ embed_dim=(128, 256, 768),
620
+ depths=(2, 4, 1),
621
+ stem_width=128,
622
+ conv_cfg = VitConvCfg(
623
+ norm_layer='layernorm2d',
624
+ norm_eps=1e-6,
625
+ ),
626
+ head_type='1d',
627
+ ),
628
+ )
629
+ stage3_args = dict(img_size=256, embed_dim=768, depth=14, num_heads=12, mlp_layer=GeGluMlp, mlp_ratio=2., class_token=False, global_pool='avg')
630
+ model = _create_vision_transformer_hybrid('vitamin_base_256', backbone=stage_1_2, pretrained=pretrained, **dict(stage3_args, **kwargs))
631
+ return model
632
+
633
+
634
+ @register_model
635
+ def vitamin_large(pretrained=False, **kwargs) -> VisionTransformer:
636
+ stage_1_2 = MbConvStages(cfg=VitCfg(
637
+ embed_dim=(160, 320, 1024),
638
+ depths=(2, 4, 1),
639
+ stem_width=160,
640
+ conv_cfg = VitConvCfg(
641
+ norm_layer='layernorm2d',
642
+ norm_eps=1e-6,
643
+ ),
644
+ head_type='1d',
645
+ ),
646
+ )
647
+ stage3_args = dict(embed_dim=1024, depth=31, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2., class_token=False, global_pool='avg')
648
+ model = _create_vision_transformer_hybrid(
649
+ 'vitamin_large', backbone=stage_1_2, pretrained=pretrained, **dict(stage3_args, **kwargs))
650
+ return model
651
+
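Once this module is imported, the @register_model functions above make the variants available through the usual timm factory. A sketch of how an encoder might be created; the keyword arguments mirror the ones used elsewhere in this repo to build the UniTok encoder, and assume a timm version that supports pos_embed='none' and mlp_layer overrides:

```python
import timm
import torch
from unitok.vitamin import GeGluMlp      # importing the module also registers vitamin_*

enc = timm.create_model(
    'vitamin_large',
    patch_size=1,
    num_classes=0,
    global_pool='',
    pos_embed='none',
    class_token=False,
    mlp_layer=GeGluMlp,
    img_size=256,
)
out = enc(torch.randn(1, 3, 256, 256))
print(out.shape)                         # expected: torch.Size([1, 256, 1024])
```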
652
+ # @register_model
653
+ # def vitamin_large_256(pretrained=False, **kwargs) -> VisionTransformer:
654
+ # backbone = MbConvStages(cfg=VitCfg(
655
+ # embed_dim=(160, 320, 1024),
656
+ # depths=(2, 4, 1),
657
+ # stem_width=160,
658
+ # conv_cfg = VitConvCfg(
659
+ # norm_layer='layernorm2d',
660
+ # norm_eps=1e-6,
661
+ # ),
662
+ # head_type='1d',
663
+ # ),
664
+ # )
665
+ # model_args = dict(img_size=256, embed_dim=1024, depth=31, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2., class_token=False, global_pool='avg')
666
+ # model = _create_vision_transformer_hybrid(
667
+ # 'vitamin_large_256', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
668
+ # return model
669
+
670
+ # @register_model
671
+ # def vitamin_large_336(pretrained=False, **kwargs) -> VisionTransformer:
672
+ # backbone = MbConvStages(cfg=VitCfg(
673
+ # embed_dim=(160, 320, 1024),
674
+ # depths=(2, 4, 1),
675
+ # stem_width=160,
676
+ # conv_cfg = VitConvCfg(
677
+ # norm_layer='layernorm2d',
678
+ # norm_eps=1e-6,
679
+ # ),
680
+ # head_type='1d',
681
+ # ),
682
+ # )
683
+ # model_args = dict(img_size=336, embed_dim=1024, depth=31, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2., class_token=False, global_pool='avg')
684
+ # model = _create_vision_transformer_hybrid(
685
+ # 'vitamin_large_336', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
686
+ # return model
687
+
688
+ # @register_model
689
+ # def vitamin_large_384(pretrained=False, **kwargs) -> VisionTransformer:
690
+ # backbone = MbConvStages(cfg=VitCfg(
691
+ # embed_dim=(160, 320, 1024),
692
+ # depths=(2, 4, 1),
693
+ # stem_width=160,
694
+ # conv_cfg = VitConvCfg(
695
+ # norm_layer='layernorm2d',
696
+ # norm_eps=1e-6,
697
+ # ),
698
+ # head_type='1d',
699
+ # ),
700
+ # )
701
+ # model_args = dict(img_size=384, embed_dim=1024, depth=31, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2., class_token=False, global_pool='avg')
702
+ # model = _create_vision_transformer_hybrid(
703
+ # 'vitamin_large_384', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
704
+ # return model
705
+
706
+ # @register_model
707
+ # def vitamin_xlarge_256(pretrained=False, **kwargs) -> VisionTransformer:
708
+ # backbone = MbConvStages(cfg=VitCfg(
709
+ # embed_dim=(192, 384, 1152),
710
+ # depths=(2, 4, 1),
711
+ # stem_width=192,
712
+ # conv_cfg = VitConvCfg(
713
+ # norm_layer='layernorm2d',
714
+ # norm_eps=1e-6,
715
+ # ),
716
+ # head_type='1d',
717
+ # ),
718
+ # )
719
+ # model_args = dict(img_size=256, embed_dim=1152, depth=32, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2., class_token=False, global_pool='avg')
720
+ # model = _create_vision_transformer_hybrid(
721
+ # 'vitamin_xlarge_256', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
722
+ # return model
723
+
724
+ # @register_model
725
+ # def vitamin_xlarge_336(pretrained=False, **kwargs) -> VisionTransformer:
726
+ # backbone = MbConvStages(cfg=VitCfg(
727
+ # embed_dim=(192, 384, 1152),
728
+ # depths=(2, 4, 1),
729
+ # stem_width=192,
730
+ # conv_cfg = VitConvCfg(
731
+ # norm_layer='layernorm2d',
732
+ # norm_eps=1e-6,
733
+ # ),
734
+ # head_type='1d',
735
+ # ),
736
+ # )
737
+ # model_args = dict(img_size=336, embed_dim=1152, depth=32, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2., class_token=False, global_pool='avg')
738
+ # model = _create_vision_transformer_hybrid(
739
+ # 'vitamin_xlarge_256', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
740
+ # return model
741
+
742
+ # @register_model
743
+ # def vitamin_xlarge_384(pretrained=False, **kwargs) -> VisionTransformer:
744
+ # backbone = MbConvStages(cfg=VitCfg(
745
+ # embed_dim=(192, 384, 1152),
746
+ # depths=(2, 4, 1),
747
+ # stem_width=192,
748
+ # conv_cfg = VitConvCfg(
749
+ # norm_layer='layernorm2d',
750
+ # norm_eps=1e-6,
751
+ # ),
752
+ # head_type='1d',
753
+ # ),
754
+ # )
755
+ # model_args = dict(img_size=384, embed_dim=1152, depth=32, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2., class_token=False, global_pool='avg')
756
+ # model = _create_vision_transformer_hybrid(
757
+ # 'vitamin_xlarge_384', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
758
+ # return model
759
+
760
+
761
+ def count_params(model: nn.Module):
762
+ return sum([m.numel() for m in model.parameters()])
763
+
764
+
765
+ def count_stage_params(model: nn.Module, prefix='none'):
766
+ collections = []
767
+ for name, m in model.named_parameters():
768
+ print(name)
769
+ if name.startswith(prefix):
770
+ collections.append(m.numel())
771
+ return sum(collections)
772
+
773
+
774
+ if __name__ == "__main__":
775
+ # ViTaminDecoder('vitamin_base', img_size=256, patch_size=16)
776
+ # model = timm.create_model(
777
+ # 'vitamin_base',
778
+ # fc_norm=True,
779
+ # drop_rate=0.0,
780
+ # num_classes=0,
781
+ # global_pool='',
782
+ # mlp_layer=GeGluMlp,
783
+ # class_token=False,
784
+ # reg_tokens=32,
785
+ # img_size=256,
786
+ # patch_size=1,
787
+ # drop_path_rate=0.1,
788
+ # )
789
+ # print(model.has_class_token)
790
+ # print(model.num_prefix_tokens)
791
+ # print(model.pos_embed.shape)
792
+ Stem(64, 64)
unitok/vqvae.py ADDED
@@ -0,0 +1,175 @@
1
+ import timm
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from contextlib import nullcontext
6
+ from torch.nn.functional import scaled_dot_product_attention
7
+
8
+ from unitok.quant import VectorQuantizerM
9
+ from unitok.vitamin import ViTaminDecoder, GeGluMlp
10
+
11
+
12
+ class PlainAttention(nn.Module):
13
+ def __init__(self, in_dim, out_dim, num_heads):
14
+ super().__init__()
15
+ if in_dim > out_dim:
16
+ # assert in_dim // num_heads == out_dim
17
+ self.head_dim = in_dim // num_heads
18
+ self.qkv = nn.Linear(in_dim, in_dim * 3, bias=False)
19
+ self.q_bias = nn.Parameter(torch.zeros(in_dim))
20
+ self.v_bias = nn.Parameter(torch.zeros(in_dim))
21
+ self.register_buffer('zero_k_bias', torch.zeros(in_dim))
22
+ else:
23
+ # assert out_dim // num_heads == in_dim
24
+ self.head_dim = out_dim // num_heads
25
+ self.qkv = nn.Linear(in_dim, out_dim * 3, bias=False)
26
+ self.q_bias = nn.Parameter(torch.zeros(out_dim))
27
+ self.v_bias = nn.Parameter(torch.zeros(out_dim))
28
+ self.register_buffer('zero_k_bias', torch.zeros(out_dim))
29
+
30
+ self.in_dim = in_dim
31
+ self.out_dim = out_dim
32
+ self.num_heads = num_heads
33
+ self.scale = self.head_dim ** -0.5
34
+ self.proj = nn.Linear(out_dim, out_dim)
35
+
36
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
37
+ B, N, C = x.shape
38
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=torch.cat((self.q_bias, self.zero_k_bias, self.v_bias)))
39
+ q, k, v = qkv.reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4).unbind(0)
40
+
41
+ x = scaled_dot_product_attention(q, k, v)
42
+
43
+ if self.in_dim > self.out_dim:
44
+ x = torch.mean(x, dim=1)
45
+ if self.in_dim // self.num_heads != self.out_dim:
46
+ x = nn.functional.adaptive_avg_pool1d(x, self.out_dim)
47
+ else:
48
+ x = x.transpose(1, 2).reshape(B, N, -1)
49
+ x = self.proj(x)
50
+ return x
51
+
52
+
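PlainAttention doubles as a width-changing layer. When in_dim > out_dim, the attention output is averaged over heads (and adaptively pooled if the head width still differs from out_dim), shrinking each token; when in_dim <= out_dim, the qkv projection itself widens the tokens. A shape sketch with assumed example widths (encoder width 1024, codebook width 64, 8 heads):

```python
import torch
from unitok.vqvae import PlainAttention   # the class defined above

attn = PlainAttention(in_dim=1024, out_dim=64, num_heads=8)   # head_dim = 128, then pooled to 64
x = torch.randn(2, 256, 1024)
print(attn(x).shape)                                          # torch.Size([2, 256, 64])
```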
53
+ class AttnProjection(nn.Module):
54
+ def __init__(self, in_dim, out_dim, num_heads, norm_layer=nn.LayerNorm, mlp_ratio=2):
55
+ super().__init__()
56
+ assert out_dim % in_dim == 0 or in_dim % out_dim == 0
57
+ self.in_dim = in_dim
58
+ self.out_dim = out_dim
59
+ self.norm1 = norm_layer(in_dim)
60
+ self.attn = PlainAttention(in_dim, out_dim, num_heads)
61
+ self.proj = nn.Linear(in_dim, out_dim)
62
+ self.norm3 = norm_layer(in_dim)
63
+
64
+ self.norm2 = norm_layer(out_dim)
65
+ hidden_dim = int(out_dim * mlp_ratio)
66
+ self.mlp = GeGluMlp(
67
+ in_features=out_dim,
68
+ hidden_features=hidden_dim
69
+ )
70
+
71
+ def forward(self, x):
72
+ x = self.proj(self.norm3(x)) + self.attn(self.norm1(x))
73
+ x = x + self.mlp(self.norm2(x))
74
+ return x
75
+
76
+
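AttnProjection wraps PlainAttention with a pre-normed linear skip connection and a GeGlu MLP; it is what the VQVAE below uses (when quant_proj='attn') to move tokens between the encoder width and the codebook width and back. A round-trip shape sketch with the same assumed example widths as above:

```python
import torch
from unitok.vqvae import AttnProjection

to_code   = AttnProjection(in_dim=1024, out_dim=64, num_heads=8)    # encoder width -> codebook width
from_code = AttnProjection(in_dim=64, out_dim=1024, num_heads=8)    # codebook width -> encoder width
x = torch.randn(2, 256, 1024)
z = to_code(x)
print(z.shape, from_code(z).shape)   # torch.Size([2, 256, 64]) torch.Size([2, 256, 1024])
```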
77
+ class VQVAE(nn.Module):
78
+ def __init__(self, args):
79
+ super().__init__()
80
+
81
+ # 1. build encoder
82
+ self.encoder = timm.create_model(
83
+ args.model,
84
+ patch_size=1,
85
+ fc_norm=True,
86
+ drop_rate=0.0,
87
+ num_classes=0,
88
+ global_pool='',
89
+ pos_embed='none',
90
+ class_token=False,
91
+ mlp_layer=GeGluMlp,
92
+ img_size=args.img_size,
93
+ drop_path_rate=args.drop_path,
94
+ )
95
+ self.encoder.set_grad_checkpointing(args.grad_ckpt)
96
+
97
+ # 2. build conv before quant
98
+ if args.quant_proj == 'linear':
99
+ self.quant_proj = nn.Linear(self.encoder.embed_dim, args.vocab_width)
100
+ elif args.quant_proj == 'attn':
101
+ self.quant_proj = AttnProjection(self.encoder.embed_dim, args.vocab_width, args.num_codebooks)
102
+ else:
103
+ raise NotImplementedError
104
+
105
+ # 3. build quant
106
+ self.quantize = VectorQuantizerM(
107
+ vocab_size=args.vocab_size,
108
+ vocab_width=args.vocab_width,
109
+ beta=args.vq_beta,
110
+ use_entropy_loss=args.le > 0,
111
+ entropy_temp=args.e_temp,
112
+ num_codebooks=args.num_codebooks,
113
+ )
114
+
115
+ # 4. build conv after quant
116
+ if args.quant_proj == 'linear':
117
+ self.post_quant_proj = nn.Linear(args.vocab_width, self.encoder.embed_dim)
118
+ elif args.quant_proj == 'attn':
119
+ self.post_quant_proj = AttnProjection(args.vocab_width, self.encoder.embed_dim, args.num_codebooks)
120
+ else:
121
+ raise NotImplementedError
122
+
123
+ # 5. build decoder
124
+ self.decoder = ViTaminDecoder(
125
+ args.model,
126
+ depths=(4, 2),
127
+ img_size=args.img_size,
128
+ drop_path=args.drop_path,
129
+ grad_ckpt=args.grad_ckpt
130
+ )
131
+
132
+ self.maybe_record_function = nullcontext
133
+
134
+ def forward(self, img):
135
+ features = self.encoder(img).float()
136
+ with torch.cuda.amp.autocast(enabled=False):
137
+ features = self.quant_proj(features)
138
+ quant_out = self.quantize(features)
139
+ features, vq_loss, entropy_loss, usages = quant_out
140
+ features = self.post_quant_proj(features)
141
+ rec_img = self.decoder(features).float()
142
+ return rec_img, vq_loss, entropy_loss, usages
143
+
144
+ def img_to_idx(self, img):
145
+ features = self.encoder(img).float()
146
+ features = self.quant_proj(features)
147
+ return self.quantize.f_to_idx(features)
148
+
149
+ def idx_to_img(self, indices):
150
+ features = self.quantize.idx_to_f(indices)
151
+ features = self.post_quant_proj(features)
152
+ img = self.decoder(features).clamp_(-1, 1)
153
+ return img
154
+
155
+ def img_to_reconstructed_img(self, img) -> torch.Tensor:
156
+ features = self.encoder(img).float()
157
+ with torch.cuda.amp.autocast(enabled=False):
158
+ features = self.quant_proj(features)
159
+ quant_out = self.quantize(features)
160
+ features, _, _, _ = quant_out
161
+ features = self.post_quant_proj(features)
162
+ rec_img = self.decoder(features).float().clamp_(-1, 1)
163
+ return rec_img
164
+
165
+
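For reference, a hedged construction and usage sketch. The field names below are exactly the attributes VQVAE.__init__ reads from args; the concrete values are placeholders chosen for illustration, not UniTok's released training configuration:

```python
import torch
from types import SimpleNamespace
from unitok.vqvae import VQVAE

args = SimpleNamespace(
    model='vitamin_large', img_size=256, drop_path=0.0, grad_ckpt=False,
    quant_proj='attn', vocab_width=64, num_codebooks=8,
    vocab_size=32768, vq_beta=0.25, le=0.0, e_temp=0.01,
)
vae = VQVAE(args)
img = torch.randn(1, 3, 256, 256)
with torch.no_grad():
    rec, vq_loss, entropy_loss, usages = vae(img)
    idx = vae.img_to_idx(img)          # tokenize to codebook indices
    rec2 = vae.idx_to_img(idx)         # and decode back to an image
print(rec.shape, rec2.shape)           # expected: (1, 3, 256, 256) for both
```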
166
+ if __name__ == '__main__':
167
+ for clz in (nn.Linear, nn.LayerNorm, nn.BatchNorm2d, nn.SyncBatchNorm, nn.Conv1d, nn.Conv2d, nn.ConvTranspose1d,
168
+ nn.ConvTranspose2d):
169
+ setattr(clz, 'reset_parameters', lambda self: None)
170
+
171
+ # NOTE: VQVAE takes a single `args` namespace (see __init__ above); the old
+ # VQVAE(channel_num=..., vocab_norm=...) call, the external `models.init_weights` helper,
+ # and the hard-coded Windows output path were leftovers from an earlier project.
+ from types import SimpleNamespace
+ args = SimpleNamespace(
+ model='vitamin_large', img_size=256, drop_path=0.0, grad_ckpt=False,
+ quant_proj='attn', vocab_width=64, num_codebooks=8,
+ vocab_size=32768, vq_beta=0.25, le=0.0, e_temp=0.01,
+ )
+ cnn = VQVAE(args)
+ torch.save(cnn.state_dict(), 'local_output/cnn.pth')