tuandunghcmut committed
Commit 42c85ac · verified
1 Parent(s): d2ab3a0

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. DeepSeek-VL2/.ipynb_checkpoints/README-checkpoint.md +397 -0
  2. DeepSeek-VL2/.ipynb_checkpoints/inference-checkpoint.py +185 -0
  3. DeepSeek-VL2/.ipynb_checkpoints/requirements-checkpoint.txt +19 -0
  4. DeepSeek-VL2/deepseek_vl2.egg-info/requires.txt +33 -0
  5. DeepSeek-VL2/deepseek_vl2.egg-info/top_level.txt +1 -0
  6. DeepSeek-VL2/deepseek_vl2/__init__.py +31 -0
  7. DeepSeek-VL2/deepseek_vl2/__pycache__/__init__.cpython-310.pyc +0 -0
  8. DeepSeek-VL2/deepseek_vl2/__pycache__/__init__.cpython-39.pyc +0 -0
  9. DeepSeek-VL2/deepseek_vl2/models/__pycache__/__init__.cpython-310.pyc +0 -0
  10. DeepSeek-VL2/deepseek_vl2/models/__pycache__/__init__.cpython-311.pyc +0 -0
  11. DeepSeek-VL2/deepseek_vl2/models/__pycache__/configuration_deepseek.cpython-39.pyc +0 -0
  12. DeepSeek-VL2/deepseek_vl2/models/__pycache__/conversation.cpython-311.pyc +0 -0
  13. DeepSeek-VL2/deepseek_vl2/models/__pycache__/conversation.cpython-312.pyc +0 -0
  14. DeepSeek-VL2/deepseek_vl2/models/__pycache__/conversation.cpython-39.pyc +0 -0
  15. DeepSeek-VL2/deepseek_vl2/models/__pycache__/modeling_deepseek_vl_v2.cpython-311.pyc +0 -0
  16. DeepSeek-VL2/deepseek_vl2/models/__pycache__/modeling_deepseek_vl_v2.cpython-312.pyc +0 -0
  17. DeepSeek-VL2/deepseek_vl2/models/__pycache__/modeling_deepseek_vl_v2.cpython-39.pyc +0 -0
  18. DeepSeek-VL2/deepseek_vl2/models/__pycache__/processing_deepseek_vl_v2.cpython-310.pyc +0 -0
  19. DeepSeek-VL2/deepseek_vl2/models/__pycache__/processing_deepseek_vl_v2.cpython-311.pyc +0 -0
  20. DeepSeek-VL2/deepseek_vl2/models/__pycache__/processing_deepseek_vl_v2.cpython-312.pyc +0 -0
  21. DeepSeek-VL2/deepseek_vl2/models/__pycache__/processing_deepseek_vl_v2.cpython-39.pyc +0 -0
  22. DeepSeek-VL2/deepseek_vl2/models/__pycache__/siglip_vit.cpython-310.pyc +0 -0
  23. DeepSeek-VL2/deepseek_vl2/models/__pycache__/siglip_vit.cpython-312.pyc +0 -0
  24. DeepSeek-VL2/deepseek_vl2/models/__pycache__/siglip_vit.cpython-39.pyc +0 -0
  25. DeepSeek-VL2/deepseek_vl2/serve/app_modules/__pycache__/gradio_utils.cpython-312.pyc +0 -0
  26. DeepSeek-VL2/deepseek_vl2/serve/app_modules/__pycache__/utils.cpython-312.pyc +0 -0
  27. DeepSeek-VL2/deepseek_vl2/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  28. DeepSeek-VL2/deepseek_vl2/utils/__pycache__/__init__.cpython-39.pyc +0 -0
  29. DeepSeek-VL2/deepseek_vl2/utils/__pycache__/io.cpython-312.pyc +0 -0
  30. DeepSeek-VL2/deepseek_vl2/utils/__pycache__/io.cpython-39.pyc +0 -0
  31. DeepSeek-VL2/images/logo.png +0 -0
  32. DeepSeek-VL2/images/logo.svg +22 -0
  33. DeepSeek-VL2/images/monday.jpg +0 -0
  34. DeepSeek-VL2/images/visual_grounding_2.jpg +0 -0
  35. DeepSeek-VL2/images/vl2_teaser.jpeg +0 -0
  36. VLM2Vec/archive/gather_score_byckpt_aws.py +132 -0
  37. VLM2Vec/archive/merge.py +26 -0
  38. VLM2Vec/archive/testset_stats.py +66 -0
  39. VLM2Vec/evaluation/eval_flickr.py +124 -0
  40. VLM2Vec/figures/example.jpg +0 -0
  41. VLM2Vec/grad_cache/__init__.py +4 -0
  42. VLM2Vec/grad_cache/cachex/__init__.py +3 -0
  43. VLM2Vec/grad_cache/context_managers.py +21 -0
  44. VLM2Vec/grad_cache/functional.py +91 -0
  45. VLM2Vec/grad_cache/grad_cache.py +279 -0
  46. VLM2Vec/grad_cache/loss.py +80 -0
  47. VLM2Vec/grad_cache/minigc_cmd.md +90 -0
  48. VLM2Vec/scripts/llava_next/demo.py +46 -0
  49. VLM2Vec/scripts/llava_next/run_eval_flickr_llava_next.sh +21 -0
  50. VLM2Vec/src/arguments.py +121 -0
DeepSeek-VL2/.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,397 @@
1
+ <!-- markdownlint-disable first-line-h1 -->
2
+ <!-- markdownlint-disable html -->
3
+ <!-- markdownlint-disable no-duplicate-header -->
4
+
5
+ <div align="center">
6
+ <img src="images/logo.svg" width="60%" alt="DeepSeek LLM" />
7
+ </div>
8
+ <hr>
9
+ <div align="center">
10
+
11
+ <a href="https://www.deepseek.com/" target="_blank">
12
+ <img alt="Homepage" src="images/badge.svg" />
13
+ </a>
14
+ <a href="" target="_blank">
15
+ <img alt="Chat" src="https://img.shields.io/badge/🤖%20Chat-DeepSeek%20VL-536af5?color=536af5&logoColor=white" />
16
+ </a>
17
+ <a href="https://huggingface.co/deepseek-ai" target="_blank">
18
+ <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
19
+ </a>
20
+
21
+ </div>
22
+
23
+
24
+ <div align="center">
25
+
26
+ <a href="https://discord.gg/Tc7c45Zzu5" target="_blank">
27
+ <img alt="Discord" src="https://img.shields.io/badge/Discord-DeepSeek%20AI-7289da?logo=discord&logoColor=white&color=7289da" />
28
+ </a>
29
+ <a href="images/qr.jpeg" target="_blank">
30
+ <img alt="Wechat" src="https://img.shields.io/badge/WeChat-DeepSeek%20AI-brightgreen?logo=wechat&logoColor=white" />
31
+ </a>
32
+ <a href="https://twitter.com/deepseek_ai" target="_blank">
33
+ <img alt="Twitter Follow" src="https://img.shields.io/badge/Twitter-deepseek_ai-white?logo=x&logoColor=white" />
34
+ </a>
35
+
36
+ </div>
37
+
38
+ <div align="center">
39
+
40
+ <a href="LICENSE-CODE">
41
+ <img alt="Code License" src="https://img.shields.io/badge/Code_License-MIT-f5de53?&color=f5de53">
42
+ </a>
43
+ <a href="LICENSE-MODEL">
44
+ <img alt="Model License" src="https://img.shields.io/badge/Model_License-Model_Agreement-f5de53?&color=f5de53">
45
+ </a>
46
+ </div>
47
+
48
+
49
+ <p align="center">
50
+ <a href="https://github.com/deepseek-ai/DeepSeek-VL2/tree/main?tab=readme-ov-file#3-model-download"><b>📥 Model Download</b></a> |
51
+ <a href="https://github.com/deepseek-ai/DeepSeek-VL2/tree/main?tab=readme-ov-file#4-quick-start"><b>⚡ Quick Start</b></a> |
52
+ <a href="https://github.com/deepseek-ai/DeepSeek-VL2/tree/main?tab=readme-ov-file#5-license"><b>📜 License</b></a> |
53
+ <a href="https://github.com/deepseek-ai/DeepSeek-VL2/tree/main?tab=readme-ov-file#6-citation"><b>📖 Citation</b></a> <br>
54
+ <a href="./DeepSeek_VL2_paper.pdf"><b>📄 Paper Link</b></a> |
55
+ <a href="https://arxiv.org/abs/2412.10302"><b>📄 Arxiv Paper Link</b></a> |
56
+ <a href=""><b>👁️ Demo</b></a>
57
+ </p>
58
+
59
+ ## 1. Introduction
60
+
61
+ Introducing DeepSeek-VL2, an advanced series of large Mixture-of-Experts (MoE) Vision-Language Models that significantly improves upon its predecessor, DeepSeek-VL. DeepSeek-VL2 demonstrates superior capabilities across various tasks, including but not limited to visual question answering, optical character recognition, document/table/chart understanding, and visual grounding. Our model series is composed of three variants: DeepSeek-VL2-Tiny, DeepSeek-VL2-Small and DeepSeek-VL2, with 1.0B, 2.8B and 4.5B activated parameters respectively.
62
+ DeepSeek-VL2 achieves competitive or state-of-the-art performance with similar or fewer activated parameters compared to existing open-source dense and MoE-based models.
63
+
64
+
65
+ [DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding]()
66
+
67
+ Zhiyu Wu*, Xiaokang Chen*, Zizheng Pan*, Xingchao Liu*, Wen Liu**, Damai Dai, Huazuo Gao, Yiyang Ma, Chengyue Wu, Bingxuan Wang, Zhenda Xie, Yu Wu, Kai Hu, Jiawei Wang, Yaofeng Sun, Yukun Li, Yishi Piao, Kang Guan, Aixin Liu, Xin Xie, Yuxiang You, Kai Dong, Xingkai Yu, Haowei Zhang, Liang Zhao, Yisong Wang, Chong Ruan*** (* Equal Contribution, ** Project Lead, *** Corresponding author)
68
+
69
+ ![](./images/vl2_teaser.jpeg)
70
+
71
+ ## 2. Release
72
+ ✅ <b>2024-12-25</b>: Gradio Demo Example, Incremental Prefilling and VLMEvalKit Support.
73
+
74
+ ✅ <b>2024-12-13</b>: DeepSeek-VL2 family released, including <code>DeepSeek-VL2-tiny</code>, <code>DeepSeek-VL2-small</code>, <code>DeepSeek-VL2</code>.
75
+
76
+ ## 3. Model Download
77
+
78
+ We release the DeepSeek-VL2 family, including <code>DeepSeek-VL2-tiny</code>, <code>DeepSeek-VL2-small</code>, and <code>DeepSeek-VL2</code>, to support a broader and more diverse range of research within both academic and commercial communities.
80
+ Please note that the use of this model is subject to the terms outlined in [License section](#5-license).
81
+
82
+ ### Huggingface
83
+
84
+ | Model | Sequence Length | Download |
85
+ |--------------|-----------------|-----------------------------------------------------------------------------|
86
+ | DeepSeek-VL2-tiny | 4096 | [🤗 Hugging Face](https://huggingface.co/deepseek-ai/deepseek-vl2-tiny) |
87
+ | DeepSeek-VL2-small | 4096 | [🤗 Hugging Face](https://huggingface.co/deepseek-ai/deepseek-vl2-small) |
88
+ | DeepSeek-VL2 | 4096 | [🤗 Hugging Face](https://huggingface.co/deepseek-ai/deepseek-vl2) |
89
+
90
+
91
+ ## 4. Quick Start
92
+
93
+ ### Installation
94
+
95
+ In a `Python >= 3.8` environment, install the necessary dependencies by running the following command:
96
+
97
+ ```shell
98
+ pip install -e .
99
+ ```
100
+
101
+ ### Simple Inference Example with One Image
102
+
103
+ **Note: You may need 80GB of GPU memory to run this script with deepseek-vl2-small, and even more for deepseek-vl2.**
104
+
105
+ ```python
106
+ import torch
107
+ from transformers import AutoModelForCausalLM
108
+
109
+ from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
110
+ from deepseek_vl2.utils.io import load_pil_images
111
+
112
+
113
+ # specify the path to the model
114
+ model_path = "deepseek-ai/deepseek-vl2-tiny"
115
+ vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
116
+ tokenizer = vl_chat_processor.tokenizer
117
+
118
+ vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
119
+ vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
120
+
121
+ ## single image conversation example
122
+ conversation = [
123
+ {
124
+ "role": "<|User|>",
125
+ "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
126
+ "images": ["./images/visual_grounding_1.jpeg"],
127
+ },
128
+ {"role": "<|Assistant|>", "content": ""},
129
+ ]
130
+
131
+ # load images and prepare for inputs
132
+ pil_images = load_pil_images(conversation)
133
+ prepare_inputs = vl_chat_processor(
134
+ conversations=conversation,
135
+ images=pil_images,
136
+ force_batchify=True,
137
+ system_prompt=""
138
+ ).to(vl_gpt.device)
139
+
140
+ # run image encoder to get the image embeddings
141
+ inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
142
+
143
+ # run the model to get the response
144
+ outputs = vl_gpt.language.generate(
145
+ inputs_embeds=inputs_embeds,
146
+ attention_mask=prepare_inputs.attention_mask,
147
+ pad_token_id=tokenizer.eos_token_id,
148
+ bos_token_id=tokenizer.bos_token_id,
149
+ eos_token_id=tokenizer.eos_token_id,
150
+ max_new_tokens=512,
151
+ do_sample=False,
152
+ use_cache=True
153
+ )
154
+
155
+ answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)
156
+ print(f"{prepare_inputs['sft_format'][0]}", answer)
157
+ ```
158
+
159
+ And the output is something like:
160
+ ```
161
+ <|User|>: <image>
162
+ <|ref|>The giraffe at the back.<|/ref|>.
163
+
164
+ <|Assistant|>: <|ref|>The giraffe at the back.<|/ref|><|det|>[[580, 270, 999, 900]]<|/det|><|end▁of▁sentence|>
165
+ ```
166
+
167
+ ### Simple Inference Example with Multiple Images
168
+
169
+ **Note: You may need 80GB of GPU memory to run this script with deepseek-vl2-small, and even more for deepseek-vl2.**
170
+
171
+ ```python
172
+ import torch
173
+ from transformers import AutoModelForCausalLM
174
+
175
+ from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
176
+ from deepseek_vl2.utils.io import load_pil_images
177
+
178
+
179
+ # specify the path to the model
180
+ model_path = "deepseek-ai/deepseek-vl2-tiny"
181
+ vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
182
+ tokenizer = vl_chat_processor.tokenizer
183
+
184
+ vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
185
+ vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
186
+
187
+ # multiple images/interleaved image-text
188
+ conversation = [
189
+ {
190
+ "role": "<|User|>",
191
+ "content": "This is image_1: <image>\n"
192
+ "This is image_2: <image>\n"
193
+ "This is image_3: <image>\n Can you tell me what are in the images?",
194
+ "images": [
195
+ "images/multi_image_1.jpeg",
196
+ "images/multi_image_2.jpeg",
197
+ "images/multi_image_3.jpeg",
198
+ ],
199
+ },
200
+ {"role": "<|Assistant|>", "content": ""}
201
+ ]
202
+
203
+ # load images and prepare for inputs
204
+ pil_images = load_pil_images(conversation)
205
+ prepare_inputs = vl_chat_processor(
206
+ conversations=conversation,
207
+ images=pil_images,
208
+ force_batchify=True,
209
+ system_prompt=""
210
+ ).to(vl_gpt.device)
211
+
212
+ # run image encoder to get the image embeddings
213
+ inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
214
+
215
+ # run the model to get the response
216
+ outputs = vl_gpt.language.generate(
217
+ inputs_embeds=inputs_embeds,
218
+ attention_mask=prepare_inputs.attention_mask,
219
+ pad_token_id=tokenizer.eos_token_id,
220
+ bos_token_id=tokenizer.bos_token_id,
221
+ eos_token_id=tokenizer.eos_token_id,
222
+ max_new_tokens=512,
223
+ do_sample=False,
224
+ use_cache=True
225
+ )
226
+
227
+ answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)
228
+ print(f"{prepare_inputs['sft_format'][0]}", answer)
229
+ ```
230
+
231
+ And the output is something like:
232
+ ```
233
+ <|User|>: This is image_1: <image>
234
+ This is image_2: <image>
235
+ This is image_3: <image>
236
+ Can you tell me what are in the images?
237
+
238
+ <|Assistant|>: The images show three different types of vegetables. Image_1 features carrots, which are orange with green tops. Image_2 displays corn cobs, which are yellow with green husks. Image_3 contains raw pork ribs, which are pinkish-red with some marbling.<|end▁of▁sentence|>
239
+ ```
240
+
241
+ ### Simple Inference Example with Incremental Prefilling
242
+
243
+ **Note: We use incremental prefilling to run inference with deepseek-vl2-small within 40GB of GPU memory.**
244
+
245
+ ```python
246
+ import torch
247
+ from transformers import AutoModelForCausalLM
248
+
249
+ from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
250
+ from deepseek_vl2.utils.io import load_pil_images
251
+
252
+
253
+ # specify the path to the model
254
+ model_path = "deepseek-ai/deepseek-vl2-small"
255
+ vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
256
+ tokenizer = vl_chat_processor.tokenizer
257
+
258
+ vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
259
+ vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
260
+
261
+ # multiple images/interleaved image-text
262
+ conversation = [
263
+ {
264
+ "role": "<|User|>",
265
+ "content": "This is image_1: <image>\n"
266
+ "This is image_2: <image>\n"
267
+ "This is image_3: <image>\n Can you tell me what are in the images?",
268
+ "images": [
269
+ "images/multi_image_1.jpeg",
270
+ "images/multi_image_2.jpeg",
271
+ "images/multi_image_3.jpeg",
272
+ ],
273
+ },
274
+ {"role": "<|Assistant|>", "content": ""}
275
+ ]
276
+
277
+ # load images and prepare for inputs
278
+ pil_images = load_pil_images(conversation)
279
+ prepare_inputs = vl_chat_processor(
280
+ conversations=conversation,
281
+ images=pil_images,
282
+ force_batchify=True,
283
+ system_prompt=""
284
+ ).to(vl_gpt.device)
285
+
286
+ with torch.no_grad():
287
+ # run image encoder to get the image embeddings
288
+ inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
289
+
290
+ # incremental_prefilling when using 40G GPU for vl2-small
291
+ inputs_embeds, past_key_values = vl_gpt.incremental_prefilling(
292
+ input_ids=prepare_inputs.input_ids,
293
+ images=prepare_inputs.images,
294
+ images_seq_mask=prepare_inputs.images_seq_mask,
295
+ images_spatial_crop=prepare_inputs.images_spatial_crop,
296
+ attention_mask=prepare_inputs.attention_mask,
297
+ chunk_size=512 # prefilling size
298
+ )
299
+
300
+ # run the model to get the response
301
+ outputs = vl_gpt.generate(
302
+ inputs_embeds=inputs_embeds,
303
+ input_ids=prepare_inputs.input_ids,
304
+ images=prepare_inputs.images,
305
+ images_seq_mask=prepare_inputs.images_seq_mask,
306
+ images_spatial_crop=prepare_inputs.images_spatial_crop,
307
+ attention_mask=prepare_inputs.attention_mask,
308
+ past_key_values=past_key_values,
309
+
310
+ pad_token_id=tokenizer.eos_token_id,
311
+ bos_token_id=tokenizer.bos_token_id,
312
+ eos_token_id=tokenizer.eos_token_id,
313
+ max_new_tokens=512,
314
+
315
+ do_sample=False,
316
+ use_cache=True,
317
+ )
318
+
319
+ answer = tokenizer.decode(outputs[0][len(prepare_inputs.input_ids[0]):].cpu().tolist(), skip_special_tokens=False)
320
+
321
+ print(f"{prepare_inputs['sft_format'][0]}", answer)
322
+ ```
323
+
324
+ And the output is something like:
325
+ ```
326
+ <|User|>: This is image_1: <image>
327
+ This is image_2: <image>
328
+ This is image_3: <image>
329
+ Can you tell me what are in the images?
330
+
331
+ <|Assistant|>: The first image contains carrots. The second image contains corn. The third image contains meat.<|end▁of▁sentence|>
332
+ ```
333
+
334
+ ### Full Inference Example
335
+ ```shell
336
+ # without incremental prefilling
337
+ CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2"
338
+
339
+ # with incremental prefilling, when using 40G GPU for vl2-small
340
+ CUDA_VISIBLE_DEVICES=0 python inference.py --model_path "deepseek-ai/deepseek-vl2-small" --chunk_size 512
341
+
342
+ ```
343
+
344
+
345
+ ### Gradio Demo
346
+
347
+ * Install the necessary dependencies:
348
+ ```shell
349
+ pip install -e .[gradio]
350
+ ```
351
+
352
+ * Then run the following command:
353
+
354
+ ```shell
355
+ # vl2-tiny, 3.37B-MoE in total, activated 1B, can be run on a single GPU < 40GB
356
+ CUDA_VISIBLE_DEVICES=2 python web_demo.py \
357
+ --model_name "deepseek-ai/deepseek-vl2-tiny" \
358
+ --port 37914
359
+
360
+
361
+ # vl2-small, 16.1B-MoE in total, activated 2.4B
362
+ # If running on an A100 40GB GPU, set `--chunk_size 512` to enable incremental prefilling and save memory; responses may be slower.
363
+ # If running on a GPU with more than 40GB of memory, you can omit `--chunk_size 512` for faster responses.
364
+ CUDA_VISIBLE_DEVICES=2 python web_demo.py \
365
+ --model_name "deepseek-ai/deepseek-vl2-small" \
366
+ --port 37914 \
367
+ --chunk_size 512
368
+
369
+ # vl2, 27.5B-MoE in total, activated 4.2B
370
+ CUDA_VISIBLE_DEVICES=2 python web_demo.py \
371
+ --model_name "deepseek-ai/deepseek-vl2" \
372
+ --port 37914
373
+ ```
374
+
375
+ * **Important**: This is a basic and native demo implementation without any deployment optimizations, which may result in slower performance. For production environments, consider using optimized deployment solutions, such as vllm, sglang, lmdeploy, etc. These optimizations will help achieve faster response times and better cost efficiency.
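
  For illustration only, here is a minimal sketch of what a client call could look like once the model is deployed behind an OpenAI-compatible endpoint (which vLLM, SGLang, and LMDeploy all provide), assuming the chosen backend supports this model; the base URL, port, and image URL below are placeholders:

  ```python
  # Hypothetical client for an OpenAI-compatible server fronting DeepSeek-VL2.
  # The endpoint, API key, and image URL are placeholders, not part of this repo.
  from openai import OpenAI

  client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

  response = client.chat.completions.create(
      model="deepseek-ai/deepseek-vl2-tiny",
      messages=[
          {
              "role": "user",
              "content": [
                  {"type": "image_url", "image_url": {"url": "https://example.com/sample.jpg"}},
                  {"type": "text", "text": "Describe this image."},
              ],
          }
      ],
      max_tokens=256,
      temperature=0.0,
  )
  print(response.choices[0].message.content)
  ```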
376
+
377
+ ## 5. License
378
+
379
+ This code repository is licensed under [MIT License](./LICENSE-CODE). The use of DeepSeek-VL2 models is subject to [DeepSeek Model License](./LICENSE-MODEL). DeepSeek-VL2 series supports commercial use.
380
+
381
+ ## 6. Citation
382
+
383
+ ```
384
+ @misc{wu2024deepseekvl2mixtureofexpertsvisionlanguagemodels,
385
+ title={DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding},
386
+ author={Zhiyu Wu and Xiaokang Chen and Zizheng Pan and Xingchao Liu and Wen Liu and Damai Dai and Huazuo Gao and Yiyang Ma and Chengyue Wu and Bingxuan Wang and Zhenda Xie and Yu Wu and Kai Hu and Jiawei Wang and Yaofeng Sun and Yukun Li and Yishi Piao and Kang Guan and Aixin Liu and Xin Xie and Yuxiang You and Kai Dong and Xingkai Yu and Haowei Zhang and Liang Zhao and Yisong Wang and Chong Ruan},
387
+ year={2024},
388
+ eprint={2412.10302},
389
+ archivePrefix={arXiv},
390
+ primaryClass={cs.CV},
391
+ url={https://arxiv.org/abs/2412.10302},
392
+ }
393
+ ```
394
+
395
+ ## 7. Contact
396
+
397
+ If you have any questions, please raise an issue or contact us at [[email protected]](mailto:[email protected]).
DeepSeek-VL2/.ipynb_checkpoints/inference-checkpoint.py ADDED
@@ -0,0 +1,185 @@
1
+ # Copyright (c) 2023-2024 DeepSeek.
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ # this software and associated documentation files (the "Software"), to deal in
5
+ # the Software without restriction, including without limitation the rights to
6
+ # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7
+ # the Software, and to permit persons to whom the Software is furnished to do so,
8
+ # subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15
+ # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16
+ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19
+
20
+ from argparse import ArgumentParser
21
+ from typing import List, Dict
22
+ import torch
23
+ from transformers import AutoModelForCausalLM
24
+ import PIL.Image
25
+
26
+ from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor
27
+ from deepseek_vl2.serve.app_modules.utils import parse_ref_bbox
28
+
29
+
30
+ def load_pil_images(conversations: List[Dict[str, str]]) -> List[PIL.Image.Image]:
31
+ """
32
+
33
+ Args:
34
+ conversations (List[Dict[str, str]]): the conversations with a list of messages. An example is :
35
+ [
36
+ {
37
+ "role": "User",
38
+ "content": "<image>\nExtract all information from this image and convert them into markdown format.",
39
+ "images": ["./examples/table_datasets.png"]
40
+ },
41
+ {"role": "Assistant", "content": ""},
42
+ ]
43
+
44
+ Returns:
45
+ pil_images (List[PIL.Image.Image]): the list of PIL images.
46
+
47
+ """
48
+
49
+ pil_images = []
50
+
51
+ for message in conversations:
52
+ if "images" not in message:
53
+ continue
54
+
55
+ for image_path in message["images"]:
56
+ pil_img = PIL.Image.open(image_path)
57
+ pil_img = pil_img.convert("RGB")
58
+ pil_images.append(pil_img)
59
+
60
+ return pil_images
61
+
62
+
63
+ def main(args):
64
+
65
+ dtype = torch.bfloat16
66
+
67
+ # specify the path to the model
68
+ model_path = args.model_path
69
+ vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
70
+ tokenizer = vl_chat_processor.tokenizer
71
+
72
+ vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
73
+ model_path,
74
+ trust_remote_code=True,
75
+ torch_dtype=dtype
76
+ )
77
+ vl_gpt = vl_gpt.cuda().eval()
78
+
79
+ # single image conversation example
80
+ conversation = [
81
+ {
82
+ "role": "<|User|>",
83
+ "content": "<image>\n<image>\n<|grounding|>In the first image, an object within the red rectangle is marked. Locate the object of the same category in the second image.",
84
+ "images": [
85
+ "images/incontext_visual_grounding_1.jpeg",
86
+ "images/icl_vg_2.jpeg"
87
+ ],
88
+ },
89
+ {"role": "<|Assistant|>", "content": ""},
90
+ ]
91
+
92
+ # conversation = [
93
+ # {
94
+ # "role": "<|User|>",
95
+ # "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
96
+ # "images": ["./images/visual_grounding_1.jpeg"],
97
+ # },
98
+ # {"role": "<|Assistant|>", "content": ""},
99
+ # ]
100
+
101
+ # load images and prepare for inputs
102
+ pil_images = load_pil_images(conversation)
103
+ print(f"len(pil_images) = {len(pil_images)}")
104
+
105
+ # input_ids = batched_input_ids,
106
+ # attention_mask = batched_attention_mask,
107
+ # labels = batched_labels,
108
+ # images_tiles = batched_images,
109
+ # images_seq_mask = batched_images_seq_mask,
110
+ # images_spatial_crop = batched_images_spatial_crop,
111
+ # sft_format = batched_sft_format,
112
+ # seq_lens = seq_lens
113
+
114
+ prepare_inputs = vl_chat_processor.__call__(
115
+ conversations=conversation,
116
+ images=pil_images,
117
+ force_batchify=True,
118
+ system_prompt=""
119
+ ).to(vl_gpt.device, dtype=dtype)
120
+
121
+ # for key in prepare_inputs.keys():
122
+ # value = prepare_inputs[key]
123
+ # if isinstance(value, list):
124
+ # print(key, len(value), type(value))
125
+ # elif isinstance(value, torch.Tensor):
126
+ # print(key, value.shape, type(value))
127
+
128
+ with torch.no_grad():
129
+ # run image encoder to get the image embeddings
130
+ # inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
131
+
132
+ # incremental_prefilling when using 40G GPU for vl2-small
133
+ inputs_embeds, past_key_values = vl_gpt.incremental_prefilling(
134
+ input_ids=prepare_inputs.input_ids,
135
+ images=prepare_inputs.images,
136
+ images_seq_mask=prepare_inputs.images_seq_mask,
137
+ images_spatial_crop=prepare_inputs.images_spatial_crop,
138
+ attention_mask=prepare_inputs.attention_mask,
139
+ chunk_size=args.chunk_size
140
+ )
141
+
142
+ # run the model to get the response
143
+ outputs = vl_gpt.generate(
144
+ # inputs_embeds=inputs_embeds[:, -1:],
145
+ # input_ids=prepare_inputs.input_ids[:, -1:],
146
+ inputs_embeds=inputs_embeds,
147
+ input_ids=prepare_inputs.input_ids,
148
+ images=prepare_inputs.images,
149
+ images_seq_mask=prepare_inputs.images_seq_mask,
150
+ images_spatial_crop=prepare_inputs.images_spatial_crop,
151
+ attention_mask=prepare_inputs.attention_mask,
152
+ past_key_values=past_key_values,
153
+
154
+ pad_token_id=tokenizer.eos_token_id,
155
+ bos_token_id=tokenizer.bos_token_id,
156
+ eos_token_id=tokenizer.eos_token_id,
157
+ max_new_tokens=512,
158
+
159
+ # do_sample=False,
160
+ # repetition_penalty=1.1,
161
+
162
+ do_sample=True,
163
+ temperature=0.4,
164
+ top_p=0.9,
165
+ repetition_penalty=1.1,
166
+
167
+ use_cache=True,
168
+ )
169
+
170
+ answer = tokenizer.decode(outputs[0][len(prepare_inputs.input_ids[0]):].cpu().tolist(), skip_special_tokens=False)
171
+ print(f"{prepare_inputs['sft_format'][0]}", answer)
172
+
173
+ vg_image = parse_ref_bbox(answer, image=pil_images[-1])
174
+ if vg_image is not None:
175
+ vg_image.save("./vg.jpg", format="JPEG", quality=85)
176
+
177
+
178
+ if __name__ == "__main__":
179
+ parser = ArgumentParser()
180
+ parser.add_argument("--model_path", type=str, required=True,
181
+ default="deepseek-ai/deepseek-vl2",
182
+ help="model name or local path to the model")
183
+ parser.add_argument("--chunk_size", type=int, default=512, help="chunk size for incremental prefilling")
184
+ args = parser.parse_args()
185
+ main(args)
DeepSeek-VL2/.ipynb_checkpoints/requirements-checkpoint.txt ADDED
@@ -0,0 +1,19 @@
1
+ torch==2.0.1
2
+ transformers==4.38.2
3
+ timm>=0.9.16
4
+ accelerate
5
+ sentencepiece
6
+ attrdict
7
+ einops
8
+
9
+ # for gradio demo
10
+ gradio==3.48.0
11
+ gradio-client==0.6.1
12
+ mdtex2html==1.3.0
13
+ pypinyin==0.50.0
14
+ tiktoken==0.5.2
15
+ tqdm==4.64.0
16
+ colorama==0.4.5
17
+ Pygments==2.12.0
18
+ markdown==3.4.1
19
+ SentencePiece==0.1.96
DeepSeek-VL2/deepseek_vl2.egg-info/requires.txt ADDED
@@ -0,0 +1,33 @@
1
+ torch>=2.0.1
2
+ transformers>=4.38.2
3
+ timm>=0.9.16
4
+ accelerate
5
+ sentencepiece
6
+ attrdict
7
+ einops
8
+
9
+ [gradio]
10
+ gradio==3.48.0
11
+ gradio-client==0.6.1
12
+ mdtex2html==1.3.0
13
+ pypinyin==0.50.0
14
+ tiktoken==0.5.2
15
+ tqdm==4.64.0
16
+ colorama==0.4.5
17
+ Pygments==2.12.0
18
+ markdown==3.4.1
19
+ SentencePiece==0.1.96
20
+
21
+ [lint]
22
+ isort
23
+ black[jupyter]>=22.6.0
24
+ pylint[spelling]>=2.15.0
25
+ flake8
26
+ flake8-bugbear
27
+ flake8-comprehensions
28
+ flake8-docstrings
29
+ flake8-pyi
30
+ flake8-simplify
31
+ ruff
32
+ pyenchant
33
+ pre-commit
DeepSeek-VL2/deepseek_vl2.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ deepseek_vl2
DeepSeek-VL2/deepseek_vl2/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright (c) 2023-2024 DeepSeek.
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ # this software and associated documentation files (the "Software"), to deal in
5
+ # the Software without restriction, including without limitation the rights to
6
+ # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7
+ # the Software, and to permit persons to whom the Software is furnished to do so,
8
+ # subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15
+ # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16
+ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19
+
20
+
21
+ # check if the python version is 3.10 or above
22
+ import sys
23
+
24
+ if sys.version_info >= (3, 10):
25
+ print("Python version is above 3.10, patching the collections module.")
26
+ # Monkey patch collections
27
+ import collections
28
+ import collections.abc
29
+
30
+ for type_name in collections.abc.__all__:
31
+ setattr(collections, type_name, getattr(collections.abc, type_name))
DeepSeek-VL2/deepseek_vl2/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (439 Bytes).
DeepSeek-VL2/deepseek_vl2/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (429 Bytes).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (348 Bytes).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (403 Bytes).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/configuration_deepseek.cpython-39.pyc ADDED
Binary file (9.47 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/conversation.cpython-311.pyc ADDED
Binary file (11.3 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/conversation.cpython-312.pyc ADDED
Binary file (10.5 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/conversation.cpython-39.pyc ADDED
Binary file (6.43 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/modeling_deepseek_vl_v2.cpython-311.pyc ADDED
Binary file (31.5 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/modeling_deepseek_vl_v2.cpython-312.pyc ADDED
Binary file (29.4 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/modeling_deepseek_vl_v2.cpython-39.pyc ADDED
Binary file (17.5 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/processing_deepseek_vl_v2.cpython-310.pyc ADDED
Binary file (18.6 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/processing_deepseek_vl_v2.cpython-311.pyc ADDED
Binary file (33.5 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/processing_deepseek_vl_v2.cpython-312.pyc ADDED
Binary file (30.4 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/processing_deepseek_vl_v2.cpython-39.pyc ADDED
Binary file (18.5 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/siglip_vit.cpython-310.pyc ADDED
Binary file (20.1 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/siglip_vit.cpython-312.pyc ADDED
Binary file (32.6 kB).
DeepSeek-VL2/deepseek_vl2/models/__pycache__/siglip_vit.cpython-39.pyc ADDED
Binary file (19.8 kB).
DeepSeek-VL2/deepseek_vl2/serve/app_modules/__pycache__/gradio_utils.cpython-312.pyc ADDED
Binary file (2.57 kB).
DeepSeek-VL2/deepseek_vl2/serve/app_modules/__pycache__/utils.cpython-312.pyc ADDED
Binary file (15 kB).
DeepSeek-VL2/deepseek_vl2/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (172 Bytes).
DeepSeek-VL2/deepseek_vl2/utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (170 Bytes).
DeepSeek-VL2/deepseek_vl2/utils/__pycache__/io.cpython-312.pyc ADDED
Binary file (2.6 kB).
DeepSeek-VL2/deepseek_vl2/utils/__pycache__/io.cpython-39.pyc ADDED
Binary file (1.94 kB).
DeepSeek-VL2/images/logo.png ADDED
DeepSeek-VL2/images/logo.svg ADDED
DeepSeek-VL2/images/monday.jpg ADDED
DeepSeek-VL2/images/visual_grounding_2.jpg ADDED
DeepSeek-VL2/images/vl2_teaser.jpeg ADDED
VLM2Vec/archive/gather_score_byckpt_aws.py ADDED
@@ -0,0 +1,132 @@
1
+ import os
2
+ import json
3
+ import re
4
+
5
+ # Define the datasets
6
+ datasets = [
7
+ "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211",
8
+ "OK-VQA", "A-OKVQA", "DocVQA", "InfographicsVQA", "ChartQA", "Visual7W", "ScienceQA", "VizWiz", "GQA", "TextVQA",
9
+ "VisDial", "CIRR", "VisualNews_t2i", "VisualNews_i2t", "MSCOCO_t2i", "MSCOCO_i2t", "NIGHTS", "WebQA", "FashionIQ", "Wiki-SS-NQ", "OVEN", "EDIS",
10
+ "MSCOCO", "RefCOCO", "RefCOCO-Matching", "Visual7W-Pointing"
11
+ ]
12
+
13
+ # Define the root directory containing the experiment directories
14
+ checkpoint_paths = [
15
+ # llava-next
16
+ "/fsx/home/ruimeng/runs/mmeb/mmeb005-llava16_mistral-3.lora8.mmeb20_sub100k-1344.bs1024pergpu128.GCq1p1.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-1000/",
17
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-llava16_mistral-3.lora8.mmeb20_sub100k-1344.bs1024pergpu128.GCq1p1.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-1400/",
18
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-llava16_mistral-1.lora8.mmeb20_sub50k.bs256pergpu32.GCq2p2.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
19
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-llava16_mistral-2.lora8.mmeb20_sub50k.bs1024pergpu128.GCq2p2.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
20
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-e5v-1.lora8.mmeb20_sub50k.bs1024pergpu128.GCq2p2.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
21
+
22
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-llava16_vicuna-1.lora8.mmeb20_sub50k.bs256pergpu32.GCq2p2.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
23
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-llava16_vicuna-2.lora8.mmeb20_sub50k.bs1024pergpu128.GCq2p2.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
24
+
25
+ # scale-up
26
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-scale002.lora8.mmeb17_sub100k_NoMSCOCO.bs1024pergpu128.GCq2p2.phi35.NormTemp002.len256crop9.lr5e5.step5kwarm200.8H100/checkpoint-1500/",
27
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-scale001.lora8.mmeb20_sub100k.bs1024pergpu128.GCq2p2.phi35.NormTemp002.len256crop9.lr2e5.step5kwarm200.8H100/checkpoint-1500/",
28
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-scale001.lora8.mmeb20_sub100k.bs1024pergpu128.GCq2p2.phi35.NormTemp002.len256crop9.lr2e5.step5kwarm200.8H100/checkpoint-2500/",
29
+
30
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-scale002-1.lora8.mmeb17_sub100k_NoMSCOCO.bs1024pergpu128.GCq2p2.phi35.NormTemp002.len256crop9.lr2e5.step2kwarm100.8H100/checkpoint-1000/",
31
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-scale002-1.lora8.mmeb17_sub100k_NoMSCOCO.bs1024pergpu128.GCq2p2.phi35.NormTemp002.len256crop9.lr2e5.step2kwarm100.8H100/checkpoint-1500/",
32
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb005-scale002-1.lora8.mmeb17_sub100k_NoMSCOCO.bs1024pergpu128.GCq2p2.phi35.NormTemp002.len256crop9.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
33
+
34
+ # batch size
35
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-bs1024.fullmodel.mmeb20_sub50k.bs1024pergpu128.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
36
+ # # task
37
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-taskVQA.fullmodel.mmeb20_sub50k.bs64pergpu8.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
38
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-taskRET.fullmodel.mmeb20_sub50k.bs64pergpu8.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
39
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-taskCLS.fullmodel.mmeb20_sub50k.bs64pergpu8.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
40
+ # # lora
41
+ # "/fsx/sfr/data/MMEB_exp/mmeb004-lora8.mmeb20_sub50k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
42
+ # "/fsx/sfr/data/MMEB_exp/mmeb004-lora32.mmeb20_sub50k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
43
+ # "/fsx/sfr/data/MMEB_exp/mmeb004-lora8_bs1k.mmeb20_sub50k.bs1024pergpu128.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
44
+ # # maxlen
45
+ # "/fsx/sfr/data/MMEB_exp/mmeb004-len128.fullmodel.mmeb20_sub50k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len128crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
46
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-len512.fullmodel.mmeb20_sub50k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len512crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
47
+ # # step
48
+ # "/fsx/sfr/data/MMEB_exp/mmeb004-step1k.fullmodel.mmeb20_sub50k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step1kwarm50.8H100/checkpoint-1000/",
49
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-step4k.fullmodel.mmeb20_sub50k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step4kwarm200.8H100/checkpoint-4000/",
50
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-step8k.fullmodel.mmeb20_sub50k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step8kwarm400.8H100/checkpoint-8000/",
51
+ # # crop
52
+ # "/fsx/sfr/data/MMEB_exp/mmeb004-crop1.fullmodel.mmeb20_sub50k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len256crop1.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
53
+ # "/fsx/sfr/data/MMEB_exp/mmeb004-crop2.fullmodel.mmeb20_sub50k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len256crop2.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
54
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-crop9.fullmodel.mmeb20_sub50k.bs256pergpu32.GCq2p2.phi35.NormTemp002.len256crop9.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
55
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-crop16.fullmodel.mmeb20_sub50k.bs256pergpu32.GCq1p1.phi35.NormTemp002.len256crop16.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
56
+ # data size
57
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-lora8_bs1k.mmeb20_sub50k.bs1024pergpu128.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-1000/",
58
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-lora4.mmeb20_sub50k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step2kwarm100.8H100/checkpoint-2000/",
59
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-data25k.fullmodel.mmeb20_sub25k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step4kwarm200.8H100/checkpoint-4000/",
60
+ # "/fsx/home/ruimeng/runs/mmeb/mmeb004-data100k.fullmodel.mmeb20_sub100k.bs256pergpu32.GCq4p4.phi35.NormTemp002.len256crop4.lr2e5.step4kwarm200.8H100/checkpoint-4000/",
61
+ ]
62
+
63
+
64
+ # Function to extract step number from checkpoint directory name
65
+ def extract_step(checkpoint_name):
66
+ match = re.search(r'checkpoint-(\d+)', checkpoint_name)
67
+ return int(match.group(1)) if match else float('inf')
68
+
69
+
70
+ # Dictionary to hold all gathered scores, organized by experiment
71
+ gathered_scores_by_exp = {}
72
+
73
+ # Loop through checkpoint directories
74
+ for checkpoint_path in checkpoint_paths:
75
+ step = extract_step(checkpoint_path)
76
+ experiment_dir = checkpoint_path.split("/")[-3]
77
+
78
+ # Check if it is a checkpoint directory, and a valid checkpoint dir
79
+ if str.isdigit(str(step)):
80
+ # Initialize a dictionary to store scores for this checkpoint
81
+ checkpoint_scores = {"experiment": experiment_dir, "checkpoint": str(step)}
82
+
83
+ # Go through each dataset and check if the corresponding score file exists
84
+ for dataset in datasets:
85
+ score_file = os.path.join(checkpoint_path, f"{dataset}_score.json") # Score file named like DatasetName_score.json
86
+
87
+ # Check if the score file exists
88
+ if os.path.isfile(score_file):
89
+ with open(score_file, "r") as f:
90
+ score_data = json.load(f) # Load the score JSON
91
+ checkpoint_scores[dataset] = score_data.get("acc", "N/A") # Assuming 'acc' is the key for accuracy
92
+ else:
93
+ checkpoint_scores[dataset] = "N/A" # If no score file, set to 'N/A'
94
+
95
+ # Append the scores for this checkpoint to the respective experiment group
96
+ gathered_scores_by_exp[experiment_dir] = checkpoint_scores
97
+
98
+
99
+
100
+ print('\n' * 5)
101
+ # Print gathered scores in a comma-separated format
102
+ header = ["experiment", "checkpoint"] + datasets
103
+ print(",".join(header)) # Print header
104
+
105
+ for experiment, scores in gathered_scores_by_exp.items():
106
+ row = [scores["experiment"], scores["checkpoint"]] + [str(scores[dataset]) for dataset in datasets]
107
+ print(",".join(row)) # Print each row of scores
108
+
109
+
110
+
111
+ header = ["dataset"] + list(gathered_scores_by_exp.keys())
112
+ print(",".join(header)) # Print header
113
+ # Additional Block: Print results per experiment, transposed (dataset per row, step per column)
114
+ # Print dataset names in the first column, and the scores for each checkpoint in subsequent columns
115
+ for dataset in datasets:
116
+ row = []
117
+ for experiment, scores in gathered_scores_by_exp.items():
118
+ row.append(str(scores[dataset]))
119
+ print(",".join([dataset] + row)) # Print header
120
+
121
+
122
+
123
+
124
+ # header = ["dataset"] + list(gathered_scores_by_exp.keys())
125
+ # print(",".join(header)) # Print header
126
+ # # Additional Block: Print results per experiment, transposed (dataset per row, step per column)
127
+ # # Print dataset names in the first column, and the scores for each checkpoint in subsequent columns
128
+ # for dataset in datasets:
129
+ # print(",".join([dataset, str(scores[dataset])]))
130
+ # for experiment, scores in gathered_scores_by_exp.items():
131
+ # print(f"\nResults for {experiment}:")
132
+ #
VLM2Vec/archive/merge.py ADDED
1
+ from src.arguments import ModelArguments
2
+ from transformers import HfArgumentParser, AutoProcessor
3
+
4
+ from src.model import MMEBModel
5
+ from evaluation.eval_utils import get_pred
6
+
7
+
8
+ def main():
9
+ parser = HfArgumentParser(ModelArguments)
10
+ model_args, = parser.parse_args_into_dataclasses()
11
+ model_args: ModelArguments
12
+
13
+ processor = AutoProcessor.from_pretrained(
14
+ model_args.model_name,
15
+ trust_remote_code=True,
16
+ num_crops=model_args.num_crops,
17
+ )
18
+
19
+ processor.tokenizer.padding_side = "right"
20
+ model = MMEBModel.load(model_args)
21
+ model.encoder._hf_peft_config_loaded = False
22
+ model.encoder.save_pretrained('full_model/', safe_serialization=False)
23
+
24
+
25
+ if __name__ == "__main__":
26
+ main()
VLM2Vec/archive/testset_stats.py ADDED
1
+ import json
2
+ import sys
3
+
4
+ import numpy as np
5
+
6
+ from src.arguments import ModelArguments, DataArguments, TrainingArguments
7
+ from transformers import HfArgumentParser, AutoProcessor
8
+ from src.dataset import EvalDataset
9
+ import re
10
+
11
+ def main():
12
+ for arg in sys.argv:
13
+ if arg.startswith("--local-rank="):
14
+ rank = arg.split("=")[1]
15
+ sys.argv.remove(arg)
16
+ sys.argv.append('--local_rank')
17
+ sys.argv.append(rank)
18
+ parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
19
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
20
+ model_args: ModelArguments
21
+ data_args: DataArguments
22
+ training_args: TrainingArguments
23
+
24
+ datasets = [
25
+ "GQA",
26
+ # "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R",
27
+ # "ObjectNet", "Country211",
28
+ # "OK-VQA", "A-OKVQA", "DocVQA", "InfographicsVQA", "ChartQA", "Visual7W", "ScienceQA", "VizWiz", "GQA",
29
+ # "TextVQA",
30
+ # "VisDial", "CIRR", "VisualNews_t2i", "VisualNews_i2t", "MSCOCO_t2i", "MSCOCO_i2t", "NIGHTS", "WebQA",
31
+ # "FashionIQ", "Wiki-SS-NQ", "OVEN", "EDIS",
32
+ # "MSCOCO", "RefCOCO", "RefCOCO-Matching", "Visual7W-Pointing"
33
+ ]
34
+
35
+ # ToDo: This part of code is a little bit hacky. Need to refactor later.
36
+ for idx, subset in enumerate(datasets):
37
+ eval_qry_dataset = EvalDataset(
38
+ data_args=data_args,
39
+ model_args=model_args,
40
+ subset=subset,
41
+ text_field="qry_text",
42
+ img_path_field="qry_img_path",
43
+ )
44
+ eval_tgt_dataset = EvalDataset(
45
+ data_args=data_args,
46
+ model_args=model_args,
47
+ subset=subset,
48
+ text_field="tgt_text",
49
+ img_path_field="tgt_img_path",
50
+ )
51
+ tgttokens = []
52
+ tgtstr_lens = []
53
+ for tgt in eval_tgt_dataset:
54
+ # print(tgt)
55
+ tokens = re.split('[^a-zA-Z]', tgt[0])
56
+ tgttokens.append(tokens)
57
+ tgtstr_lens.append(len(tokens))
58
+ pass
59
+
60
+ print(f'dataset: {subset}')
61
+ print(f'tgt-avg-len: {np.mean(tgtstr_lens)}')
62
+ pass
63
+
64
+
65
+ if __name__ == "__main__":
66
+ main()
VLM2Vec/evaluation/eval_flickr.py ADDED
@@ -0,0 +1,124 @@
1
+ from transformers import HfArgumentParser, AutoProcessor
2
+
3
+ from src.arguments import ModelArguments, DataArguments, TrainingArguments
4
+ from src.model import MMEBModel
5
+ from src.dataset import FlickrDataset
6
+ from src.collator import EvalCollator
7
+ from src.utils import load_processor
8
+
9
+ from torch.utils.data import DataLoader
10
+ import torch
11
+ from tqdm import tqdm
12
+ import numpy as np
13
+ import pickle
14
+ import os
15
+ from datasets import load_dataset
16
+ from eval_utils import get_pred
17
+
18
+
19
+ def main():
20
+ parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
21
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
22
+ model_args: ModelArguments
23
+ data_args: DataArguments
24
+ training_args: TrainingArguments
25
+
26
+ processor = load_processor(model_args)
27
+
28
+ eval_img_dataset = FlickrDataset(
29
+ modality="image", model_backbone=model_args.model_backbone
30
+ )
31
+ eval_txt_dataset = FlickrDataset(
32
+ modality="text", model_backbone=model_args.model_backbone
33
+ )
34
+ eval_collator = EvalCollator(
35
+ data_args=data_args,
36
+ model_args=model_args,
37
+ processor=processor,
38
+ )
39
+
40
+ model = MMEBModel.load(model_args)
41
+ model.eval()
42
+ model = model.to(training_args.device, dtype=torch.bfloat16)
43
+
44
+ eval_img_loader = DataLoader(
45
+ eval_img_dataset,
46
+ batch_size=training_args.per_device_eval_batch_size,
47
+ collate_fn=eval_collator,
48
+ shuffle=False,
49
+ drop_last=False,
50
+ num_workers=training_args.dataloader_num_workers,
51
+ )
52
+ eval_txt_loader = DataLoader(
53
+ eval_txt_dataset,
54
+ batch_size=training_args.per_device_eval_batch_size,
55
+ collate_fn=eval_collator,
56
+ shuffle=False,
57
+ drop_last=False,
58
+ num_workers=training_args.dataloader_num_workers,
59
+ )
60
+
61
+ encode_img_path = os.path.join(data_args.encode_output_path, f"flickr_image_1K-crop{model_args.num_crops}")
62
+ encode_txt_path = os.path.join(data_args.encode_output_path, f"flickr_text_1K-crop{model_args.num_crops}")
63
+
64
+ encoded_tensor = []
65
+ with torch.no_grad():
66
+ for batch in tqdm(eval_img_loader, desc="Encode image"):
67
+ batch = {key: value.to(training_args.device) for key, value in batch.items()}
68
+ output = model(qry=batch)
69
+ encoded_tensor.append(output["qry_reps"].cpu().detach().float().numpy())
70
+ encoded_tensor = np.concatenate(encoded_tensor)
71
+ with open(encode_img_path, 'wb') as f:
72
+ pickle.dump((encoded_tensor, eval_img_dataset.image_names), f)
73
+
74
+ encoded_tensor = []
75
+ with torch.no_grad():
76
+ for batch in tqdm(eval_txt_loader, desc="Encode text"):
77
+ batch = {key: value.to(training_args.device) for key, value in batch.items()}
78
+ output = model(qry=batch)
79
+ encoded_tensor.append(output["qry_reps"].cpu().detach().float().numpy())
80
+ encoded_tensor = np.concatenate(encoded_tensor)
81
+ with open(encode_txt_path, 'wb') as f:
82
+ pickle.dump((encoded_tensor, eval_txt_dataset.image_names), f)
83
+
84
+ with open(encode_img_path, 'rb') as f:
85
+ img_tensor, i2t_name = pickle.load(f)
86
+ img_tensor = torch.from_numpy(img_tensor)
87
+ with open(encode_txt_path, 'rb') as f:
88
+ txt_tensor, t2i_name = pickle.load(f)
89
+ txt_tensor = torch.from_numpy(txt_tensor)
90
+
91
+ # I -> T
92
+ similarity_matrix = torch.matmul(img_tensor, txt_tensor.T)
93
+ recall_at_k = {1: 0, 5: 0, 10: 0}
94
+ sorted_indices = torch.argsort(similarity_matrix, dim=1, descending=True)
95
+ for idx, file_name in enumerate(i2t_name):
96
+ top_k_indices = sorted_indices[idx, :10] # Get top-10 indices
97
+ top_k_file_names = [t2i_name[i.item()] for i in top_k_indices]
98
+ for k in [1, 5, 10]:
99
+ if file_name in top_k_file_names[:k]:
100
+ recall_at_k[k] += 1
101
+
102
+ for k in [1, 5, 10]:
103
+ recall_at_k[k] = recall_at_k[k] / len(i2t_name)
104
+ print(f"\033[91m Recall@{k}: {recall_at_k[k]:.4f}\033[0m")
105
+
106
+
107
+ # T -> I
108
+ similarity_matrix = torch.matmul(txt_tensor, img_tensor.T)
109
+ recall_at_k = {1: 0, 5: 0, 10: 0}
110
+ sorted_indices = torch.argsort(similarity_matrix, dim=1, descending=True)
111
+ for idx, file_name in enumerate(t2i_name):
112
+ top_k_indices = sorted_indices[idx, :10]
113
+ top_k_file_names = [i2t_name[i.item()] for i in top_k_indices]
114
+ for k in [1, 5, 10]:
115
+ if file_name in top_k_file_names[:k]:
116
+ recall_at_k[k] += 1
117
+
118
+ for k in [1, 5, 10]:
119
+ recall_at_k[k] = recall_at_k[k] / len(t2i_name)
120
+ print(f"\033[91m Recall@{k}: {recall_at_k[k]:.4f}\033[0m")
121
+
122
+
123
+ if __name__ == "__main__":
124
+ main()
VLM2Vec/figures/example.jpg ADDED
VLM2Vec/grad_cache/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ try:
2
+ from .grad_cache import GradCache
3
+ except ModuleNotFoundError:
4
+ pass
VLM2Vec/grad_cache/cachex/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .functional import chunk_encode, cache_grad, unchunk_args
2
+ from .tree_utils import tree_chunk, tree_unchunk
3
+
VLM2Vec/grad_cache/context_managers.py ADDED
@@ -0,0 +1,21 @@
1
+ import torch
2
+ from torch.utils.checkpoint import get_device_states, set_device_states
3
+
4
+
5
+ class RandContext:
6
+ def __init__(self, *tensors):
7
+ self.fwd_cpu_state = torch.get_rng_state()
8
+ self.fwd_gpu_devices, self.fwd_gpu_states = get_device_states(*tensors)
9
+
10
+ def __enter__(self):
11
+ self._fork = torch.random.fork_rng(
12
+ devices=self.fwd_gpu_devices,
13
+ enabled=True
14
+ )
15
+ self._fork.__enter__()
16
+ torch.set_rng_state(self.fwd_cpu_state)
17
+ set_device_states(self.fwd_gpu_devices, self.fwd_gpu_states)
18
+
19
+ def __exit__(self, exc_type, exc_val, exc_tb):
20
+ self._fork.__exit__(exc_type, exc_val, exc_tb)
21
+ self._fork = None
VLM2Vec/grad_cache/functional.py ADDED
@@ -0,0 +1,91 @@
1
+ from functools import wraps
2
+ from typing import Callable, Union, Tuple, Any
3
+
4
+ import torch
5
+ from torch import Tensor
6
+ from torch import distributed as dist
7
+
8
+ from .context_managers import RandContext
9
+
10
+
11
+ def cached(func: Callable[..., Tensor]):
12
+ """
13
+ A decorator that turns a pytorch model call function into a cache-compatible version.
14
+ :param func: A function that calls the pytorch and return representation tensor.
15
+ :return: A function that returns 1) representation leaf tensors for cache construction, 2) a closure function for
16
+ the 2nd forward and the cached backward. Call 2) with 1) as argument after calling backward on the loss Tensor.
17
+ """
18
+ @wraps(func)
19
+ def cache_func(*args, **kwargs):
20
+ rnd_state = RandContext()
21
+ with torch.no_grad():
22
+ reps_no_grad = func(*args, **kwargs)
23
+ if isinstance(reps_no_grad, Tensor):
24
+ reps_no_grad = (reps_no_grad, )
25
+ else:
26
+ assert all(isinstance(v, Tensor) for v in reps_no_grad)
27
+ leaf_reps = tuple(t.detach().requires_grad_() for t in reps_no_grad)
28
+
29
+ @wraps(func)
30
+ def forward_backward_func(cache_reps: Union[Tensor, Tuple[Tensor]]):
31
+ with rnd_state:
32
+ reps = func(*args, **kwargs)
33
+ if isinstance(reps, Tensor):
34
+ reps = (reps,)
35
+ if isinstance(cache_reps, Tensor):
36
+ cache_reps = (cache_reps,)
37
+ assert len(reps) == len(cache_reps)
38
+
39
+ surrogate = sum(map(lambda u, v: torch.dot(u.flatten(), v.grad.flatten()), reps, cache_reps), 0)
40
+ surrogate.backward()
41
+
42
+ return leaf_reps + (forward_backward_func,)
43
+ return cache_func
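
As a hedged illustration of the protocol described in the docstring above (the names `encode`, `model`, `q_batch`, `p_batch`, and `loss_fn` are stand-ins, not part of this file), a typical call sequence might look like:

```python
# Illustrative sketch only; `model`, `q_batch`, `p_batch`, and `loss_fn` are assumed.
@cached
def encode(model, batch):
    return model(batch)  # returns a representation Tensor

q_rep, q_closure = encode(model, q_batch)  # graph-less forward, detached leaf rep
p_rep, p_closure = encode(model, p_batch)

loss = loss_fn(q_rep, p_rep)  # build the loss on the leaf representations
loss.backward()               # populates q_rep.grad / p_rep.grad (the gradient cache)
q_closure(q_rep)              # second forward + cached backward through the model
p_closure(p_rep)
```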
44
+
45
+
46
+ def _cat_tensor_list(xx):
47
+ if isinstance(xx, list) and len(xx) > 0 and all(isinstance(x, Tensor) for x in xx):
48
+ return torch.cat(xx)
49
+ else:
50
+ return xx
51
+
52
+
53
+ def cat_input_tensor(func: Callable[..., Tensor]):
54
+ """
55
+ A decorator that concatenates positional and keyword arguments of type List[Tensor] into a single Tensor
56
+ on the 0 dimension. This can come in handy dealing with results of representation tensors from multiple
57
+ cached forward.
58
+ :param func: A loss function
59
+ :return: Decorated loss function for cached results.
60
+ """
61
+ @wraps(func)
62
+ def cat_f(*args, **kwargs):
63
+ args_cat = [_cat_tensor_list(x) for x in args]
64
+ kwargs_cat = dict((k, _cat_tensor_list(v)) for k, v in kwargs.items())
65
+ return func(*args_cat, **kwargs_cat)
66
+ return cat_f
67
+
68
+
69
+ def _maybe_gather_tensor(t: Any, axis: int):
70
+ if not isinstance(t, Tensor):
71
+ return t
72
+ gathered = [torch.empty_like(t) for _ in range(dist.get_world_size())]
73
+ dist.all_gather(gathered, t)
74
+ gathered[dist.get_rank()] = t
75
+ return torch.cat(gathered, dim=axis)
76
+
77
+
78
+ def gather_input_tensor(func: Callable[..., Tensor], axis=0):
79
+ """
80
+ A decorator that all-gather positional and keyword arguments of type Tensor and concatenate them on axis.
81
+ Intended to be used with distributed contrastive learning loss.
82
+ :param func: A loss function
83
+ :param axis: The axis the gathered tensors are concatenated.
84
+ :return: Decorated loss function for distributed training.
85
+ """
86
+ @wraps(func)
87
+ def f(*args, **kwargs):
88
+ args_gathered = [_maybe_gather_tensor(x, axis=axis) for x in args]
89
+ kwargs_gathered = dict((k, _maybe_gather_tensor(v, axis=axis)) for k, v in kwargs.items())
90
+ return func(*args_gathered, **kwargs_gathered)
91
+ return f
VLM2Vec/grad_cache/grad_cache.py ADDED
@@ -0,0 +1,279 @@
1
+ from typing import List, Union, Callable, Any, Tuple
2
+ from contextlib import nullcontext
3
+ from itertools import repeat
4
+ from collections import UserDict
5
+ import logging
6
+
7
+ import torch
8
+ from torch import nn, Tensor
9
+ from torch.cuda.amp import GradScaler, autocast
10
+ from grad_cache.context_managers import RandContext
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class GradCache:
16
+ """
17
+ Gradient Cache class. Implements input chunking, the first graph-less forward pass, gradient cache creation, and the
18
+ second forward & backward gradient computation. The optimizer step is not included. Native torch automatic mixed
19
+ precision is supported. The user needs to handle gradient unscaling and the scaler update after a gradient cache step.
20
+ """
21
+ def __init__(
22
+ self,
23
+ models: List[nn.Module],
24
+ chunk_sizes: Union[int, List[int]],
25
+ loss_fn: Callable[..., Tensor],
26
+ split_input_fn: Callable[[Any, int], Any] = None,
27
+ get_rep_fn: Callable[..., Tensor] = None,
28
+ fp16: bool = False,
29
+ scaler: GradScaler = None,
30
+ ):
31
+ """
32
+ Initialize the Gradient Cache class instance.
33
+ :param models: A list of all encoder models to be updated by the current cache.
34
+ :param chunk_sizes: An integer chunk size, or a list of integers giving the chunk size for each model.
35
+ :param loss_fn: A loss function that takes arbitrary numbers of representation tensors and
36
+ arbitrary numbers of keyword arguments as input. It should not in any case modify the input tensors' relations
37
+ in the autograd graph, which are later relied upon to create the gradient cache.
38
+ :param split_input_fn: An optional function that splits generic model input into chunks. If not provided, this
39
+ class will try its best to split the inputs of supported types. See `split_inputs` function.
40
+ :param get_rep_fn: An optional function that takes generic model output and returns representation tensors. If
41
+ not provided, the generic output is assumed to be the representation tensor.
42
+ :param fp16: If True, run mixed precision training, which requires scaler to also be set.
43
+ :param scaler: A GradScaler object for automatic mixed precision training.
44
+ """
45
+ self.models = models
46
+
47
+ if isinstance(chunk_sizes, int):
48
+ self.chunk_sizes = [chunk_sizes for _ in range(len(models))]
49
+ else:
50
+ self.chunk_sizes = chunk_sizes
51
+
52
+ self.split_input_fn = split_input_fn
53
+ self.get_rep_fn = get_rep_fn
54
+ self.loss_fn = loss_fn
55
+
56
+ if fp16:
57
+ assert scaler is not None, "mixed precision training requires a gradient scaler passed in"
58
+
59
+ self.fp16 = fp16
60
+ self.scaler = scaler
61
+
62
+ self._get_input_tensors_strict = False
63
+
64
+ def __call__(self, *args, **kwargs):
65
+ """
66
+ Call the cache_step function.
67
+ :return: Current step loss.
68
+ """
69
+ return self.cache_step(*args, **kwargs)
70
+
71
+ def split_inputs(self, model_input, chunk_size: int) -> List:
72
+ """
73
+ Split input into chunks. Will call user provided `split_input_fn` if specified. Otherwise,
74
+ it can handle input types of tensor, list of tensors and dictionary of tensors.
75
+ :param model_input: Generic model input.
76
+ :param chunk_size: Size of each chunk.
77
+ :return: A list of chunked model input.
78
+ """
79
+ # delegate splitting to user provided function
80
+ if self.split_input_fn is not None:
81
+ return self.split_input_fn(model_input, chunk_size)
82
+
83
+ if isinstance(model_input, (dict, UserDict)) and all(isinstance(x, Tensor) for x in model_input.values()):
84
+ keys = list(model_input.keys())
85
+ chunked_tensors = [model_input[k].split(chunk_size, dim=0) for k in keys]
86
+ return [dict(zip(kk, tt)) for kk, tt in zip(repeat(keys), zip(*chunked_tensors))]
87
+
88
+ elif isinstance(model_input, list) and all(isinstance(x, Tensor) for x in model_input):
89
+ chunked_x = [t.split(chunk_size, dim=0) for t in model_input]
90
+ return [list(s) for s in zip(*chunked_x)]
91
+
92
+ elif isinstance(model_input, Tensor):
93
+ return list(model_input.split(chunk_size, dim=0))
94
+
95
+ elif isinstance(model_input, tuple) and list(map(type, model_input)) == [list, dict]:
96
+ args_chunks = self.split_inputs(model_input[0], chunk_size)
97
+ kwargs_chunks = self.split_inputs(model_input[1], chunk_size)
98
+ return list(zip(args_chunks, kwargs_chunks))
99
+
100
+ else:
101
+ raise NotImplementedError(f'Model input split not implemented for type {type(model_input)}')
102
+
103
+ def get_input_tensors(self, model_input) -> List[Tensor]:
104
+ """
105
+ Recursively go through model input and grab all tensors, which are then used to record current device random
106
+ states. This method will do its best to parse types of Tensor, tuple, list, dict and UserDict. Other types will
107
+ be ignored unless self._get_input_tensors_strict is set to True, in which case an exception will be raised.
108
+ :param model_input: input to model
109
+ :return: all torch tensors in model_input
110
+ """
111
+ if isinstance(model_input, Tensor):
112
+ return [model_input]
113
+
114
+ elif isinstance(model_input, (list, tuple)):
115
+ return sum((self.get_input_tensors(x) for x in model_input), [])
116
+
117
+ elif isinstance(model_input, (dict, UserDict)):
118
+ return sum((self.get_input_tensors(x) for x in model_input.values()), [])
119
+
120
+ elif self._get_input_tensors_strict:
121
+ raise NotImplementedError(f'get_input_tensors not implemented for type {type(model_input)}')
122
+
123
+ else:
124
+ return []
125
+
126
+ def model_call(self, model: nn.Module, model_input):
127
+ """
128
+ Literally call the model's __call__ method.
129
+ :param model: model to be called
130
+ :param model_input: input to the model call
131
+ :return: model output
132
+ """
133
+ with autocast() if self.fp16 else nullcontext():
134
+ if isinstance(model_input, Tensor):
135
+ return model(model_input)
136
+ elif isinstance(model_input, list):
137
+ return model(*model_input)
138
+ elif isinstance(model_input, (dict, UserDict)):
139
+ return model(**model_input)
140
+ elif isinstance(model_input, tuple) and list(map(type, model_input)) == [list, dict]:
141
+ model_args, model_kwargs = model_input
142
+ return model(*model_args, **model_kwargs)
143
+ else:
144
+ raise NotImplementedError
145
+
146
+ def get_reps(self, model_out) -> Tensor:
147
+ """
148
+ Return representation tensor from generic model output
149
+ :param model_out: generic model output
150
+ :return: a single tensor corresponding to the model representation output
151
+ """
152
+ if self.get_rep_fn is not None:
153
+ return self.get_rep_fn(model_out)
154
+ else:
155
+ return model_out
156
+
157
+ def compute_loss(self, *reps: Tensor, **loss_kwargs) -> Tensor:
158
+ """
159
+ Compute the loss based on the representation tensors. The tensors should be ordered the same as the list of models
160
+ registered in this GradCache class instance.
161
+ :param reps: Representations for computing the loss.
162
+ :param loss_kwargs: Keyword arguments input to the loss function.
163
+ :return: the loss tensor.
164
+ """
165
+ loss = self.loss_fn(*reps, **loss_kwargs)
166
+ return loss
167
+
168
+ def forward_no_grad(
169
+ self,
170
+ model: nn.Module,
171
+ model_inputs,
172
+ ) -> Tuple[Tensor, List[RandContext]]:
173
+ """
174
+ The first forward pass without gradient computation.
175
+ :param model: Encoder model.
176
+ :param model_inputs: Model input already broken into chunks.
177
+ :return: A tuple of a) representations and b) recorded random states.
178
+ """
179
+ rnd_states = []
180
+ model_reps = []
181
+
182
+ with torch.no_grad():
183
+ for x in model_inputs:
184
+ rnd_states.append(RandContext(*self.get_input_tensors(x)))
185
+ y = self.model_call(model, x)
186
+ model_reps.append(self.get_reps(y))
187
+
188
+ # concatenate all sub-batch representations
189
+ model_reps = torch.cat(model_reps, dim=0)
190
+ return model_reps, rnd_states
191
+
192
+ def build_cache(self, *reps: Tensor, **loss_kwargs) -> Tuple[List[Tensor], Tensor]:
193
+ """
194
+ Compute the gradient cache
195
+ :param reps: Computed representations from all encoder models
196
+ :param loss_kwargs: Extra keyword arguments to the loss function
197
+ :return: A tuple of a) gradient cache for each encoder model, and b) loss tensor
198
+ """
199
+ reps = [r.detach().requires_grad_() for r in reps]
200
+ with autocast() if self.fp16 else nullcontext():
201
+ loss = self.compute_loss(*reps, **loss_kwargs)
202
+
203
+ if self.fp16:
204
+ self.scaler.scale(loss).backward()
205
+ else:
206
+ loss.backward()
207
+
208
+ cache = [r.grad for r in reps]
209
+
210
+ return cache, loss.detach()
211
+
212
+ def forward_backward(
213
+ self,
214
+ model: nn.Module,
215
+ model_inputs,
216
+ cached_gradients: List[Tensor],
217
+ random_states: List[RandContext],
218
+ no_sync_except_last: bool = False
219
+ ):
220
+ """
221
+ Run the second forward and the backward pass to compute gradient for a model.
222
+ :param model: Encoder model.
223
+ :param model_inputs: Chunked input to the encoder model.
224
+ :param cached_gradients: Chunked gradient cache tensor for each input.
225
+ :param random_states: Each input's device random state during the first forward.
226
+ :param no_sync_except_last: If True, under distributed setup, only trigger gradient reduction across processes
227
+ for the last sub-batch's forward-backward pass.
228
+ """
229
+ if no_sync_except_last:
230
+ sync_contexts = [model.no_sync for _ in range(len(model_inputs) - 1)] + [nullcontext]
231
+ else:
232
+ sync_contexts = [nullcontext for _ in range(len(model_inputs))]
233
+
234
+ for x, state, gradient, sync_context in zip(model_inputs, random_states, cached_gradients, sync_contexts):
235
+ with sync_context():
236
+ with state:
237
+ y = self.model_call(model, x)
238
+ reps = self.get_reps(y)
239
+
240
+ surrogate = torch.dot(reps.flatten(), gradient.flatten())
241
+ surrogate.backward()
242
+
243
+ def cache_step(
244
+ self,
245
+ *model_inputs,
246
+ no_sync_except_last: bool = False,
247
+ **loss_kwargs
248
+ ) -> Tensor:
249
+ """
250
+ Run a cached step to compute gradient over the inputs.
251
+ :param model_inputs: Input to each encoder model. Should be in the same order as the class's models.
252
+ :param no_sync_except_last: If True, under distributed setup, for each model, only trigger gradient reduction
253
+ across processes for the last sub-batch's forward-backward pass.
254
+ :param loss_kwargs: Additional keyword arguments to the loss function.
255
+ :return: The current step's loss.
256
+ """
257
+ all_reps = []
258
+ all_rnd_states = []
259
+
260
+ if no_sync_except_last:
261
+ assert all(map(lambda m: isinstance(m, nn.parallel.DistributedDataParallel), self.models)), \
262
+ 'Some of the models are not wrapped in DistributedDataParallel. Make sure you are running DDP with ' \
263
+ 'proper initializations.'
264
+
265
+ model_inputs = [self.split_inputs(x, chunk_size) for x, chunk_size in zip(model_inputs, self.chunk_sizes)]
266
+
267
+ for model, x in zip(self.models, model_inputs):
268
+ model_reps, rnd_states = self.forward_no_grad(model, x)
269
+ all_reps.append(model_reps)
270
+ all_rnd_states.append(rnd_states)
271
+
272
+ cache, loss = self.build_cache(*all_reps, **loss_kwargs)
273
+ cache = [c.split(chunk_size) for c, chunk_size in zip(cache, self.chunk_sizes)]
274
+
275
+ for model, x, model_cache, rnd_states in zip(
276
+ self.models, model_inputs, cache, all_rnd_states):
277
+ self.forward_backward(model, x, model_cache, rnd_states, no_sync_except_last=no_sync_except_last)
278
+
279
+ return loss
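For reference, here is a minimal sketch of driving the `GradCache` class above: register the encoders and a loss function, choose a chunk size, and call the instance once per optimization step. The toy encoders, in-batch loss, and batch sizes below are illustrative assumptions, not the repository's actual training setup.

```python
# Illustrative sketch (toy encoders/loss, assumptions): one GradCache step.
import torch
from torch import nn
from torch.nn import functional as F

from grad_cache.grad_cache import GradCache

def contrastive_loss(q_reps, p_reps):
    logits = q_reps @ p_reps.t()
    labels = torch.arange(q_reps.size(0), device=q_reps.device)
    return F.cross_entropy(logits, labels)

q_encoder = nn.Linear(32, 32)   # toy query encoder, assumption
p_encoder = nn.Linear(32, 32)   # toy passage encoder, assumption
optimizer = torch.optim.AdamW(
    list(q_encoder.parameters()) + list(p_encoder.parameters()), lr=1e-3)

gc = GradCache(
    models=[q_encoder, p_encoder],
    chunk_sizes=8,              # sub-batch size used by both forward passes
    loss_fn=contrastive_loss,
)

queries = torch.randn(64, 32)   # large effective batch, processed in chunks of 8
passages = torch.randn(64, 32)

optimizer.zero_grad()
loss = gc(queries, passages)    # chunked no-grad forward, cache build, second forward/backward
optimizer.step()
print(loss.item())
```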
VLM2Vec/grad_cache/loss.py ADDED
@@ -0,0 +1,80 @@
1
+ import torch
2
+ from torch import Tensor
3
+ from torch.nn import functional as F
4
+ from torch import distributed as dist
5
+
6
+ from src import dist_utils
7
+
8
+
9
+ class InExampleContrastiveLoss:
10
+ """
11
+ Categorization loss: cross_entropy of 1 out of K classes (target labels)
12
+ x.shape=[bsz, hdim], y.shape=[bsz, num_label, hdim]
13
+ """
14
+ def __init__(self, n_hard_negatives: int = 0, temperature: float = 1.0, ndim: int = None, *args, **kwargs):
15
+ self.target_per_qry = n_hard_negatives + 1
16
+ self.temperature = temperature
17
+ self.ndim = ndim
18
+
19
+ def __call__(self, x: Tensor, y: Tensor, reduction: str = 'mean'):
20
+ # print("gather InExampleContrastiveLoss")
21
+ if torch.distributed.is_initialized():
22
+ x = dist_utils.dist_gather(x)
23
+ y = dist_utils.dist_gather(y)
24
+ bsz, ndim = x.size(0), x.size(1)
25
+ target = torch.zeros(bsz, dtype=torch.long, device=x.device)
26
+ if self.ndim:
27
+ ndim = self.ndim
28
+ x = x[:, :ndim]
29
+ y = y[:, :ndim]
30
+ logits = torch.einsum('bod,bsd->bs', x.view(bsz, 1, ndim), y.view(bsz, -1, ndim)) * self.temperature
31
+ preds = torch.argmax(logits, dim=-1)
32
+ loss = F.cross_entropy(logits, target, reduction=reduction)
33
+ loss_detail = {"logits": logits, "labels": target, "preds": preds}
34
+ return loss, loss_detail
35
+
36
+
37
+ class SimpleContrastiveLoss:
38
+ def __init__(self, n_hard_negatives: int = 0, temperature: float = 1.0, *args, **kwargs):
39
+ self.target_per_qry = n_hard_negatives + 1
40
+ self.temperature = temperature
41
+
42
+ def __call__(self, x: Tensor, y: Tensor, target: Tensor = None, reduction: str = 'mean'):
43
+ # print("gather SimpleContrastiveLoss")
44
+ if target is None:
45
+ assert x.size(0) * self.target_per_qry == y.size(0)
46
+ target = torch.arange(0, y.size(0), step=self.target_per_qry, dtype=torch.long, device=x.device)
47
+ logits = torch.matmul(x, y.transpose(0, 1)) * self.temperature
48
+ preds = torch.argmax(logits, dim=-1)
49
+ loss = F.cross_entropy(logits, target, reduction=reduction)
50
+ loss_detail = {"logits": logits, "labels": target, "preds": preds}
51
+ return loss, loss_detail
52
+
53
+
54
+ class DistributedContrastiveLoss(SimpleContrastiveLoss):
55
+ def __init__(self, n_hard_negatives: int = 0, temperature: float = 1.0, *args, **kwargs):
56
+ assert dist.is_initialized(), "Distributed training has not been properly initialized."
57
+
58
+ super().__init__(n_hard_negatives=n_hard_negatives, temperature=temperature)
59
+ self.world_size = dist.get_world_size()
60
+ self.rank = dist.get_rank()
61
+
62
+ def __call__(self, x: Tensor, y: Tensor, **kwargs):
63
+ # print("gather DistributedContrastiveLoss")
64
+ dist_x = self.gather_tensor(x)
65
+ dist_y = self.gather_tensor(y)
66
+
67
+ return super().__call__(dist_x, dist_y, **kwargs)
68
+
69
+ def gather_tensor(self, t):
70
+ gathered = [torch.empty_like(t) for _ in range(self.world_size)]
71
+ dist.all_gather(gathered, t)
72
+ gathered[self.rank] = t
73
+ return torch.cat(gathered, dim=0)
74
+
75
+
76
+ LossName2LossCls = {
77
+ "inexample_contrastive": InExampleContrastiveLoss,
78
+ "inbatch_contrastive": SimpleContrastiveLoss,
79
+ "distributed_inbatch_contrastive": DistributedContrastiveLoss,
80
+ }
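A small sketch of calling `SimpleContrastiveLoss` directly is shown below. Note that importing `grad_cache.loss` assumes VLM2Vec's `src` package is importable, since the module pulls in `src.dist_utils`; the random tensors are purely illustrative. With `n_hard_negatives=1`, each query is scored against all targets and its positive is expected at index `2 * i`.

```python
# Illustrative sketch (random tensors, assumptions): SimpleContrastiveLoss by hand.
# Requires VLM2Vec's `src` package on PYTHONPATH because grad_cache.loss imports it.
import torch

from grad_cache.loss import SimpleContrastiveLoss

loss_fn = SimpleContrastiveLoss(n_hard_negatives=1, temperature=1.0)

x = torch.randn(4, 16)        # 4 query embeddings
y = torch.randn(8, 16)        # 2 targets per query: positive followed by a hard negative
loss, detail = loss_fn(x, y)  # default target indices are [0, 2, 4, 6]
print(loss.item(), detail["preds"])
```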
VLM2Vec/grad_cache/minigc_cmd.md ADDED
@@ -0,0 +1,90 @@
1
+
2
+
3
+ #### Prerequisite
4
+ ```bash
5
+ ENV_PATH=/export/share/ruimeng/env/anaconda/envs/llm/bin/ninja
6
+ export PATH="${ENV_PATH}/:$PATH"
7
+
8
+ export NCCL_DEBUG=WARN
9
+ export HF_DATASETS_CACHE=/export/xgen-embedding/data/.hfdata_cache
10
+ export TRANSFORMERS_CACHE=/export/xgen-embedding/data/.hfmodel_cache/
11
+ export TOKENIZERS_PARALLELISM=true
12
+ export WANDB_DISABLED=false
13
+ export WANDB_PROJECT=mini-gradcache
14
+ export WANDB_API_KEY=local-d64a4127e8d4a1782aedbb72e76080b3dfbf89dd
15
+ export WANDB_BASE_URL=https://salesforceairesearch.wandb.io
16
+ ```
17
+
18
+ ```bash
19
+ # gpu0-3, DDP4-bs4096-accum4, 29922MB, hang at epoch34
20
+ export EXP_NAME=GC-4gpu-bs4096-accum16-step10k
21
+ export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME
22
+ export WANDB_DIR=$EXP_DIR/wandb
23
+ export WANDB_NAME=$EXP_NAME
24
+ export WORLD_SIZE=4
25
+ mkdir -p $EXP_DIR/wandb
26
+ rm -rf $EXP_DIR/*
27
+ cd /export/home/project/search/xgen-embedding/
28
+ CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 --master_port=4403 --max_restarts=0 mini_gc.py --model_name_or_path bert-base-uncased --output_dir $EXP_DIR --q_len 128 --d_len 256 --batch_size 4096 --chunk_sizes 256 2>&1 | tee $EXP_DIR/train.log
29
+
30
+
31
+ # gpu0-3, DDP4-bs256-accum4, 11818MB
32
+ export EXP_NAME=GC-4gpu-bs256-accum4-step10k
33
+ export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME
34
+ export WANDB_DIR=$EXP_DIR/wandb
35
+ export WANDB_NAME=$EXP_NAME
36
+ export WORLD_SIZE=4
37
+ mkdir -p $EXP_DIR/wandb
38
+ rm -rf $EXP_DIR/*
39
+ cd /export/home/project/search/xgen-embedding/
40
+ CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 --master_port=4403 --max_restarts=0 mini_gc.py --model_name_or_path bert-base-uncased --output_dir $EXP_DIR --q_len 128 --d_len 256 --batch_size 64 --chunk_sizes 16 2>&1 | tee $EXP_DIR/train.log
41
+
42
+
43
+
44
+ # gpu45, DDP2-bs256-accum2, 15742MB
45
+ export EXP_NAME=GC-2gpu-bs256-accum2-step10k
46
+ export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME
47
+ export WANDB_DIR=$EXP_DIR/wandb
48
+ export WANDB_NAME=$EXP_NAME
49
+ export WORLD_SIZE=1
50
+ mkdir -p $EXP_DIR/wandb
51
+ rm -rf $EXP_DIR/*
52
+ cd /export/home/project/search/xgen-embedding/
53
+ CUDA_VISIBLE_DEVICES=4,5 torchrun --nproc_per_node=2 --master_port=2245 --max_restarts=0 mini_gc.py --model_name_or_path bert-base-uncased --output_dir $EXP_DIR --q_len 128 --d_len 256 --batch_size 128 --chunk_sizes 64 2>&1 | tee $EXP_DIR/train.log
54
+
55
+
56
+ # gpu6, bs256-accum4, 9GB
57
+ export EXP_NAME=GC-1gpu-bs256-accum4-step10k
58
+ export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME
59
+ export WANDB_DIR=$EXP_DIR/wandb
60
+ export WANDB_NAME=$EXP_NAME
61
+ export WORLD_SIZE=1
62
+ mkdir -p $EXP_DIR/wandb
63
+ rm -rf $EXP_DIR/*
64
+ cd /export/home/project/search/xgen-embedding/
65
+ CUDA_VISIBLE_DEVICES=6 python -m mini_gc --model_name_or_path bert-base-uncased --output_dir $EXP_DIR --q_len 128 --d_len 256 --batch_size 256 --chunk_sizes 64 2>&1 | tee $EXP_DIR/train.log
66
+
67
+
68
+ # gpu6, bs256-accum2, 18GB
69
+ export EXP_NAME=GC-1gpu-bs256-accum2-step10k
70
+ export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME
71
+ export WANDB_DIR=$EXP_DIR/wandb
72
+ export WANDB_NAME=$EXP_NAME
73
+ export WORLD_SIZE=1
74
+ mkdir -p $EXP_DIR/wandb
75
+ rm -rf $EXP_DIR/*
76
+ cd /export/home/project/search/xgen-embedding/
77
+ CUDA_VISIBLE_DEVICES=6 python -m mini_gc --model_name_or_path bert-base-uncased --output_dir $EXP_DIR --q_len 128 --d_len 256 --batch_size 256 --chunk_sizes 128 2>&1 | tee $EXP_DIR/train.log
78
+
79
+
80
+ # gpu7, bs256-accum1, 38012MB
81
+ export EXP_NAME=GC-1gpu-bs256-accum1-step10k-baseline
82
+ export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME
83
+ export WANDB_DIR=$EXP_DIR/wandb
84
+ export WANDB_NAME=$EXP_NAME
85
+ export WORLD_SIZE=1
86
+ mkdir -p $EXP_DIR/wandb
87
+ rm -rf $EXP_DIR/*
88
+ cd /export/home/project/search/xgen-embedding/
89
+ CUDA_VISIBLE_DEVICES=7 python -m mini_gc --model_name_or_path bert-base-uncased --output_dir $EXP_DIR --q_len 128 --d_len 256 --batch_size 256 --chunk_sizes -1 2>&1 | tee $EXP_DIR/train.log
90
+ ```
VLM2Vec/scripts/llava_next/demo.py ADDED
@@ -0,0 +1,46 @@
1
+ from src.model import MMEBModel
2
+ from src.arguments import ModelArguments
3
+ from src.utils import load_processor
4
+
5
+ import torch
6
+ from transformers import HfArgumentParser, AutoProcessor
7
+ from PIL import Image
8
+ import numpy as np
9
+
10
+
11
+ model_args = ModelArguments(
12
+ model_name='TIGER-Lab/VLM2Vec-LLaVa-Next',
13
+ pooling='last',
14
+ normalize=True,
15
+ model_backbone='llava_next')
16
+
17
+ processor = load_processor(model_args)
18
+
19
+ model = MMEBModel.load(model_args)
20
+ model.eval()
21
+ model = model.to('cuda', dtype=torch.bfloat16)
22
+
23
+ # Image + Text -> Text
24
+ inputs = processor(text='<image> Represent the given image with the following question: What is in the image',
25
+ images=Image.open('figures/example.jpg'),
26
+ return_tensors="pt")
27
+ inputs = {key: value.to('cuda') for key, value in inputs.items()}
28
+ qry_output = model(qry=inputs)["qry_reps"]
29
+
30
+ string = 'A cat and a dog'
31
+ inputs = processor(text=string,
32
+ images=None,
33
+ return_tensors="pt")
34
+ inputs = {key: value.to('cuda') for key, value in inputs.items()}
35
+ tgt_output = model(tgt=inputs)["tgt_reps"]
36
+ print(string, '=', model.compute_similarity(qry_output, tgt_output))
37
+ ## A cat and a dog = tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
38
+
39
+ string = 'A cat and a tiger'
40
+ inputs = processor(text=string,
41
+ images=None,
42
+ return_tensors="pt")
43
+ inputs = {key: value.to('cuda') for key, value in inputs.items()}
44
+ tgt_output = model(tgt=inputs)["tgt_reps"]
45
+ print(string, '=', model.compute_similarity(qry_output, tgt_output))
46
+ ## A cat and a tiger = tensor([[0.3555]], device='cuda:0', dtype=torch.bfloat16)
VLM2Vec/scripts/llava_next/run_eval_flickr_llava_next.sh ADDED
@@ -0,0 +1,21 @@
1
+ export PYTHONPATH=../VLM2Vec/:$PYTHONPATH
2
+
3
+
4
+ CUDA_VISIBLE_DEVICES=0 python evaluation/eval_flickr.py \
5
+ --model_name TIGER-Lab/VLM2Vec-LLaVa-Next \
6
+ --model_backbone llava_next \
7
+ --max_len 256 \
8
+ --pooling last --normalize True \
9
+ --per_device_eval_batch_size 16 \
10
+ --encode_output_path /home/ziyan/MMEB_eval/flickr_new/
11
+
12
+
13
+ ## I -> T:
14
+ #Recall@1: 0.9400
15
+ #Recall@5: 0.9930
16
+ #Recall@10: 0.9960
17
+ #
18
+ ## T -> I
19
+ #Recall@1: 0.8024
20
+ #Recall@5: 0.9494
21
+ #Recall@10: 0.9736
VLM2Vec/src/arguments.py ADDED
@@ -0,0 +1,121 @@
1
+ from dataclasses import dataclass, field
2
+ from transformers import TrainingArguments
3
+ from typing import List
4
+
5
+
6
+ @dataclass
7
+ class ModelArguments:
8
+ model_name: str = field(
9
+ metadata={"help": "huggingface model name or path"}
10
+ )
11
+ model_backbone: str = field(
12
+ metadata={"help": "vlm backbone"}
13
+ )
14
+ processor_name: str = field(
15
+ default=None, metadata={"help": "processor_name, huggingface model name or path"}
16
+ )
17
+ model_type: str = field(
18
+ default=None, metadata={"help": "lavis model type"}
19
+ )
20
+ checkpoint_path: str = field(
21
+ default=None, metadata={"help": "a local model path"}
22
+ )
23
+ pooling: str = field(
24
+ default='last',
25
+ metadata={"help": "pooling method for encoder"}
26
+ )
27
+ normalize: bool = field(
28
+ default=False,
29
+ metadata={"help": "normalize query and passage representations"}
30
+ )
31
+ temperature: float = field(
32
+ default=0.02,
33
+ metadata={"help": "temperature for softmax"}
34
+ )
35
+ lora: bool = field(
36
+ default=False, metadata={"help": "do parameter-efficient fine-tuning with lora"}
37
+ )
38
+ lora_r: int = field(
39
+ default=16,
40
+ metadata={"help": "lora r"}
41
+ )
42
+ lora_alpha: int = field(
43
+ default=64,
44
+ metadata={"help": "lora alpha"}
45
+ )
46
+ lora_dropout: float = field(
47
+ default=0.1,
48
+ metadata={"help": "lora dropout"}
49
+ )
50
+ lora_target_modules: str = field(
51
+ default="qkv_proj,o_proj,gate_up_proj,down_proj,k_proj,q_proj,out_proj,v_proj",
52
+ metadata={"help": "lora target modules"}
53
+ )
54
+ num_crops: int = field(
55
+ default=16,
56
+ metadata={"help": "number of crops used in image encoder"}
57
+ )
58
+
59
+
60
+ @dataclass
61
+ class DataArguments:
62
+ dataset_name: str = field(
63
+ default=None, metadata={"help": "huggingface dataset name"}
64
+ )
65
+ subset_name: List[str] = field(
66
+ default=None, metadata={"help": "Useful for datasets with subsets"}
67
+ )
68
+ dataset_split: str = field(
69
+ default='train', metadata={"help": "dataset split"}
70
+ )
71
+ num_sample_per_subset: int = field(
72
+ default=100, metadata={"help": "number of training samples per subset"}
73
+ )
74
+ image_dir: str = field(
75
+ default=None, metadata={"help": "Image directory path"}
76
+ )
77
+ encode_output_path: str = field(
78
+ default=None, metadata={"help": "encode output path"}
79
+ )
80
+ max_len: int = field(
81
+ default=128, metadata={"help": "The maximum total input sequence length after tokenization."},
82
+ )
83
+ embedding_type: str = field(
84
+ default="", metadata={"help": "embedding type"}
85
+ )
86
+
87
+
88
+ @dataclass
89
+ class TrainingArguments(TrainingArguments):
90
+ image_encoder_freeze: bool = field(
91
+ default=False, metadata={"help": "huggingface model name"}
92
+ )
93
+ output_dir: str = field(
94
+ default=None, metadata={"help": "directory for saving trained models"}
95
+ )
96
+ project_name: str = field(
97
+ default=None, metadata={"help": "project name"}
98
+ )
99
+
100
+ logging_steps: int = field(
101
+ default=1, metadata={"help": "logging steps"}
102
+ )
103
+ num_train_epochs: int = field(
104
+ default=1, metadata={"help": "number of training epochs"}
105
+ )
106
+ grad_cache: bool = field(
107
+ default=False, metadata={"help": "Use gradient cache update"})
108
+ gc_q_chunk_size: int = field(
109
+ default=2, metadata={"help": "query side subset size"})
110
+ gc_p_chunk_size: int = field(
111
+ default=2, metadata={"help": "target side subset size"})
112
+
113
+
114
+ @dataclass
115
+ class MTEBArguments:
116
+ task_types: List[str] = field(
117
+ default=None, metadata={"help": ""}
118
+ )
119
+ tasks: List[str] = field(
120
+ default=None, metadata={"help": ""}
121
+ )