diff --git a/groundingLMM/LLaVA/.devcontainer/Dockerfile b/groundingLMM/LLaVA/.devcontainer/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..035e14937b3b57125ac54463770dfda25fbff6bf --- /dev/null +++ b/groundingLMM/LLaVA/.devcontainer/Dockerfile @@ -0,0 +1,53 @@ +FROM mcr.microsoft.com/devcontainers/base:ubuntu-20.04 + +SHELL [ "bash", "-c" ] + +# update apt and install packages +RUN apt update && \ + apt install -yq \ + ffmpeg \ + dkms \ + build-essential + +# add user tools +RUN sudo apt install -yq \ + jq \ + jp \ + tree \ + tldr + +# add git-lfs and install +RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash && \ + sudo apt-get install -yq git-lfs && \ + git lfs install + +############################################ +# Setup user +############################################ + +USER vscode + +# install azcopy, a tool to copy to/from blob storage +# for more info: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs-upload#upload-a-file +RUN cd /tmp && \ + wget https://azcopyvnext.azureedge.net/release20230123/azcopy_linux_amd64_10.17.0.tar.gz && \ + tar xvf azcopy_linux_amd64_10.17.0.tar.gz && \ + mkdir -p ~/.local/bin && \ + mv azcopy_linux_amd64_10.17.0/azcopy ~/.local/bin && \ + chmod +x ~/.local/bin/azcopy && \ + rm -rf azcopy_linux_amd64* + +# Setup conda +RUN cd /tmp && \ + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash ./Miniconda3-latest-Linux-x86_64.sh -b && \ + rm ./Miniconda3-latest-Linux-x86_64.sh + +# Install dotnet +RUN cd /tmp && \ + wget https://dot.net/v1/dotnet-install.sh && \ + chmod +x dotnet-install.sh && \ + ./dotnet-install.sh --channel 7.0 && \ + ./dotnet-install.sh --channel 3.1 && \ + rm ./dotnet-install.sh + diff --git a/groundingLMM/LLaVA/.devcontainer/devcontainer.env b/groundingLMM/LLaVA/.devcontainer/devcontainer.env new file mode 100644 index 0000000000000000000000000000000000000000..4cf3a49c16e1113f4d941b409bb9c7bea6c90fe0 --- /dev/null +++ b/groundingLMM/LLaVA/.devcontainer/devcontainer.env @@ -0,0 +1,2 @@ +SAMPLE_ENV_VAR1="Sample Value" +SAMPLE_ENV_VAR2=332431bf-68bf \ No newline at end of file diff --git a/groundingLMM/LLaVA/.devcontainer/devcontainer.json b/groundingLMM/LLaVA/.devcontainer/devcontainer.json new file mode 100644 index 0000000000000000000000000000000000000000..67f6ca20e17d808e3e77b806ad8a988b120f40a9 --- /dev/null +++ b/groundingLMM/LLaVA/.devcontainer/devcontainer.json @@ -0,0 +1,71 @@ +{ + "name": "LLaVA", + "build": { + "dockerfile": "Dockerfile", + "context": "..", + "args": {} + }, + "features": { + "ghcr.io/devcontainers/features/docker-in-docker:2": {}, + "ghcr.io/devcontainers/features/azure-cli:1": {}, + "ghcr.io/azure/azure-dev/azd:0": {}, + "ghcr.io/devcontainers/features/powershell:1": {}, + "ghcr.io/devcontainers/features/common-utils:2": {}, + "ghcr.io/devcontainers-contrib/features/zsh-plugins:0": {}, + }, + // "forwardPorts": [], + "postCreateCommand": "bash ./.devcontainer/postCreateCommand.sh", + "customizations": { + "vscode": { + "settings": { + "python.analysis.autoImportCompletions": true, + "python.analysis.autoImportUserSymbols": true, + "python.defaultInterpreterPath": "~/miniconda3/envs/llava/bin/python", + "python.formatting.provider": "yapf", + "python.linting.enabled": true, + "python.linting.flake8Enabled": true, + "isort.check": true, + "dev.containers.copyGitConfig": true, + "terminal.integrated.defaultProfile.linux": "zsh", + 
"terminal.integrated.profiles.linux": { + "zsh": { + "path": "/usr/bin/zsh" + }, + } + }, + "extensions": [ + "aaron-bond.better-comments", + "eamodio.gitlens", + "EditorConfig.EditorConfig", + "foxundermoon.shell-format", + "GitHub.copilot-chat", + "GitHub.copilot-labs", + "GitHub.copilot", + "lehoanganh298.json-lines-viewer", + "mhutchie.git-graph", + "ms-azuretools.vscode-docker", + "ms-dotnettools.dotnet-interactive-vscode", + "ms-python.flake8", + "ms-python.isort", + "ms-python.python", + "ms-python.vscode-pylance", + "njpwerner.autodocstring", + "redhat.vscode-yaml", + "stkb.rewrap", + "yzhang.markdown-all-in-one", + ] + } + }, + "mounts": [], + "runArgs": [ + "--gpus", + "all", + // "--ipc", + // "host", + "--ulimit", + "memlock=-1", + "--env-file", + ".devcontainer/devcontainer.env" + ], + // "remoteUser": "root" +} diff --git a/groundingLMM/LLaVA/.devcontainer/postCreateCommand.sh b/groundingLMM/LLaVA/.devcontainer/postCreateCommand.sh new file mode 100644 index 0000000000000000000000000000000000000000..b32449207ce184a0d13eac79fbd83235acd451db --- /dev/null +++ b/groundingLMM/LLaVA/.devcontainer/postCreateCommand.sh @@ -0,0 +1,45 @@ +git config --global safe.directory '*' +git config --global core.editor "code --wait" +git config --global pager.branch false + +# Set AZCOPY concurrency to auto +echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.zshrc +echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.bashrc + +# Activate conda by default +echo ". /home/vscode/miniconda3/bin/activate" >> ~/.zshrc +echo ". /home/vscode/miniconda3/bin/activate" >> ~/.bashrc + +# Use llava environment by default +echo "conda activate llava" >> ~/.zshrc +echo "conda activate llava" >> ~/.bashrc + +# Add dotnet to PATH +echo 'export PATH="$PATH:$HOME/.dotnet"' >> ~/.bashrc +echo 'export PATH="$PATH:$HOME/.dotnet"' >> ~/.zshrc + +# Create and activate llava environment +source /home/vscode/miniconda3/bin/activate +conda create -y -q -n llava python=3.10 +conda activate llava + +# Install Nvidia Cuda Compiler +conda install -y -c nvidia cuda-compiler + +pip install pre-commit==3.0.2 + +# Install package locally +pip install --upgrade pip # enable PEP 660 support +pip install -e . + +# Install additional packages for training +pip install -e ".[train]" +pip install flash-attn --no-build-isolation + +# Download checkpoints to location outside of the repo +git clone https://huggingface.co/liuhaotian/llava-v1.5-7b ~/llava-v1.5-7b + +# Commented because it is unlikely for users to have enough local GPU memory to load the model +# git clone https://huggingface.co/liuhaotian/llava-v1.5-13b ~/llava-v1.5-13b + +echo "postCreateCommand.sh COMPLETE!" diff --git a/groundingLMM/LLaVA/docs/Customize_Component.md b/groundingLMM/LLaVA/docs/Customize_Component.md new file mode 100644 index 0000000000000000000000000000000000000000..e99a60879920b389799fb3a0baf1fd864ee0bccc --- /dev/null +++ b/groundingLMM/LLaVA/docs/Customize_Component.md @@ -0,0 +1,20 @@ +# Customize Components in LLaVA + +This is an initial guide on how to replace the LLMs, visual encoders, etc. with your choice of components. + +## LLM + +It is quite simple to swap out LLaMA to any other LLMs. You can refer to our implementation of [`llava_llama.py`](https://raw.githubusercontent.com/haotian-liu/LLaVA/main/llava/model/language_model/llava_llama.py) for an example of how to replace the LLM. + +Although it may seem that it still needs ~100 lines of code, most of them are copied from the original `llama.py` from HF. 
The only part that is different is to insert some lines for processing the multimodal inputs. + +In `forward` function, you can see that we call `self.prepare_inputs_labels_for_multimodal` to process the multimodal inputs. This function is defined in `LlavaMetaForCausalLM` and you just need to insert it into the `forward` function of your LLM. + +In `prepare_inputs_for_generation` function, you can see that we add `images` to the `model_inputs`. This is because we need to pass the images to the LLM during generation. + +These are basically all the changes you need to make to replace the LLM. + +## Visual Encoder + +You can check out [`clip_encoder.py`](https://github.com/haotian-liu/LLaVA/blob/main/llava/model/multimodal_encoder/clip_encoder.py) on how we implement the CLIP visual encoder. + diff --git a/groundingLMM/LLaVA/docs/Data.md b/groundingLMM/LLaVA/docs/Data.md new file mode 100644 index 0000000000000000000000000000000000000000..a13877451bae7a6e774258a2f1753bbecb32b890 --- /dev/null +++ b/groundingLMM/LLaVA/docs/Data.md @@ -0,0 +1,29 @@ +## Data + +| Data file name | Size | +| --- | ---: | +| [llava_instruct_150k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_instruct_150k.json) | 229 MB | +| [llava_instruct_80k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_instruct_80k.json) | 229 MB | +| [conversation_58k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/conversation_58k.json) | 126 MB | +| [detail_23k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/detail_23k.json) | 20.5 MB | +| [complex_reasoning_77k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/complex_reasoning_77k.json) | 79.6 MB | + +### Pretraining Dataset +The pretraining dataset used in this release is a subset of CC-3M dataset, filtered with a more balanced concept coverage distribution. Please see [here](https://huggingface.co/datasets/liuhaotian/LLaVA-CC3M-Pretrain-595K) for a detailed description of the dataset structure and how to download the images. + +If you already have CC-3M dataset on your disk, the image names follow this format: `GCC_train_000000000.jpg`. You may edit the `image` field correspondingly if necessary. + +| Data | Chat File | Meta Data | Size | +| --- | --- | --- | ---: | +| CC-3M Concept-balanced 595K | [chat.json](https://huggingface.co/datasets/liuhaotian/LLaVA-CC3M-Pretrain-595K/blob/main/chat.json) | [metadata.json](https://huggingface.co/datasets/liuhaotian/LLaVA-CC3M-Pretrain-595K/blob/main/metadata.json) | 211 MB +| LAION/CC/SBU BLIP-Caption Concept-balanced 558K | [blip_laion_cc_sbu_558k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/blob/main/blip_laion_cc_sbu_558k.json) | [metadata.json](#) | 181 MB + +**Important notice**: Upon the request from the community, as ~15% images of the original CC-3M dataset are no longer accessible, we upload [`images.zip`](https://huggingface.co/datasets/liuhaotian/LLaVA-CC3M-Pretrain-595K/blob/main/images.zip) for better reproducing our work in research community. It must not be used for any other purposes. The use of these images must comply with the CC-3M license. This may be taken down at any time when requested by the original CC-3M dataset owner or owners of the referenced images. 
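+
+If you keep your own copy of CC-3M on disk (see the note on the `GCC_train_000000000.jpg` naming above), the sketch below shows one way to point the `image` fields of `chat.json` at a local image directory; the file and directory names are hypothetical, so adjust them to your setup:
+
+```python
+import json
+
+# Hypothetical paths; adjust to wherever chat.json and your CC-3M images live.
+with open("chat.json") as f:
+    samples = json.load(f)
+
+for sample in samples:
+    # e.g. "GCC_train_000000000.jpg" -> "cc3m_images/GCC_train_000000000.jpg"
+    sample["image"] = f"cc3m_images/{sample['image']}"
+
+with open("chat_local.json", "w") as f:
+    json.dump(samples, f)
+```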
+ +### GPT-4 Prompts + +We provide our prompts and few-shot samples for GPT-4 queries, to better facilitate research in this domain. Please check out the [`prompts`](https://github.com/haotian-liu/LLaVA/tree/main/playground/data/prompts) folder for three kinds of questions: conversation, detail description, and complex reasoning. + +They are organized in a format of `system_message.txt` for system message, pairs of `abc_caps.txt` for few-shot sample user input, and `abc_conv.txt` for few-shot sample reference output. + +Note that you may find them in different format. For example, `conversation` is in `jsonl`, and detail description is answer-only. The selected format in our preliminary experiments works slightly better than a limited set of alternatives that we tried: `jsonl`, more natural format, answer-only. If interested, you may try other variants or conduct more careful study in this. Contributions are welcomed! diff --git a/groundingLMM/LLaVA/docs/Evaluation.md b/groundingLMM/LLaVA/docs/Evaluation.md new file mode 100644 index 0000000000000000000000000000000000000000..4bc49735c0c8f6eebb498b7ff8cb93262e1cd5cc --- /dev/null +++ b/groundingLMM/LLaVA/docs/Evaluation.md @@ -0,0 +1,167 @@ +# Evaluation + +In LLaVA-1.5, we evaluate models on a diverse set of 12 benchmarks. To ensure the reproducibility, we evaluate the models with greedy decoding. We do not evaluate using beam search to make the inference process consistent with the chat demo of real-time outputs. + +Currently, we mostly utilize the official toolkit or server for the evaluation. + +## Evaluate on Custom Datasets + +You can evaluate LLaVA on your custom datasets by converting your dataset to LLaVA's jsonl format, and evaluate using [`model_vqa.py`](https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/model_vqa.py). + +Below we provide a general guideline for evaluating datasets with some common formats. + +1. Short-answer (e.g. VQAv2, MME). + +``` + +Answer the question using a single word or phrase. +``` + +2. Option-only for multiple-choice (e.g. MMBench, SEED-Bench). + +``` + +A. +B. +C. +D. +Answer with the option's letter from the given choices directly. +``` + +3. Natural QA (e.g. LLaVA-Bench, MM-Vet). + +No postprocessing is needed. + +## Scripts + +Before preparing task-specific data, **you MUST first download [eval.zip](https://drive.google.com/file/d/1atZSBBrAX54yYpxtVVW33zFvcnaHeFPy/view?usp=sharing)**. It contains custom annotations, scripts, and the prediction files with LLaVA v1.5. Extract to `./playground/data/eval`. This also provides a general structure for all datasets. + +### VQAv2 + +1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and put it under `./playground/data/eval/vqav2`. +2. Multi-GPU inference. +```Shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/vqav2.sh +``` +3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `./playground/data/eval/vqav2/answers_upload`. + +### GQA + +1. Download the [data](https://cs.stanford.edu/people/dorarad/gqa/download.html) and [evaluation scripts](https://cs.stanford.edu/people/dorarad/gqa/evaluate.html) following the official instructions and put under `./playground/data/eval/gqa/data`. You may need to modify `eval.py` as [this](https://gist.github.com/haotian-liu/db6eddc2a984b4cbcc8a7f26fd523187) due to the missing assets in the GQA v1.2 release. +2. Multi-GPU inference. 
+```Shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/gqa.sh +``` + +### VisWiz + +1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `./playground/data/eval/vizwiz`. +2. Single-GPU inference. +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/vizwiz.sh +``` +3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/2185/my-submission): `./playground/data/eval/vizwiz/answers_upload`. + +### ScienceQA + +1. Under `./playground/data/eval/scienceqa`, download `images`, `pid_splits.json`, `problems.json` from the `data/scienceqa` folder of the ScienceQA [repo](https://github.com/lupantech/ScienceQA). +2. Single-GPU inference and evaluate. +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/sqa.sh +``` + +### TextVQA + +1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and [images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and extract to `./playground/data/eval/textvqa`. +2. Single-GPU inference and evaluate. +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/textvqa.sh +``` + +### POPE + +1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put under `./playground/data/eval/pope`. +2. Single-GPU inference and evaluate. +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/pope.sh +``` + +### MME + +1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation). +2. Downloaded images to `MME_Benchmark_release_version`. +3. put the official `eval_tool` and `MME_Benchmark_release_version` under `./playground/data/eval/MME`. +4. Single-GPU inference and evaluate. +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mme.sh +``` + +### MMBench + +1. Download [`mmbench_dev_20230712.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put under `./playground/data/eval/mmbench`. +2. Single-GPU inference. +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench.sh +``` +3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `./playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712`. + +### MMBench-CN + +1. Download [`mmbench_dev_cn_20231003.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_cn_20231003.tsv) and put under `./playground/data/eval/mmbench`. +2. Single-GPU inference. +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench_cn.sh +``` +3. Submit the results to the evaluation server: `./playground/data/eval/mmbench/answers_upload/mmbench_dev_cn_20231003`. + + +### SEED-Bench + +1. Following the official [instructions](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md) to download the images and the videos. Put images under `./playground/data/eval/seed_bench/SEED-Bench-image`. +2. Extract the video frame in the middle from the downloaded videos, and put them under `./playground/data/eval/seed_bench/SEED-Bench-video-image`. We provide our script `extract_video_frames.py` modified from the official one. +3. Multiple-GPU inference and evaluate. +```Shell +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/seed.sh +``` +4. 
Optionally, submit the results to the leaderboard: `./playground/data/eval/seed_bench/answers_upload` using the official jupyter notebook. + +### LLaVA-Bench-in-the-Wild + +1. Extract contents of [`llava-bench-in-the-wild`](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) to `./playground/data/eval/llava-bench-in-the-wild`. +2. Single-GPU inference and evaluate. +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/llavabench.sh +``` + +### MM-Vet + +1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `./playground/data/eval/mmvet`. +2. Single-GPU inference. +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh +``` +3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official jupyter notebook. + +## More Benchmarks + +Below are awesome benchmarks for multimodal understanding from the research community, that are not initially included in the LLaVA-1.5 release. + +### Q-Bench + +1. Download [`llvisionqa_dev.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/llvisionqa_dev.json) (for `dev`-subset) and [`llvisionqa_test.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/llvisionqa_test.json) (for `test`-subset). Put them under `./playground/data/eval/qbench`. +2. Download and extract [images](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/images_llvisionqa.tar) and put all the images directly under `./playground/data/eval/qbench/images_llviqionqa`. +3. Single-GPU inference (change `dev` to `test` for evaluation on test set). +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/qbench.sh dev +``` +4. Submit the results by instruction [here](https://github.com/VQAssessment/Q-Bench#option-1-submit-results): `./playground/data/eval/qbench/llvisionqa_dev_answers.jsonl`. + +### Chinese-Q-Bench + +1. Download [`质衡-问答-验证集.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/%E8%B4%A8%E8%A1%A1-%E9%97%AE%E7%AD%94-%E9%AA%8C%E8%AF%81%E9%9B%86.json) (for `dev`-subset) and [`质衡-问答-测试集.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/%E8%B4%A8%E8%A1%A1-%E9%97%AE%E7%AD%94-%E6%B5%8B%E8%AF%95%E9%9B%86.json) (for `test`-subset). Put them under `./playground/data/eval/qbench`. +2. Download and extract [images](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/images_llvisionqa.tar) and put all the images directly under `./playground/data/eval/qbench/images_llviqionqa`. +3. Single-GPU inference (change `dev` to `test` for evaluation on test set). +```Shell +CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/qbench_zh.sh dev +``` +4. Submit the results by instruction [here](https://github.com/VQAssessment/Q-Bench#option-1-submit-results): `./playground/data/eval/qbench/llvisionqa_zh_dev_answers.jsonl`. diff --git a/groundingLMM/LLaVA/docs/Finetune_Custom_Data.md b/groundingLMM/LLaVA/docs/Finetune_Custom_Data.md new file mode 100644 index 0000000000000000000000000000000000000000..60baadaaef58ba96987f515b62caebf60a75dd2c --- /dev/null +++ b/groundingLMM/LLaVA/docs/Finetune_Custom_Data.md @@ -0,0 +1,37 @@ +# Finetune LLaVA on Custom Datasets + +## Dataset Format + +Convert your data to a JSON file of a List of all samples. 
Sample metadata should contain `id` (a unique identifier), `image` (the path to the image), and `conversations` (the conversation data between human and AI). + +A sample JSON for finetuning LLaVA for generating tag-style captions for Stable Diffusion: + +```json +[ + { + "id": "997bb945-628d-4724-b370-b84de974a19f", + "image": "part-000001/997bb945-628d-4724-b370-b84de974a19f.jpg", + "conversations": [ + { + "from": "human", + "value": "\nWrite a prompt for Stable Diffusion to generate this image." + }, + { + "from": "gpt", + "value": "a beautiful painting of chernobyl by nekro, pascal blanche, john harris, greg rutkowski, sin jong hun, moebius, simon stalenhag. in style of cg art. ray tracing. cel shading. hyper detailed. realistic. ue 5. maya. octane render. " + }, + ] + }, + ... +] +``` + +## Command + +If you have a limited task-specific data, we recommend finetuning from LLaVA checkpoints with LoRA following this [script](https://github.com/haotian-liu/LLaVA/blob/main/scripts/v1_5/finetune_task_lora.sh). + +If the amount of the task-specific data is sufficient, you can also finetune from LLaVA checkpoints with full-model finetuning following this [script](https://github.com/haotian-liu/LLaVA/blob/main/scripts/v1_5/finetune_task.sh). + +You may need to adjust the hyperparameters to fit each specific dataset and your hardware constraint. + + diff --git a/groundingLMM/LLaVA/docs/Intel.md b/groundingLMM/LLaVA/docs/Intel.md new file mode 100644 index 0000000000000000000000000000000000000000..c759e4098aa06f89d04199182702176aa4c64b12 --- /dev/null +++ b/groundingLMM/LLaVA/docs/Intel.md @@ -0,0 +1,7 @@ +# Intel Platforms + +* Support [Intel GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) +* Support [Intel CPU Sapphire Rapides](https://ark.intel.com/content/www/us/en/ark/products/codename/126212/products-formerly-sapphire-rapids.html) +* Based on [Intel Extension for Pytorch](https://intel.github.io/intel-extension-for-pytorch) + +More details in [**intel branch**](https://github.com/haotian-liu/LLaVA/tree/intel/docs/intel) diff --git a/groundingLMM/LLaVA/docs/LLaVA_Bench.md b/groundingLMM/LLaVA/docs/LLaVA_Bench.md new file mode 100644 index 0000000000000000000000000000000000000000..643fee99cd6252e2f53353b9744f3ad392e5db4f --- /dev/null +++ b/groundingLMM/LLaVA/docs/LLaVA_Bench.md @@ -0,0 +1,31 @@ +# LLaVA-Bench [[Download](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild)] + +**-Introduction-** Large commercial multimodal chatbots have been released in this week, including +- [Multimodal Bing-Chat by Microsoft](https://blogs.bing.com/search/july-2023/Bing-Chat-Enterprise-announced,-multimodal-Visual-Search-rolling-out-to-Bing-Chat) (July 18, 2023) +- [Multimodal Bard by Google](https://bard.google.com/). + +These chatbots are presumably supported by proprietary large multimodal models (LMM). Compared with the open-source LMM such as LLaVA, proprietary LMM represent the scaling success upperbound of the current SoTA techniques. They share the goal of developing multimodal chatbots that follow human intents to complete various daily-life visual tasks in the wild. While it remains less explored how to evaluate multimodal chat ability, it provides useful feedback to study open-source LMMs against the commercial multimodal chatbots. 
In addition to the *LLaVA-Bench (COCO)* dataset we used to develop the early versions of LLaVA, we are releasing [*LLaVA-Bench (In-the-Wild)*](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) to the community for the public use. + +## LLaVA-Bench (In-the-Wild *[Ongoing work]*) + +To evaluate the model's capability in more challenging tasks and generalizability to novel domains, we collect a diverse set of 24 images with 60 questions in total, including indoor and outdoor scenes, memes, paintings, sketches, etc, and associate each image with a highly-detailed and manually-curated description and a proper selection of questions. Such design also assesses the model's robustness to different prompts. In this release, we also categorize questions into three categories: conversation (simple QA), detailed description, and complex reasoning. We continue to expand and improve the diversity of the LLaVA-Bench (In-the-Wild). We manually query Bing-Chat and Bard to get the responses. + +### Results + +The score is measured by comparing against a reference answer generated by text-only GPT-4. It is generated by feeding the question, along with the ground truth image annotations as the context. A text-only GPT-4 evaluator rates both answers. We query GPT-4 by putting the reference answer first, and then the answer generated by the candidate model. We upload images at their original resolution to Bard and Bing-Chat to obtain the results. + +| Approach | Conversation | Detail | Reasoning | Overall | +|----------------|--------------|--------|-----------|---------| +| Bard-0718 | 83.7 | 69.7 | 78.7 | 77.8 | +| Bing-Chat-0629 | 59.6 | 52.2 | 90.1 | 71.5 | +| LLaVA-13B-v1-336px-0719 (beam=1) | 64.3 | 55.9 | 81.7 | 70.1 | +| LLaVA-13B-v1-336px-0719 (beam=5) | 68.4 | 59.9 | 84.3 | 73.5 | + +Note that Bard sometimes refuses to answer questions about images containing humans, and Bing-Chat blurs the human faces in the images. We also provide the benchmark score for the subset without humans. + +| Approach | Conversation | Detail | Reasoning | Overall | +|----------------|--------------|--------|-----------|---------| +| Bard-0718 | 94.9 | 74.3 | 84.3 | 84.6 | +| Bing-Chat-0629 | 55.8 | 53.6 | 93.5 | 72.6 | +| LLaVA-13B-v1-336px-0719 (beam=1) | 62.2 | 56.4 | 82.2 | 70.0 | +| LLaVA-13B-v1-336px-0719 (beam=5) | 65.6 | 61.7 | 85.0 | 73.6 | diff --git a/groundingLMM/LLaVA/docs/LLaVA_from_LLaMA2.md b/groundingLMM/LLaVA/docs/LLaVA_from_LLaMA2.md new file mode 100644 index 0000000000000000000000000000000000000000..214754bf2f206c2d95ff744429d49420e2745d19 --- /dev/null +++ b/groundingLMM/LLaVA/docs/LLaVA_from_LLaMA2.md @@ -0,0 +1,29 @@ +# LLaVA (based on Llama 2 LLM, Preview) + +*NOTE: This is a technical preview. We are still running hyperparameter search, and will release the final model soon. If you'd like to contribute to this, please contact us.* + +:llama: **-Introduction-** [Llama 2 is an open-source LLM released by Meta AI](https://about.fb.com/news/2023/07/llama-2/) today (July 18, 2023). Compared with its early version [Llama 1](https://ai.meta.com/blog/large-language-model-llama-meta-ai/), Llama 2 is more favored in ***stronger language performance***, ***longer context window***, and importantly ***commercially usable***! While Llama 2 is changing the LLM market landscape in the language space, its multimodal ability remains unknown. We quickly develop the LLaVA variant based on the latest Llama 2 checkpoints, and release it to the community for the public use. 
+ +You need to apply for and download the latest Llama 2 checkpoints to start your own training (apply [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/)) + + +## Training + +Please checkout [`pretrain.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/pretrain.sh), [`finetune.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/finetune.sh), [`finetune_lora.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/finetune_lora.sh). + +## LLaVA (based on Llama 2), What is different? + +:volcano: How is the new LLaVA based on Llama 2 different from Llama 1? The comparisons of the training process are described: +- **Pre-training**. The pre-trained base LLM is changed from Llama 1 to Llama 2 +- **Language instruction-tuning**. The previous LLaVA model starts with Vicuna, which is instruct tuned on ShareGPT data from Llama 1; The new LLaVA model starts with Llama 2 Chat, which is an instruct tuned checkpoint on dialogue data from Llama 2. +- **Multimodal instruction-tuning**. The same LLaVA-Lighting process is applied. + + +### Results + +- Llama 2 is better at following the instructions of role playing; Llama 2 fails in following the instructions of translation +- The quantitative evaluation on [LLaVA-Bench](https://github.com/haotian-liu/LLaVA/blob/main/docs/LLaVA_Bench.md) demonstrates on-par performance between Llama 2 and Llama 1 in LLaVA's multimodal chat ability. + + + + diff --git a/groundingLMM/LLaVA/docs/LoRA.md b/groundingLMM/LLaVA/docs/LoRA.md new file mode 100644 index 0000000000000000000000000000000000000000..bed25f57d0aaa8c37f63703f6f641999b02b1b3e --- /dev/null +++ b/groundingLMM/LLaVA/docs/LoRA.md @@ -0,0 +1,46 @@ +# LLaVA (LoRA, Preview) + +NOTE: This is a technical preview, and is not yet ready for production use. We are still running hyperparameter search for the LoRA model, and will release the final model soon. If you'd like to contribute to this, please contact us. + +You need latest code base for LoRA support (instructions [here](https://github.com/haotian-liu/LLaVA#upgrade-to-latest-code-base)) + +## Demo (Web UI) + +Please execute each of the commands below one by one (after the previous one has finished). The commands are the same as launching other demos except for an additional `--model-base` flag to specify the base model to use. Please make sure the base model corresponds to the LoRA checkpoint that you are using. For this technical preview, you need Vicuna v1.1 (7B) checkpoint (if you do not have that already, follow the instructions [here](https://github.com/lm-sys/FastChat#vicuna-weights)). + +#### Launch a controller +```Shell +python -m llava.serve.controller --host 0.0.0.0 --port 10000 +``` + +#### Launch a gradio web server. +```Shell +python -m llava.serve.gradio_web_server --controller http://localhost:10000 --model-list-mode reload +``` +You just launched the Gradio web interface. Now, you can open the web interface with the URL printed on the screen. You may notice that there is no model in the model list. Do not worry, as we have not launched any model worker yet. It will be automatically updated when you launch a model worker. 
+ +#### Launch a model worker +```Shell +python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path liuhaotian/llava-vicuna-7b-v1.1-lcs_558k-instruct_80k_3e-lora-preview-alpha --model-base /path/to/vicuna-v1.1 +``` +Wait until the process finishes loading the model and you see "Uvicorn running on ...". Now, refresh your Gradio web UI, and you will see the model you just launched in the model list. + +You can launch as many workers as you want, and compare between different model checkpoints in the same Gradio interface. Please keep the `--controller` the same, and modify the `--port` and `--worker` to a different port number for each worker. + + +## Training + +Please see sample training scripts for [LoRA](https://github.com/haotian-liu/LLaVA/blob/main/scripts/finetune_lora.sh) and [QLoRA](https://github.com/haotian-liu/LLaVA/blob/main/scripts/finetune_qlora.sh). + +We provide sample DeepSpeed configs, [`zero3.json`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/zero3.json) is more like PyTorch FSDP, and [`zero3_offload.json`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/zero3_offload.json) can further save memory consumption by offloading parameters to CPU. `zero3.json` is usually faster than `zero3_offload.json` but requires more GPU memory, therefore, we recommend trying `zero3.json` first, and if you run out of GPU memory, try `zero3_offload.json`. You can also tweak the `per_device_train_batch_size` and `gradient_accumulation_steps` in the config to save memory, and just to make sure that `per_device_train_batch_size` and `gradient_accumulation_steps` remains the same. + +If you are having issues with ZeRO-3 configs, and there are enough VRAM, you may try [`zero2.json`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/zero2.json). This consumes slightly more memory than ZeRO-3, and behaves more similar to PyTorch FSDP, while still supporting parameter-efficient tuning. + +## Create Merged Checkpoints + +```Shell +python scripts/merge_lora_weights.py \ + --model-path /path/to/lora_model \ + --model-base /path/to/base_model \ + --save-model-path /path/to/merge_model +``` diff --git a/groundingLMM/LLaVA/docs/MODEL_ZOO.md b/groundingLMM/LLaVA/docs/MODEL_ZOO.md new file mode 100644 index 0000000000000000000000000000000000000000..2d870e6c0b8e97dc08d4e1b6a2d4ca0af9185ee1 --- /dev/null +++ b/groundingLMM/LLaVA/docs/MODEL_ZOO.md @@ -0,0 +1,150 @@ +# Model Zoo + +**To Use LLaVA-1.6 checkpoints, your llava package version must be newer than 1.2.0. [Instructions](https://github.com/haotian-liu/LLaVA#upgrade-to-latest-code-base) on how to upgrade.** + +If you are interested in including any other details in Model Zoo, please open an issue :) + +The model weights below are *merged* weights. You do not need to apply delta. The usage of LLaVA checkpoints should comply with the base LLM's model license. 
+ +## LLaVA-v1.6 + +| Version | LLM | Schedule | Checkpoint | MMMU | MathVista | VQAv2 | GQA | VizWiz | SQA | TextVQA | POPE | MME | MM-Bench | MM-Bench-CN | SEED-IMG | LLaVA-Bench-Wild | MM-Vet | +|----------|----------|-----------|-----------|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| LLaVA-1.6 | Vicuna-7B | full_ft-1e | [liuhaotian/llava-v1.6-vicuna-7b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b) | 35.8 | 34.6 | 81.8 | 64.2 | 57.6 | 70.1 | 64.9 | 86.5 | 1519/332 | 67.4 | 60.6 | 70.2 | 81.6 | 43.9 | +| LLaVA-1.6 | Vicuna-13B | full_ft-1e | [liuhaotian/llava-v1.6-vicuna-13b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-13b) | 36.2 | 35.3 | 82.8 | 65.4 | 60.5 | 73.6 | 67.1 | 86.2 | 1575/326 | 70 | 64.4 | 71.9 | 87.3 | 48.4 | +| LLaVA-1.6 | Mistral-7B | full_ft-1e | [liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b) | 35.3 | 37.7 | 82.2 | 64.8 | 60.0 | 72.8 | 65.7 | 86.7 | 1498/321 | 68.7 | 61.2 | 72.2 | 83.2 | 47.3 | +| LLaVA-1.6 | Hermes-Yi-34B | full_ft-1e | [liuhaotian/llava-v1.6-34b](https://huggingface.co/liuhaotian/llava-v1.6-34b) | 51.1 | 46.5 | 83.7 | 67.1 | 63.8 | 81.8 | 69.5 | 87.7 | 1631/397 | 79.3 | 79 | 75.9 | 89.6 | 57.4 | + +*LLaVA-1.6-34B outperforms Gemini Pro on benchmarks like MMMU and MathVista.* + + +## LLaVA-v1.5 + +| Version | Size | Schedule | Checkpoint | VQAv2 | GQA | VizWiz | SQA | TextVQA | POPE | MME | MM-Bench | MM-Bench-CN | SEED | LLaVA-Bench-Wild | MM-Vet | +|----------|----------|-----------|-----------|---|---|---|---|---|---|---|---|---|---|---|---| +| LLaVA-1.5 | 7B | full_ft-1e | [liuhaotian/llava-v1.5-7b](https://huggingface.co/liuhaotian/llava-v1.5-7b) | 78.5 | 62.0 | 50.0 | 66.8 | 58.2 | 85.9 | 1510.7 | 64.3 | 58.3 | 58.6 | 65.4 | 31.1 | +| LLaVA-1.5 | 13B | full_ft-1e | [liuhaotian/llava-v1.5-13b](https://huggingface.co/liuhaotian/llava-v1.5-13b) | 80.0 | 63.3 | 53.6 | 71.6 | 61.3 | 85.9 | 1531.3 | 67.7 | 63.6 | 61.6 | 72.5 | 36.1 | +| LLaVA-1.5 | 7B | lora-1e | [liuhaotian/llava-v1.5-7b-lora](https://huggingface.co/liuhaotian/llava-v1.5-7b-lora) | 79.1 | 63.0 | 47.8 | 68.4 | 58.2 | 86.4 | 1476.9 | 66.1 | 58.9 | 60.1 | 67.9 | 30.2 | +| LLaVA-1.5 | 13B | lora-1e | [liuhaotian/llava-v1.5-13b-lora](https://huggingface.co/liuhaotian/llava-v1.5-13b-lora) | 80.0 | 63.3 | 58.9 | 71.2 | 60.2 | 86.7 | 1541.7 | 68.5 | 61.5 | 61.3 | 69.5 | 38.3 | + +Base model: Vicuna v1.5. Training logs: [wandb](https://api.wandb.ai/links/lht/6orh56wc). + +
+*LLaVA-1.5 achieves SoTA performance across 11 benchmarks.*
+ + +## LLaVA-v1 + +*Note: We recommend using the most capable LLaVA-v1.6 series above for the best performance.* + +| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Finetuning Data | Finetuning schedule | LLaVA-Bench-Conv | LLaVA-Bench-Detail | LLaVA-Bench-Complex | LLaVA-Bench-Overall | Download | +|----------|----------------|---------------|----------------------|-----------------|--------------------|------------------|--------------------|---------------------|---------------------|---------------------| +| Vicuna-13B-v1.3 | CLIP-L-336px | LCS-558K | 1e | LLaVA-Instruct-80K | proj-1e, lora-1e | 64.3 | 55.9 | 81.7 | 70.1 | [LoRA](https://huggingface.co/liuhaotian/llava-v1-0719-336px-lora-vicuna-13b-v1.3) [LoRA-Merged](https://huggingface.co/liuhaotian/llava-v1-0719-336px-lora-merge-vicuna-13b-v1.3) | +| LLaMA-2-13B-Chat | CLIP-L | LCS-558K | 1e | LLaVA-Instruct-80K | full_ft-1e | 56.7 | 58.6 | 80.0 | 67.9 | [ckpt](https://huggingface.co/liuhaotian/llava-llama-2-13b-chat-lightning-preview) | +| LLaMA-2-7B-Chat | CLIP-L | LCS-558K | 1e | LLaVA-Instruct-80K | lora-1e | 51.2 | 58.9 | 71.6 | 62.8 | [LoRA](https://huggingface.co/liuhaotian/llava-llama-2-7b-chat-lightning-lora-preview) | + + +## Projector weights + +These are projector weights we have pretrained. You can use these projector weights for visual instruction tuning. They are just pretrained on image-text pairs and are NOT instruction-tuned, which means they do NOT follow instructions as well as our official models and can output repetitive, lengthy, and garbled outputs. If you want to have nice conversations with LLaVA, use the checkpoints above (LLaVA v1.6). + +NOTE: These projector weights are only compatible with `llava>=1.0.0`. Please check out the latest codebase if your local code version is below v1.0.0. + +NOTE: When you use our pretrained projector for visual instruction tuning, it is very important to use the same base LLM and vision encoder as the one we used for pretraining the projector. Otherwise, the performance will be very poor. 
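+
+As a concrete illustration of the note above, the sketch below (based on the sample `finetune.sh`; use your own paths, and the usual data, output and optimization arguments are omitted) shows the arguments that must agree with one another when starting from the Vicuna-13B-v1.5 / CLIP-L-336px MLP-2x projector:
+
+```Shell
+deepspeed llava/train/train_mem.py \
+    --model_name_or_path lmsys/vicuna-13b-v1.5 \
+    --version v1 \
+    --vision_tower openai/clip-vit-large-patch14-336 \
+    --pretrain_mm_mlp_adapter /path/to/mm_projector.bin \
+    --mm_projector_type mlp2x_gelu
+```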
+ +When using these projector weights to instruction-tune your LMM, please make sure that these options are correctly set as follows, + +```Shell +--mm_use_im_start_end False +--mm_use_im_patch_token False +``` + +| Base LLM | Vision Encoder | Projection | Pretrain Data | Pretraining schedule | Download | +|----------|----------------|---------------|----------------------|----------|----------| +| Vicuna-13B-v1.5 | CLIP-L-336px | MLP-2x | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-v1.5-mlp2x-336px-pretrain-vicuna-13b-v1.5) | +| Vicuna-7B-v1.5 | CLIP-L-336px | MLP-2x | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5) | +| LLaMA-2-13B-Chat | CLIP-L-336px | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-336px-pretrain-llama-2-13b-chat) | +| LLaMA-2-7B-Chat | CLIP-L-336px | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-336px-pretrain-llama-2-7b-chat) | +| LLaMA-2-13B-Chat | CLIP-L | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-pretrain-llama-2-13b-chat) | +| LLaMA-2-7B-Chat | CLIP-L | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-pretrain-llama-2-7b-chat) | +| Vicuna-13B-v1.3 | CLIP-L-336px | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-336px-pretrain-vicuna-13b-v1.3) | +| Vicuna-7B-v1.3 | CLIP-L-336px | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-336px-pretrain-vicuna-7b-v1.3) | +| Vicuna-13B-v1.3 | CLIP-L | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-pretrain-vicuna-13b-v1.3) | +| Vicuna-7B-v1.3 | CLIP-L | Linear | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/llava-pretrain-vicuna-7b-v1.3) | + + +## Science QA Checkpoints + +| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Finetuning Data | Finetuning schedule | Download | +|----------|----------------|---------------|----------------------|-----------------|--------------------|---------------------| +| Vicuna-13B-v1.3 | CLIP-L | LCS-558K | 1e | ScienceQA | full_ft-12e | [ckpt](https://huggingface.co/liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3) | + + +## Legacy Models (merged weights) + +The model weights below are *merged* weights. You do not need to apply delta. The usage of LLaVA checkpoints should comply with the base LLM's model license. + +| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Finetuning Data | Finetuning schedule | Download | +|----------|----------------|---------------|----------------------|-----------------|--------------------|------------------| +| MPT-7B-Chat | CLIP-L | LCS-558K | 1e | LLaVA-Instruct-80K | full_ft-1e | [preview](https://huggingface.co/liuhaotian/LLaVA-Lightning-MPT-7B-preview) | + + +## Legacy Models (delta weights) + +The model weights below are *delta* weights. The usage of LLaVA checkpoints should comply with the base LLM's model license: [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md). + +You can add our delta to the original LLaMA weights to obtain the LLaVA weights. + +Instructions: + +1. 
Get the original LLaMA weights in the huggingface format by following the instructions [here](https://huggingface.co/docs/transformers/main/model_doc/llama). +2. Use the following scripts to get LLaVA weights by applying our delta. It will automatically download delta weights from our Hugging Face account. In the script below, we use the delta weights of [`liuhaotian/LLaVA-7b-delta-v0`](https://huggingface.co/liuhaotian/LLaVA-7b-delta-v0) as an example. It can be adapted for other delta weights by changing the `--delta` argument (and base/target accordingly). + +```bash +python3 -m llava.model.apply_delta \ + --base /path/to/llama-7b \ + --target /output/path/to/LLaVA-7B-v0 \ + --delta liuhaotian/LLaVA-7b-delta-v0 +``` + +| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Finetuning Data | Finetuning schedule | Download | +|----------|----------------|---------------|----------------------|-----------------|--------------------|------------------| +| Vicuna-13B-v1.1 | CLIP-L | CC-595K | 1e | LLaVA-Instruct-158K | full_ft-3e | [delta-weights](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1) | +| Vicuna-7B-v1.1 | CLIP-L | LCS-558K | 1e | LLaVA-Instruct-80K | full_ft-1e | [delta-weights](https://huggingface.co/liuhaotian/LLaVA-Lightning-7B-delta-v1-1) | +| Vicuna-13B-v0 | CLIP-L | CC-595K | 1e | LLaVA-Instruct-158K | full_ft-3e | [delta-weights](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v0) | +| Vicuna-13B-v0 | CLIP-L | CC-595K | 1e | ScienceQA | full_ft-12e | [delta-weights](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v0-science_qa) | +| Vicuna-7B-v0 | CLIP-L | CC-595K | 1e | LLaVA-Instruct-158K | full_ft-3e | [delta-weights](https://huggingface.co/liuhaotian/LLaVA-7b-delta-v0) | + + + +## Legacy Projector weights + +The following projector weights are deprecated, and the support for them may be removed in the future. They do not support zero-shot inference. Please use the projector weights in the [table above](#projector-weights) if possible. + +**NOTE**: When you use our pretrained projector for visual instruction tuning, it is very important to **use the same base LLM and vision encoder** as the one we used for pretraining the projector. Otherwise, the performance will be very bad. 
+ +When using these projector weights to instruction tune your LMM, please make sure that these options are correctly set as follows, + +```Shell +--mm_use_im_start_end True +--mm_use_im_patch_token False +``` + +| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Download | +|----------|----------------|---------------|----------------------|----------| +| Vicuna-7B-v1.1 | CLIP-L | LCS-558K | 1e | [projector](https://huggingface.co/liuhaotian/LLaVA-Pretrained-Projectors/blob/main/LLaVA-7b-pretrain-projector-v1-1-LCS-558K-blip_caption.bin) | +| Vicuna-13B-v0 | CLIP-L | CC-595K | 1e | [projector](https://huggingface.co/liuhaotian/LLaVA-Pretrained-Projectors/blob/main/LLaVA-13b-pretrain-projector-v0-CC3M-595K-original_caption.bin) | +| Vicuna-7B-v0 | CLIP-L | CC-595K | 1e | [projector](https://huggingface.co/liuhaotian/LLaVA-Pretrained-Projectors/blob/main/LLaVA-7b-pretrain-projector-v0-CC3M-595K-original_caption.bin) | + +When using these projector weights to instruction tune your LMM, please make sure that these options are correctly set as follows, + +```Shell +--mm_use_im_start_end False +--mm_use_im_patch_token False +``` + +| Base LLM | Vision Encoder | Pretrain Data | Pretraining schedule | Download | +|----------|----------------|---------------|----------------------|----------| +| Vicuna-13B-v0 | CLIP-L | CC-595K | 1e | [projector](https://huggingface.co/liuhaotian/LLaVA-Pretrained-Projectors/blob/main/LLaVA-13b-pretrain-projector-v0-CC3M-595K-original_caption-no_im_token.bin) | diff --git a/groundingLMM/LLaVA/docs/ScienceQA.md b/groundingLMM/LLaVA/docs/ScienceQA.md new file mode 100644 index 0000000000000000000000000000000000000000..8881c41c67002a3798435b051c9a609dd1c0d506 --- /dev/null +++ b/groundingLMM/LLaVA/docs/ScienceQA.md @@ -0,0 +1,53 @@ +### ScienceQA + +#### Prepare Data +1. Please see ScienceQA [repo](https://github.com/lupantech/ScienceQA) for setting up the dataset. +2. Generate ScienceQA dataset for LLaVA conversation-style format. + +```Shell +python scripts/convert_sqa_to_llava.py \ + convert_to_llava \ + --base-dir /path/to/ScienceQA/data/scienceqa \ + --prompt-format "QCM-LEA" \ + --split {train,val,minival,test,minitest} +``` + +#### Training + +1. Pretraining + +You can download our pretrained projector weights from our [Model Zoo](), or train your own projector weights using [`pretrain.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/pretrain.sh). + +2. Finetuning + +See [`finetune_sqa.sh`](https://github.com/haotian-liu/LLaVA/blob/main/scripts/finetune_sqa.sh). + +#### Evaluation + +1. Multiple-GPU inference +You may evaluate this with multiple GPUs, and concatenate the generated jsonl files. Please refer to our script for [batch evaluation](https://github.com/haotian-liu/LLaVA/blob/main/scripts/sqa_eval_batch.sh) and [results gathering](https://github.com/haotian-liu/LLaVA/blob/main/scripts/sqa_eval_gather.sh). + +2. 
Single-GPU inference + +(a) Generate LLaVA responses on ScienceQA dataset + +```Shell +python -m llava.eval.model_vqa_science \ + --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ + --question-file /path/to/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ + --image-folder /path/to/ScienceQA/data/scienceqa/images/test \ + --answers-file vqa/results/ScienceQA/test_llava-13b.jsonl \ + --conv-mode llava_v1 +``` + +(b) Evaluate the generated responses + +```Shell +python eval_science_qa.py \ + --base-dir /path/to/ScienceQA/data/scienceqa \ + --result-file vqa/results/ScienceQA/test_llava-13b.jsonl \ + --output-file vqa/results/ScienceQA/test_llava-13b_output.json \ + --output-result vqa/results/ScienceQA/test_llava-13b_result.json \ +``` + +For reference, we attach our prediction file [`test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json`](https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/table/results/test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json) and [`test_sqa_llava_13b_v0.json`](https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/table/results/test_sqa_llava_13b_v0.json) for comparison when reproducing our results, as well as for further analysis in detail. diff --git a/groundingLMM/LLaVA/docs/Windows.md b/groundingLMM/LLaVA/docs/Windows.md new file mode 100644 index 0000000000000000000000000000000000000000..355ab81ffa1a73e874f3a8fb85d2742896068d08 --- /dev/null +++ b/groundingLMM/LLaVA/docs/Windows.md @@ -0,0 +1,27 @@ +# Run LLaVA on Windows + +*NOTE: LLaVA on Windows is not fully supported. Currently we only support 16-bit inference. For a more complete support, please use [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install) for now. More functionalities on Windows is to be added soon, stay tuned.* + +## Installation + +1. Clone this repository and navigate to LLaVA folder +```bash +git clone https://github.com/haotian-liu/LLaVA.git +cd LLaVA +``` + +2. Install Package +```Shell +conda create -n llava python=3.10 -y +conda activate llava +python -m pip install --upgrade pip # enable PEP 660 support +pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu117 +pip install -e . +pip uninstall bitsandbytes +``` + +## Run demo + +See instructions [here](https://github.com/haotian-liu/LLaVA#demo). + +Note that quantization (4-bit, 8-bit) is *NOT* supported on Windows. Stay tuned for the 4-bit support on Windows! diff --git a/groundingLMM/LLaVA/docs/macOS.md b/groundingLMM/LLaVA/docs/macOS.md new file mode 100644 index 0000000000000000000000000000000000000000..0008e5e7cf52e99d85388ef7f0f77d76940c8cef --- /dev/null +++ b/groundingLMM/LLaVA/docs/macOS.md @@ -0,0 +1,29 @@ +# Run LLaVA on macOS + +*NOTE: LLaVA on macOS is not fully supported. Currently we only support 16-bit inference. More functionalities on macOS is to be added soon, stay tuned.* + +## Installation + +1. Clone this repository and navigate to LLaVA folder +```bash +git clone https://github.com/haotian-liu/LLaVA.git +cd LLaVA +``` + +2. Install Package +```Shell +conda create -n llava python=3.10 -y +conda activate llava +python -mpip install --upgrade pip # enable PEP 660 support +pip install -e . +pip install torch==2.1.0 torchvision==0.16.0 +pip uninstall bitsandbytes +``` + +## Run demo + +Specify `--device mps` when launching model worker or CLI. + +See instructions [here](https://github.com/haotian-liu/LLaVA#demo). + +Note that quantization (4-bit, 8-bit) is *NOT* supported on macOS. 
Stay tuned for the 4-bit support on macOS! diff --git a/groundingLMM/LLaVA/scripts/convert_gqa_for_eval.py b/groundingLMM/LLaVA/scripts/convert_gqa_for_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..4d46c8b876df618faac548e9b369109d541f4f23 --- /dev/null +++ b/groundingLMM/LLaVA/scripts/convert_gqa_for_eval.py @@ -0,0 +1,18 @@ +import os +import json +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--src", type=str) +parser.add_argument("--dst", type=str) +args = parser.parse_args() + +all_answers = [] +for line_idx, line in enumerate(open(args.src)): + res = json.loads(line) + question_id = res['question_id'] + text = res['text'].rstrip('.').lower() + all_answers.append({"questionId": question_id, "prediction": text}) + +with open(args.dst, 'w') as f: + json.dump(all_answers, f) diff --git a/groundingLMM/LLaVA/scripts/convert_mmvet_for_eval.py b/groundingLMM/LLaVA/scripts/convert_mmvet_for_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..97f5cfb7fb7691ef3921e3e6afc6d82ec54d4c6c --- /dev/null +++ b/groundingLMM/LLaVA/scripts/convert_mmvet_for_eval.py @@ -0,0 +1,18 @@ +import os +import json +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--src", type=str) +parser.add_argument("--dst", type=str) +args = parser.parse_args() + +cur_result = {} + +for line in open(args.src): + data = json.loads(line) + qid = data['question_id'] + cur_result[f'v1_{qid}'] = data['text'] + +with open(args.dst, 'w') as f: + json.dump(cur_result, f, indent=2) diff --git a/groundingLMM/LLaVA/scripts/convert_sqa_to_llava_base_prompt.py b/groundingLMM/LLaVA/scripts/convert_sqa_to_llava_base_prompt.py new file mode 100644 index 0000000000000000000000000000000000000000..b327fcc29eb44d7fe68be35da25bafa0e1d6feba --- /dev/null +++ b/groundingLMM/LLaVA/scripts/convert_sqa_to_llava_base_prompt.py @@ -0,0 +1,334 @@ +def get_question_text(problem): + question = problem['question'] + return question + + +def get_context_text(problem, use_caption): + txt_context = problem['hint'] + img_context = problem['caption'] if use_caption else "" + context = " ".join([txt_context, img_context]).strip() + if context == "": + context = "N/A" + return context + + +def get_choice_text(probelm, options): + choices = probelm['choices'] + choice_list = [] + for i, c in enumerate(choices): + choice_list.append("({}) {}".format(options[i], c)) + choice_txt = " ".join(choice_list) + #print(choice_txt) + return choice_txt + + +def get_answer(problem, options): + return options[problem['answer']] + + +def get_lecture_text(problem): + # \\n: GPT-3 can generate the lecture with more tokens. 
+ lecture = problem['lecture'].replace("\n", "\\n") + return lecture + + +def get_solution_text(problem): + # \\n: GPT-3 can generate the solution with more tokens + solution = problem['solution'].replace("\n", "\\n") + return solution + + +def create_one_example_chatbot(format, question, context, choice, answer, lecture, solution, test_example=True): + + input_format, output_format = format.split("-") + + ## Inputs + if input_format == "CQM": + input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n" + elif input_format == "QCM": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n" + # upper bound experiment + elif input_format == "QCML": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n" + elif input_format == "QCME": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n" + elif input_format == "QCMLE": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n" + + elif input_format == "QCLM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n" + elif input_format == "QCEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n" + elif input_format == "QCLEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n" + + # Outputs + if test_example: + output = "Answer:" + elif output_format == 'A': + output = f"Answer: The answer is {answer}." + + elif output_format == 'AL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution}" + elif output_format == 'AE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture}" + elif output_format == 'ALE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}" + elif output_format == 'AEL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}" + + elif output_format == 'LA': + output = f"Answer: {lecture} The answer is {answer}." + elif output_format == 'EA': + output = f"Answer: {solution} The answer is {answer}." + elif output_format == 'LEA': + output = f"Answer: {lecture} {solution} The answer is {answer}." + elif output_format == 'ELA': + output = f"Answer: {solution} {lecture} The answer is {answer}." + elif output_format == 'LEPA': + output = '' + if len(lecture.strip()) > 0: + output += f"LECTURE: {lecture}\n" + if len(solution.strip()) > 0: + output += f"SOLUTION: {solution}\n" + output += '###\n' + output += f"ANSWER: {answer}." 
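+
+    # Normalize spacing and drop a trailing "BECAUSE:" that is left behind when the
+    # lecture/solution text is empty, then return the prompt/response pair.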
+ + input = input.replace(" ", " ").strip() + output = output.replace(" ", " ").strip() + if input.endswith("BECAUSE:"): + input = input.replace("BECAUSE:", "").strip() + if output.endswith("BECAUSE:"): + output = output.replace("BECAUSE:", "").strip() + return input, output + + +def create_one_example(format, question, context, choice, answer, lecture, solution, test_example=True): + + input_format, output_format = format.split("-") + + ## Inputs + if input_format == "CQM": + input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n" + elif input_format == "QCM": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n" + # upper bound experiment + elif input_format == "QCML": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n" + elif input_format == "QCME": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n" + elif input_format == "QCMLE": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n" + + elif input_format == "QCLM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n" + elif input_format == "QCEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n" + elif input_format == "QCLEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n" + + # Outputs + if test_example: + output = "Answer:" + elif output_format == 'A': + output = f"Answer: The answer is {answer}." + + elif output_format == 'AL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution}" + elif output_format == 'AE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture}" + elif output_format == 'ALE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}" + elif output_format == 'AEL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}" + + elif output_format == 'LA': + output = f"Answer: {lecture} The answer is {answer}." + elif output_format == 'EA': + output = f"Answer: {solution} The answer is {answer}." + elif output_format == 'LEA': + output = f"Answer: {lecture} {solution} The answer is {answer}." + elif output_format == 'ELA': + output = f"Answer: {solution} {lecture} The answer is {answer}." 
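+
+    # Unlike the chatbot variant above, this function joins the prompt and the target
+    # into a single training string before applying the same cleanup.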
+ + text = input + output + text = text.replace(" ", " ").strip() + if text.endswith("BECAUSE:"): + text = text.replace("BECAUSE:", "").strip() + return text + + + +def create_one_example_gpt4(format, question, context, choice, answer, lecture, solution, test_example=True): + + input_format, output_format = format.split("-") + + ## Inputs + if input_format == "CQM": + input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n" + elif input_format == "QCM": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n" + # upper bound experiment + elif input_format == "QCML": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n" + elif input_format == "QCME": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n" + elif input_format == "QCMLE": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n" + + elif input_format == "QCLM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n" + elif input_format == "QCEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n" + elif input_format == "QCLEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n" + + # Outputs + if test_example: + output = "Answer:" + elif output_format == 'A': + output = f"Answer: The answer is {answer}." + + elif output_format == 'AL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution}" + elif output_format == 'AE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture}" + elif output_format == 'ALE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}" + elif output_format == 'AEL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}" + + elif output_format == 'LA': + output = f"Answer: {lecture} The answer is {answer}." + elif output_format == 'EA': + output = f"Answer: {solution} The answer is {answer}." + elif output_format == 'LEA': + output = f"Answer: {lecture} {solution} The answer is {answer}." + elif output_format == 'ELA': + output = f"Answer: {solution} {lecture} The answer is {answer}." 
+ + input = input.replace(" ", " ").strip() + output = output.replace(" ", " ").strip() + if output.endswith("BECAUSE:"): + output = output.replace("BECAUSE:", "").strip() + + user_prompt = {"role": "user", "content": f"Can you explain {input}?"} + assistant_prompt = {"role": "assistant", "content": f"{output}"} + + return user_prompt, assistant_prompt + + +def build_prompt_chatbot(problems, shot_qids, prompt_format, use_caption=False, options=["A", "B", "C", "D", "E"], is_test=False): + examples = {} + + for qid in shot_qids: + question = get_question_text(problems[qid]) + context = get_context_text(problems[qid], use_caption) + choice = get_choice_text(problems[qid], options) + answer = get_answer(problems[qid], options) + lecture = get_lecture_text(problems[qid]).replace('\\n', '\n') + solution = get_solution_text(problems[qid]).replace('\\n', '\n') + + train_example = create_one_example_chatbot(prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=is_test) + examples[qid] = train_example + return examples + + +def build_prompt(problems, shot_qids, test_qid, args): + + examples = [] + + # n-shot training examples + for qid in shot_qids: + question = get_question_text(problems[qid]) + context = get_context_text(problems[qid], args.use_caption) + choice = get_choice_text(problems[qid], args.options) + answer = get_answer(problems[qid], args.options) + lecture = get_lecture_text(problems[qid]) + solution = get_solution_text(problems[qid]) + + train_example = create_one_example(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=False) + examples.append(train_example) + + # test example + question = get_question_text(problems[test_qid]) + context = get_context_text(problems[test_qid], args.use_caption) + choice = get_choice_text(problems[test_qid], args.options) + answer = get_answer(problems[test_qid], args.options) + lecture = get_lecture_text(problems[test_qid]) + solution = get_solution_text(problems[test_qid]) + + test_example = create_one_example(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=True) + examples.append(test_example) + + # create the prompt input + prompt_input = '\n\n'.join(examples) + + return prompt_input + + +def build_prompt_gpt4(problems, shot_qids, test_qid, args): + + prompt_array = [{"role": "system", "content": "You are a helpful assistant."}] + + # n-shot training examples + for qid in shot_qids: + question = get_question_text(problems[qid]) + context = get_context_text(problems[qid], args.use_caption) + choice = get_choice_text(problems[qid], args.options) + answer = get_answer(problems[qid], args.options) + lecture = get_lecture_text(problems[qid]) + solution = get_solution_text(problems[qid]) + + user_prompt, assistant_prompt = create_one_example_gpt4(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=False) + prompt_array.append(user_prompt) + prompt_array.append(assistant_prompt) + + # test example + question = get_question_text(problems[test_qid]) + context = get_context_text(problems[test_qid], args.use_caption) + choice = get_choice_text(problems[test_qid], args.options) + answer = get_answer(problems[test_qid], args.options) + lecture = get_lecture_text(problems[test_qid]) + solution = get_solution_text(problems[test_qid]) + + user_prompt, assistant_prompt = create_one_example_gpt4(args.prompt_format, + question, + context, + choice, + answer, + lecture, + 
solution, + test_example=True) + prompt_array.append(user_prompt) + prompt_array.append(assistant_prompt) + + return prompt_array \ No newline at end of file diff --git a/groundingLMM/LLaVA/scripts/convert_vizwiz_for_submission.py b/groundingLMM/LLaVA/scripts/convert_vizwiz_for_submission.py new file mode 100644 index 0000000000000000000000000000000000000000..7836d19f573d30e4224f2f89a53104acf03efb91 --- /dev/null +++ b/groundingLMM/LLaVA/scripts/convert_vizwiz_for_submission.py @@ -0,0 +1,47 @@ +import os +import argparse +import json + +from llava.eval.m4c_evaluator import EvalAIAnswerProcessor + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--annotation-file', type=str, required=True) + parser.add_argument('--result-file', type=str, required=True) + parser.add_argument('--result-upload-file', type=str, required=True) + return parser.parse_args() + + +if __name__ == '__main__': + + args = parse_args() + + os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) + + results = [] + error_line = 0 + for line_idx, line in enumerate(open(args.result_file)): + try: + results.append(json.loads(line)) + except: + error_line += 1 + results = {x['question_id']: x['text'] for x in results} + test_split = [json.loads(line) for line in open(args.annotation_file)] + split_ids = set([x['question_id'] for x in test_split]) + + print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') + + all_answers = [] + + answer_processor = EvalAIAnswerProcessor() + + for x in test_split: + assert x['question_id'] in results + all_answers.append({ + 'image': x['image'], + 'answer': answer_processor(results[x['question_id']]) + }) + + with open(args.result_upload_file, 'w') as f: + json.dump(all_answers, f) diff --git a/groundingLMM/LLaVA/scripts/extract_mm_projector.py b/groundingLMM/LLaVA/scripts/extract_mm_projector.py new file mode 100644 index 0000000000000000000000000000000000000000..45be31e896e9c087093bd9bcb6d355ec6dfd11ab --- /dev/null +++ b/groundingLMM/LLaVA/scripts/extract_mm_projector.py @@ -0,0 +1,47 @@ +""" +This is just a utility that I use to extract the projector for quantized models. +It is NOT necessary at all to train, or run inference/serve demos. +Use this script ONLY if you fully understand its implications. +""" + + +import os +import argparse +import torch +import json +from collections import defaultdict + + +def parse_args(): + parser = argparse.ArgumentParser(description='Extract MMProjector weights') + parser.add_argument('--model-path', type=str, help='model folder') + parser.add_argument('--output', type=str, help='output file') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + + keys_to_match = ['mm_projector'] + ckpt_to_key = defaultdict(list) + try: + model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) + for k, v in model_indices['weight_map'].items(): + if any(key_match in k for key_match in keys_to_match): + ckpt_to_key[v].append(k) + except FileNotFoundError: + # Smaller models or model checkpoints saved by DeepSpeed. 
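+        # In this case, all weights live in a single 'pytorch_model.bin', so scan its keys directly.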
+ v = 'pytorch_model.bin' + for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): + if any(key_match in k for key_match in keys_to_match): + ckpt_to_key[v].append(k) + + loaded_weights = {} + + for ckpt_name, weight_keys in ckpt_to_key.items(): + ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') + for k in weight_keys: + loaded_weights[k] = ckpt[k] + + torch.save(loaded_weights, args.output) diff --git a/groundingLMM/LLaVA/scripts/finetune_qlora.sh b/groundingLMM/LLaVA/scripts/finetune_qlora.sh new file mode 100644 index 0000000000000000000000000000000000000000..c2ed4c030cb7a3fff79f47a8e681f4df7c989100 --- /dev/null +++ b/groundingLMM/LLaVA/scripts/finetune_qlora.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! + +# Uncomment and set the following variables correspondingly to run this script: + +################## VICUNA ################## +# PROMPT_VERSION=v1 +# MODEL_VERSION="vicuna-v1-3-7b" +################## VICUNA ################## + +################## LLaMA-2 ################## +# PROMPT_VERSION="llava_llama_2" +# MODEL_VERSION="llama-2-7b-chat" +################## LLaMA-2 ################## + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --lora_enable True \ + --bits 4 \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path ./playground/data/llava_instruct_80k.json \ + --image_folder /path/to/coco/train2017 \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --lazy_preprocess True \ + --dataloader_num_workers 4 \ + --report_to wandb diff --git a/groundingLMM/LLaVA/scripts/merge_lora_weights.py b/groundingLMM/LLaVA/scripts/merge_lora_weights.py new file mode 100644 index 0000000000000000000000000000000000000000..3b39cc7beb12301379af7daebbb5553fa92093ea --- /dev/null +++ b/groundingLMM/LLaVA/scripts/merge_lora_weights.py @@ -0,0 +1,22 @@ +import argparse +from llava.model.builder import load_pretrained_model +from llava.mm_utils import get_model_name_from_path + + +def merge_lora(args): + model_name = get_model_name_from_path(args.model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') + + model.save_pretrained(args.save_model_path) + tokenizer.save_pretrained(args.save_model_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, required=True) + parser.add_argument("--model-base", type=str, required=True) + parser.add_argument("--save-model-path", type=str, required=True) + + args = parser.parse_args() + + merge_lora(args) diff --git a/groundingLMM/LLaVA/scripts/pretrain.sh b/groundingLMM/LLaVA/scripts/pretrain.sh new file mode 100644 index 0000000000000000000000000000000000000000..83f263dd570e447b3b009542d26688ce936436af --- /dev/null +++ b/groundingLMM/LLaVA/scripts/pretrain.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! + +# Uncomment and set the following variables correspondingly to run this script: + +# MODEL_VERSION=vicuna-v1-3-7b +# MODEL_VERSION=llama-2-7b-chat + +########### DO NOT CHANGE ########### +########### USE THIS FOR BOTH ########### +PROMPT_VERSION=plain +########### DO NOT CHANGE ########### + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path /path/to/pretrain_data.json \ + --image_folder /path/to/images \ + --vision_tower openai/clip-vit-large-patch14 \ + --tune_mm_mlp_adapter True \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 24000 \ + --save_total_limit 1 \ + --learning_rate 2e-3 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/groundingLMM/LLaVA/scripts/upload_pypi.sh b/groundingLMM/LLaVA/scripts/upload_pypi.sh new file mode 100644 index 0000000000000000000000000000000000000000..c46597a2cdf85da52b4b109ddf2a103bea72364b --- /dev/null +++ b/groundingLMM/LLaVA/scripts/upload_pypi.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Step 0: Clean up +rm -rf dist + +# Step 1: Change the package name to "llava-torch" +sed -i 's/name = "llava"/name = "llava-torch"/' pyproject.toml + +# Step 2: Build the package +python -m build + +# Step 3: Revert the changes in pyproject.toml to the original +sed -i 's/name = "llava-torch"/name = "llava"/' pyproject.toml + +# Step 4: Upload to PyPI +python -m twine upload dist/* diff --git a/groundingLMM/LLaVA/scripts/zero2.json b/groundingLMM/LLaVA/scripts/zero2.json new file mode 100644 index 0000000000000000000000000000000000000000..c95ebefe07b7d8d9fd0936a014679d07102cc270 --- /dev/null +++ b/groundingLMM/LLaVA/scripts/zero2.json @@ -0,0 +1,23 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "zero_optimization": { + "stage": 2, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto" + } +} \ No newline at end of file diff --git a/groundingLMM/dataset/caption_datasets/COCO_Caption_ds.py b/groundingLMM/dataset/caption_datasets/COCO_Caption_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc59e96680d6d050ff57699922e9a1924693776 --- /dev/null +++ b/groundingLMM/dataset/caption_datasets/COCO_Caption_ds.py @@ -0,0 +1,124 @@ +import os +import cv2 +import random +import torch +import torch.nn.functional as F +from pycocotools.coco import COCO +from transformers import CLIPImageProcessor +from model.llava import conversation as conversation_lib +from model.SAM.utils.transforms import ResizeLongestSide +from tools.utils import DEFAULT_IMAGE_TOKEN +from dataset.utils.utils import CAPTION_QUESTIONS + + +class CocoCapDataset(torch.utils.data.Dataset): + IMG_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1) + IMG_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1) + IMG_SIZE = 1024 + IGNORE_LABEL = 255 + + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=10000, precision="fp32", + image_size=224, num_classes_per_sample=3, max_gt_per_img=10, validation=False, random_sampling=True): + self.epoch_samples = epoch_samples + self.num_classes_per_sample = num_classes_per_sample + + self.dataset_dir = dataset_dir + self.image_size = image_size + self.tokenizer = tokenizer + self.precision = precision + self.transform = ResizeLongestSide(image_size) + self.global_enc_processor = CLIPImageProcessor.from_pretrained(global_image_encoder) + + self.max_gt_per_img = max_gt_per_img + self.validation = validation + self.random_sampling = random_sampling + + # Defining paths + mode = "val" if validation else "train" + self.base_dir = os.path.join(dataset_dir, "coco_2017") + self.image_folder = os.path.join(dataset_dir, f"coco_2017/{mode}2017") + json_files = {'validation': 
"captions_val2017.json", 'training': "captions_train2017.json"} + annotations_file = os.path.join(self.base_dir, "annotations", + json_files['validation'] if validation else json_files['training']) + self.data_infos = self._load_annotations(annotations_file) + + self.begin_str = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + mode = "Val" if validation else "Train" + print('\033[92m' + "----CAP-{}: COCO Caption dataset initialized----".format(mode) + '\033[0m') + + def _load_annotations(self, annotation_file): + self.coco_api = COCO(annotation_file) + ann_ids = self.coco_api.getAnnIds() + # Limiting anns to 1000(optional) for validation + ann_ids = ann_ids[:1000] if self.validation else ann_ids + images_info = [] + for i, id in enumerate(ann_ids): + annotation = self.coco_api.loadAnns([id])[0] + image_id = annotation['image_id'] + image_info = self.coco_api.loadImgs([image_id])[0] + image_info['filename'] = image_info['file_name'].split('_')[-1] + images_info.append(image_info) + return images_info + + def _parse_ann_info(self, annotation): + return {'caption': annotation['caption'].strip()} + + def __getitem__(self, idx): + ann_id = random.choice(self.coco_api.getAnnIds()) + annotation = self.coco_api.loadAnns(ann_id)[0] + image_info = self.coco_api.loadImgs([annotation['image_id']])[0] + + # Extract caption from annotation + caption_info = self._parse_ann_info(annotation) + + data = {"image_path": os.path.join(self.image_folder, image_info['file_name']), + "filename": image_info['file_name'], + "caption": caption_info['caption'], + } + + processed_data = self.process_data(data) + return processed_data + + def __len__(self): + return len(self.data_infos) + + def grounding_enc_processor(self, x: torch.Tensor) -> torch.Tensor: + x = (x - self.IMG_MEAN) / self.IMG_STD + h, w = x.shape[-2:] + x = F.pad(x, (0, self.IMG_SIZE - w, 0, self.IMG_SIZE - h)) + return x + + def create_conversations(self, labels): + conversations = [] + questions = [] + conv = conversation_lib.default_conversation.copy() + conv.messages = [] + + question = random.choice(CAPTION_QUESTIONS).strip() + answer = labels + + conv.append_message(conv.roles[0], self.begin_str + question) + conv.append_message(conv.roles[1], answer) + prompt = conv.get_prompt() + conversations.append(prompt) + return questions, conversations + + def process_data(self, data_item): + caption = data_item['caption'] + image_path = data_item['image_path'] + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + # Prepare input for Global Image Encoder + global_enc_image = self.global_enc_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + # Skip input for Grounding Image Encoder + grounding_enc_image = None + image_resize = None + + masks, bboxes = None, None + + questions, conversations = self.create_conversations(caption) + label = None + selected_labels = [caption] + + return (image_path, global_enc_image, grounding_enc_image, bboxes, conversations, masks, label, image_resize, + questions, selected_labels) diff --git a/groundingLMM/dataset/caption_datasets/GranD_ShortCaption_ds.py b/groundingLMM/dataset/caption_datasets/GranD_ShortCaption_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..cafc670c0743bde009aecfd7119c2f1eb4cd2ac7 --- /dev/null +++ b/groundingLMM/dataset/caption_datasets/GranD_ShortCaption_ds.py @@ -0,0 +1,105 @@ +import os +import cv2 +import lmdb +import json +import random +import torch +import torch.nn.functional as F +from 
transformers import CLIPImageProcessor +from model.llava import conversation as conversation_lib +from model.SAM.utils.transforms import ResizeLongestSide +from dataset.utils.utils import CAPTION_QUESTIONS +from tools.utils import DEFAULT_IMAGE_TOKEN + + +class GrandShortCaptionDataset(torch.utils.data.Dataset): + IMG_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1) + IMG_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1) + IMG_SIZE = 1024 + IGNORE_LABEL = 255 + + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=10000, precision="fp32", + image_size=224, num_classes_per_sample=3, validation=False, random_sampling=True): + + self.dataset_dir = dataset_dir + self.image_size = image_size + self.tokenizer = tokenizer + self.precision = precision + self.transform = ResizeLongestSide(image_size) + self.global_enc_processor = CLIPImageProcessor.from_pretrained(global_image_encoder) + self.epoch_samples = epoch_samples + self.num_classes_per_sample = num_classes_per_sample + self.validation = validation + self.random_sampling = random_sampling + + # Defining paths + self.base_dir = os.path.join(dataset_dir, "GranD_Data") + self.image_folder = os.path.join(self.base_dir, "images") + ann_file_name = "Grand_Caption_Grounding_lmdb" + ann_path = os.path.join(self.base_dir, ann_file_name) + self.annos = lmdb.open(ann_path, readonly=True, max_readers=1, lock=False, readahead=False, meminit=False) + mode = "Val" if validation else "Train" + self.data_infos = self._load_annotations(os.path.join(self.base_dir, ann_file_name, f'{ann_file_name}_{mode}.txt')) + self.begin_str = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + print('\033[92m' + "----CAP-{}: Grand Short Caption dataset initialized----".format(mode) + '\033[0m') + + def _load_annotations(self, ann_file): + with open(ann_file, 'r') as f: + data_infos = [line.strip() for line in f if line.strip()] + data_infos = data_infos[0: 1000] if self.validation else data_infos + return data_infos + + def __len__(self): + return len(self.data_infos) + + def grounding_enc_processor(self, x: torch.Tensor) -> torch.Tensor: + x = (x - self.IMG_MEAN) / self.IMG_STD + h, w = x.shape[-2:] + x = F.pad(x, (0, self.IMG_SIZE - w, 0, self.IMG_SIZE - h)) + return x + + def create_conversations(self, labels): + conversations = [] + questions = [] + conv = conversation_lib.default_conversation.copy() + conv.messages = [] + + question = random.choice(CAPTION_QUESTIONS).strip() + answer = labels + + conv.append_message(conv.roles[0], self.begin_str + question) + conv.append_message(conv.roles[1], answer) + prompt = conv.get_prompt() + conversations.append(prompt) + return questions, conversations + + def __getitem__(self, idx): + image_name = self.data_infos[idx] if (self.validation or not self.random_sampling) else self.data_infos[ + random.randint(0, len(self.data_infos) - 1)] + # Get the annotation from lmdb + with self.annos.begin() as txn: + json_contents = txn.get(image_name.encode()) + json_contents = json.loads(json_contents.decode('utf-8')) + ann_info = random.choice(json_contents[image_name]) + # Process the image + image_path = os.path.join(self.image_folder, image_name) + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + # Prepare input for Global Image Encoder + global_enc_image = self.global_enc_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + # Skip input for Grounding Image Encoder + grounding_enc_image = None + image_resize = None + 
bboxes = None + + caption = ann_info["caption"] + questions, conversations = self.create_conversations(caption) + selected_labels = conversations + + masks = torch.rand(0, *image_resize) + label = None + + assert len(conversations) == 1 + + return (image_path, global_enc_image, grounding_enc_image, bboxes, conversations, masks, label, image_resize, + questions, selected_labels) \ No newline at end of file diff --git a/groundingLMM/dataset/caption_datasets/LLavaInstruct_vqa_ds.py b/groundingLMM/dataset/caption_datasets/LLavaInstruct_vqa_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..838cb2ecf1c19cb569ee409fc8b5af3cb042b6c9 --- /dev/null +++ b/groundingLMM/dataset/caption_datasets/LLavaInstruct_vqa_ds.py @@ -0,0 +1,107 @@ +import os +import cv2 +import json +import random +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor +from model.llava import conversation as conversation_lib +from model.SAM.utils.transforms import ResizeLongestSide +from tools.utils import DEFAULT_IMAGE_TOKEN + + +class LLaVAInstructDataset(torch.utils.data.Dataset): + IMG_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1) + IMG_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1) + IMG_SIZE = 1024 + IGNORE_LABEL = 255 + + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=10000, precision="fp32", + image_size=224, num_classes_per_sample=3, validation=False, random_sampling=True): + + self.dataset_dir = dataset_dir + self.image_size = image_size + self.tokenizer = tokenizer + self.precision = precision + self.transform = ResizeLongestSide(image_size) + self.global_enc_processor = CLIPImageProcessor.from_pretrained(global_image_encoder) + self.epoch_samples = epoch_samples + self.num_classes_per_sample = num_classes_per_sample + self.validation = validation + self.random_sampling = random_sampling + + # Defining paths + mode = "val" if validation else "train" + self.base_dir = os.path.join(dataset_dir, "llava_dataset") + self.image_folder = os.path.join(dataset_dir, f"coco_2017/{mode}2017") + annotations_file = os.path.join(self.base_dir, "llava_instruct_150k.json") + self.data_infos = self._load_annotations(annotations_file) + print('\033[92m' + "----CAP-{}: LLaVA-Instruct VQA dataset initialized----".format(mode) + '\033[0m') + + def _load_annotations(self, ann_file): + with open(ann_file, 'r') as f: + data_infos = json.load(f) + data_infos = data_infos[0: 1000] if self.validation else data_infos + return data_infos + + def __len__(self): + return len(self.vqa_data) + + def grounding_enc_processor(self, x: torch.Tensor) -> torch.Tensor: + x = (x - self.IMG_MEAN) / self.IMG_STD + h, w = x.shape[-2:] + x = F.pad(x, (0, self.IMG_SIZE - w, 0, self.IMG_SIZE - h)) + return x + + def create_conversations(self, conv_ann): + # Preprocess: + for sentence in conv_ann: + if DEFAULT_IMAGE_TOKEN in sentence["value"]: + sentence["value"] = (sentence["value"].replace(DEFAULT_IMAGE_TOKEN, "").strip()) + sentence["value"] = DEFAULT_IMAGE_TOKEN + "\n" + sentence["value"] + sentence["value"] = sentence["value"].strip() + if "mmtag" in conversation_lib.default_conversation.version: + sentence["value"] = sentence["value"].replace( + DEFAULT_IMAGE_TOKEN, "" + DEFAULT_IMAGE_TOKEN + "" + ) + conversations = [] + conv = conversation_lib.default_conversation.copy() + conv.messages = [] + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + if roles[conv_ann[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from 
human + conv_ann = conv_ann[1:] + + for j, sentence in enumerate(conv_ann): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{j}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + questions = conversations + + return questions, conversations + + def __getitem__(self, idx): + ann_info = self.data_infos[idx] if (self.validation or not self.random_sampling) else self.data_infos[ + random.randint(0, len(self.data_infos) - 1)] + image_path = os.path.join(self.image_folder, ann_info["image"]) + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + # Prepare input for Global Image Encoder + global_enc_image = self.global_enc_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + # Skip input for Grounding Image Encoder + grounding_enc_image = None + image_resize = None + bboxes = None + + conv_ann = ann_info["conversations"] + questions, conversations = self.create_conversations(conv_ann) + selected_labels = conversations + + masks = None + label = None + + assert len(conversations) == 1 + + return (image_path, global_enc_image, grounding_enc_image, bboxes, conversations, masks, label, image_resize, + questions, selected_labels) \ No newline at end of file diff --git a/groundingLMM/dataset/gcg_datasets/GranDf_gcg_ds.py b/groundingLMM/dataset/gcg_datasets/GranDf_gcg_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..9d5d338440c68244f31872f4192fceb94f9a9d23 --- /dev/null +++ b/groundingLMM/dataset/gcg_datasets/GranDf_gcg_ds.py @@ -0,0 +1,353 @@ +import os +import cv2 +import json +import random +import numpy as np +from PIL import Image +import torch +import torch.nn.functional as F +from pycocotools import mask +from pycocotools.coco import COCO +from transformers import CLIPImageProcessor +from model.llava import conversation as conversation_lib +from model.SAM.utils.transforms import ResizeLongestSide +from tools.utils import DEFAULT_IMAGE_TOKEN +from dataset.utils.utils import GCG_QUESTIONS + + +class GCGBaseDataset(torch.utils.data.Dataset): + """ + Dataset Class for Grounded Conversation Generation (GCG) proposed in GLaMM. 
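+    Each sample pairs an image with a caption whose grounded phrases are wrapped in <p> ... </p> tags, each followed by a [SEG] token and matched to a binary segmentation mask.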
+ """ + CLASSES = ('object',) + IMG_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1) + IMG_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1) + IMG_SIZE = 1024 + IGNORE_LABEL = 255 + + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, validation=False, random_sampling=True, + image_dir='', json_path=''): + self.epoch_samples = epoch_samples + self.num_classes_per_sample = num_classes_per_sample + self.dataset_dir = dataset_dir + self.image_size = image_size + self.tokenizer = tokenizer + self.precision = precision + self.transform = ResizeLongestSide(image_size) + self.global_enc_processor = CLIPImageProcessor.from_pretrained(global_image_encoder) + self.validation = validation + self.random_sampling = random_sampling + + self.question_templates = GCG_QUESTIONS + self.begin_str = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + self.validation = validation + + # Defining paths + self.base_dir = os.path.join(dataset_dir, "GranDf") + self.image_folder = os.path.join(image_dir) + self.ann_file = os.path.join(self.base_dir, "annotations", "train", json_path) + self.data_infos = self._load_annotations(self.ann_file) + + def _load_annotations(self, ann_file): + with open(ann_file, 'r') as f: + data_infos = json.load(f) + data_infos = data_infos[0: 1000] if self.validation else data_infos + return data_infos + + def _parse_annotations(self, ann_info): + image_path = os.path.join(self.image_folder, ann_info['file_name']) + annotations = {'labels': [], 'caption': [], 'masks': [], 'tokens_positive': [], + 'file_name': ann_info['file_name']} + width, height = Image.open(image_path).size + annotations['caption'] = ann_info['caption'].strip('"').strip() + + for word, grounding in ann_info["groundings"].items(): + annotations['labels'].append(word) + annotations['tokens_positive'].append(grounding["token_positives"]) + + # Convert segmentation to binary mask + binary_mask = np.zeros((height, width), dtype=np.uint8) + for rle in grounding["rle_masks"]: + m = mask.decode(rle).astype(np.uint8) + binary_mask += m.squeeze() + annotations['masks'].append(binary_mask) + + return annotations + + def __getitem__(self, index): + while True: + ann_info = self.data_infos[index] if (self.validation or not self.random_sampling) \ + else self.data_infos[random.randint(0, len(self.data_infos) - 1)] + # Parse annotation info + ann = self._parse_annotations(ann_info) + image_path = os.path.join(self.image_folder, ann['file_name']) + if len(ann['labels']) > 0: + break + else: + index = random.randint(0, len(self.data_infos) - 1) + data_item = {"image_path": image_path, "filename": ann['file_name'], "caption": ann['caption'], + "labels": ann['labels'], "masks": ann['masks'], "tokens_positive": ann['tokens_positive']} + return self.process_data(data_item) + + def __len__(self): + return len(self.data_infos) + + def grounding_enc_processor(self, x: torch.Tensor) -> torch.Tensor: + x = (x - self.IMG_MEAN) / self.IMG_STD + h, w = x.shape[-2:] + x = F.pad(x, (0, self.IMG_SIZE - w, 0, self.IMG_SIZE - h)) + return x + + def create_conversations(self, caption, tokens_positive): + question = random.choice(self.question_templates).strip() + + # Prepare caption with tags + def tag_caption(caption, tokens): + for start, end in sorted(tokens, key=lambda x: x[0], reverse=True): + caption = f"{caption[:start]}
<p> {caption[start:end]} </p>
[SEG]{caption[end:]}" + return caption + + detailed_answer = tag_caption(caption, tokens_positive) + + conversations = [] + conv = conversation_lib.default_conversation.copy() + conv.messages = [] + conv.append_message(conv.roles[0], self.begin_str + question) + conv.append_message(conv.roles[1], detailed_answer) + conversations.append(conv.get_prompt()) + questions = [question] + return questions, conversations + + def process_data(self, data_item): + data_labels = data_item['labels'] + masks = data_item['masks'] + caption = data_item['caption'] + tokens_positive = data_item['tokens_positive'] + image_path = data_item['image_path'] + + # Function to sort elements based on the start index of each phrase + def sort_by_start_index(items, order): + return [items[i] for i in order] + + # Sort phrases based on their appearance in the sentence + phrase_order = sorted(range(len(tokens_positive)), key=lambda x: tokens_positive[x][0]) + masks = sort_by_start_index(masks, phrase_order) + data_labels = sort_by_start_index(data_labels, phrase_order) + tokens_positive = sort_by_start_index(tokens_positive, phrase_order) + + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + # Prepare input for Global Image Encoder + global_enc_image = self.global_enc_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + # Prepare input for Grounding Image Encoder + image = self.transform.apply_image(image) + image_resize = image.shape[:2] + grounding_enc_image = self.grounding_enc_processor(torch.from_numpy(image).permute(2, 0, 1).contiguous()) + bboxes = None + + questions, conversations = self.create_conversations(caption, tokens_positive) + masks = np.stack(masks, axis=0) + masks = torch.from_numpy(masks) + label = torch.ones(masks.shape[1:], dtype=torch.long) * self.IGNORE_LABEL + selected_labels = data_labels + + return ( + image_path, global_enc_image, grounding_enc_image, bboxes, conversations, masks, label, image_resize, questions, + selected_labels) + + +class GranDfDataset(GCGBaseDataset): + """ + Human annotated dataset proposed in GLaMM as part of GranDf dataset. 
+ """ + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, validation=False, random_sampling=True): + self.base_dir = os.path.join(dataset_dir, "GranDf") + json_path = "GranDf_HA_GCG_train.json" + image_dir = os.path.join(self.base_dir, "GranDf_HA_images", "train") + mode = "Val" if validation else "Train" + + super().__init__( + dataset_dir, tokenizer, global_image_encoder, epoch_samples, precision, image_size, num_classes_per_sample, + validation, random_sampling, image_dir, json_path, ) + print('\033[92m' + "----GCG-{}: GranDf-GCG dataset initialized----".format(mode) + '\033[0m') + + +class OpenPsgGCGDataset(GCGBaseDataset): + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, validation=False, random_sampling=True): + json_files = {'validation': "OpenPsgGCG_val.json", 'training': "OpenPsgGCG_train.json"} + json_path = json_files['validation'] if validation else json_files['training'] + image_dir = os.path.join("coco_2017", "train2017") + mode = "Val" if validation else "Train" + + super().__init__( + dataset_dir, tokenizer, global_image_encoder, epoch_samples, precision, image_size, num_classes_per_sample, + validation, random_sampling, image_dir, json_path, ) + print('\033[92m' + "----GCG-{}: OpenPSG-GCG dataset initialized----".format(mode) + '\033[0m') + + +class Flickr30kGCGDataset(GCGBaseDataset): + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, validation=False, random_sampling=True): + json_files = {'validation': "flickr_mergedGT_GCG_val.json", 'training': "flickr_mergedGT_GCG_train.json"} + json_path = json_files['validation'] if validation else json_files['training'] + image_dir = os.path.join("flikcr_30k", "train") + mode = "Val" if validation else "Train" + + super().__init__( + dataset_dir, tokenizer, global_image_encoder, epoch_samples, precision, image_size, num_classes_per_sample, + validation, random_sampling, image_dir, json_path, ) + # Filter out images smaller than the minimum size + self.data_infos = [self.data_infos[i] for i in self._filter_images(min_size=32)] + self.validation = validation + print('\033[92m' + "----GCG-{}: Flickr30k-GCG dataset initialized----".format(mode) + '\033[0m') + + def _load_annotations(self, ann_file): + # Load annotations and filter out images with very short captions + self.coco = COCO(ann_file) + self.image_ids = self.coco.getImgIds() + data_infos = [] + total_ann_ids = [] + removed_img_count = 0 + for img_id in self.image_ids: + if len(data_infos) == 1000 and self.validation: + # Only limited images for validation + break + info = self.coco.loadImgs([img_id])[0] + if len(info['caption'].split(' ')) < 3: + removed_img_count += 1 + continue + info['filename'] = info['file_name'].split('_')[-1] + info['height'] = int(info['height']) + info['width'] = int(info['width']) + data_infos.append(info) + ann_ids = self.coco.getAnnIds(imgIds=[img_id]) + total_ann_ids.extend(ann_ids) + assert len(set(total_ann_ids)) == len(total_ann_ids), f"Non-unique annotation IDs in '{ann_file}'!" 
+ print(f'Removed {removed_img_count} images.') + return data_infos + + def _filter_images(self, min_size): + return [i for i, info in enumerate(self.data_infos) if min(info['width'], info['height']) >= min_size] + + def _parse_annotations(self, img_info, ann_info): + annotations = {'bboxes': [], 'labels': [], 'bboxes_ignore': [], 'caption': img_info['caption'], 'masks': [], + 'tokens_positive': []} + for ann in ann_info: + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0 or ann['area'] <= 0 or w < 1 or h < 1: + continue + bbox = [x1, y1, x1 + w, y1 + h] + annotations['bboxes'].append(bbox) + tokens_positive = ann['tokens_positive'] + gt_label = [img_info['caption'][span[0]:span[1]] for span in tokens_positive] + annotations['labels'].append(gt_label[0]) + annotations['tokens_positive'].append(tokens_positive[0]) + + rle = ann['sam_mask'] + mask_decoded = mask.decode(rle).astype(np.uint8) + annotations['masks'].append(mask_decoded) + + # Convert bounding boxes to numpy arrays + annotations['bboxes'] = np.array(annotations['bboxes'], dtype=np.float32) if annotations[ + 'bboxes'] else np.zeros((0, 4), dtype=np.float32) + annotations['bboxes_ignore'] = np.array(annotations['bboxes_ignore'], dtype=np.float32) if annotations[ + 'bboxes_ignore'] else np.zeros((0, 4), dtype=np.float32) + + return annotations + + def __getitem__(self, index): + img_info = self.data_infos[index] if (self.validation or not self.random_sampling) \ + else self.data_infos[random.randint(0, len(self.data_infos) - 1)] + ann_ids = self.coco.getAnnIds(imgIds=img_info['id']) + ann_info = self.coco.loadAnns(ann_ids) + image_path = os.path.join(self.image_folder, img_info['file_name']) + # Parse annotation info + ann = self._parse_annotations(img_info, ann_info) + data_item = {"image_path": image_path, "filename": img_info['file_name'], "width": img_info['width'], + "height": img_info['height'], "bbox": ann['bboxes'], "caption": ann['caption'], + "labels": ann['labels'], "masks": ann['masks'], "tokens_positive": ann['tokens_positive']} + return self.process_data(data_item) + + +class RefCOCOgGCGDataset(GCGBaseDataset): + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, validation=False, random_sampling=True): + json_files = {'validation': "RefCOCOg_GCG_val.json", 'training': "RefCOCOg_GCG_train.json"} + json_path = json_files['validation'] if validation else json_files['training'] + image_dir = os.path.join("coco_2014", "train2014") + mode = "Val" if validation else "Train" + + super().__init__( + dataset_dir, tokenizer, global_image_encoder, epoch_samples, precision, image_size, num_classes_per_sample, + validation, random_sampling, image_dir, json_path, ) + print('\033[92m' + "----GCG-{}: RefCOCOg-GCG dataset initialized----".format(mode) + '\033[0m') + + def _parse_annotations(self, ann_info): + image_path = os.path.join(self.image_folder, ann_info['img_file_name']) + annotations = {'labels': [], 'caption': [], 'masks': [], 'tokens_positive': [], + 'file_name': ann_info['img_file_name']} + width, height = Image.open(image_path).size + orig_caption = ann_info['caption'].strip('"').strip() + annotations['caption'] = orig_caption.lower() + + for detail in ann_info['refs']: + phrase = detail['sentence'] + if phrase.lower() in annotations['caption']: + 
annotations['labels'].append(phrase) + index = annotations['caption'].find(phrase) + end_index = index + len(phrase) if index != -1 else -1 + annotations['tokens_positive'].append([index, end_index]) + + # Convert segmentation to binary mask + binary_mask = np.zeros((height, width), dtype=np.uint8) + for seg in detail["segmentation"]: + rles = mask.frPyObjects([seg], height, width) + m = mask.decode(rles) + m = m.astype(np.uint8) + binary_mask += m.squeeze() + annotations['masks'].append(binary_mask) + + # Sort tokens_positive and corresponding lists + tokens_positive = annotations['tokens_positive'] + sorted_indices = sorted(range(len(tokens_positive)), key=lambda i: tokens_positive[i][0]) + annotations['tokens_positive'] = [tokens_positive[i] for i in sorted_indices] + annotations['masks'] = [annotations['masks'][i] for i in sorted_indices] + annotations['labels'] = [annotations['labels'][i] for i in sorted_indices] + + # Trimming overlapping intervals + for i in range(len(tokens_positive)): + for j in range(i + 1, len(tokens_positive)): + # If there is overlap + if tokens_positive[i][1] >= tokens_positive[j][0]: + # Modify the end index of phrase i to be one less than the start index of phrase j + tokens_positive[i][1] = tokens_positive[j][0] - 1 + # Modify the phrases to reflect the change in indices + annotations['labels'][i] = orig_caption[tokens_positive[i][0]:tokens_positive[i][1] + 1] + break # Exit inner loop since i was modified + + return annotations + + def __getitem__(self, index): + while True: + ann_dict = self.data_infos[index] if (self.validation or not self.random_sampling) \ + else self.data_infos[random.randint(0, len(self.data_infos) - 1)] + ann_info = next(iter(ann_dict.values())) + # Parse annotation info + ann = self._parse_annotations(ann_info) + image_path = os.path.join(self.image_folder, ann['file_name']) + # Check if len(gt_phrases) > 0 and if True, break the loop + if len(ann['labels']) > 0: + break + else: + index = random.randint(0, len(self.data_infos) - 1) + data_item = {"image_path": image_path, "filename": ann['file_name'], "caption": ann['caption'], + "labels": ann['labels'], "masks": ann['masks'], "tokens_positive": ann['tokens_positive']} + + return self.process_data(data_item) diff --git a/groundingLMM/dataset/region_datasets/Flickr_Region_ds.py b/groundingLMM/dataset/region_datasets/Flickr_Region_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..a6b7840c6f9836754c5ed74d69fb87eba3ce4fed --- /dev/null +++ b/groundingLMM/dataset/region_datasets/Flickr_Region_ds.py @@ -0,0 +1,193 @@ +import os +import cv2 +import numpy as np +import random +import torch +import torch.nn.functional as F +from pycocotools.coco import COCO +from transformers import CLIPImageProcessor +from model.llava import conversation as conversation_lib +from model.SAM.utils.transforms import ResizeLongestSide +from tools.utils import DEFAULT_IMAGE_TOKEN +from dataset.utils.utils import REGION_QUESTIONS, REGION_GROUP_QUESTIONS + + +class Flickr30kRegDataset(torch.utils.data.Dataset): + CLASSES = ('object',) + IMG_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1) + IMG_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1) + IMG_SIZE = 1024 + IGNORE_LABEL = 255 + + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, max_gt_per_img=10, validation=False, random_sampling=True): + self.epoch_samples = epoch_samples + self.num_classes_per_sample = 
num_classes_per_sample + self.dataset_dir = dataset_dir + self.image_size = image_size + self.tokenizer = tokenizer + self.precision = precision + self.transform = ResizeLongestSide(image_size) + self.global_enc_processor = CLIPImageProcessor.from_pretrained(global_image_encoder) + self.max_gt_per_img = max_gt_per_img + self.validation = validation + self.random_sampling = random_sampling + + self.base_dir = os.path.join(dataset_dir, "flikcr_30k") + self.image_folder = os.path.join(self.base_dir, "flickr30k-images") + self.ann_file = os.path.join(self.base_dir, "mdetr_annotations", "final_flickr_mergedGT_train.json") + + self.data_infos = self._load_annotations(self.ann_file) + self.data_infos = [self.data_infos[i] for i in self._filter_images(min_size=32)] + self.id_cap_dict = dict() + self.begin_str = f"The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n" + print('\033[92m' + "----REGION-Train: Loaded Flickr30k dataset ----" + '\033[0m') + + def _load_annotations(self, ann_file): + self.coco = COCO(ann_file) + img_ids = self.coco.getImgIds() + data_infos = [] + for img_id in img_ids: + info = self.coco.loadImgs([img_id])[0] + if len(info['caption'].split(' ')) < 3: + continue + info['filename'] = info['file_name'].split('_')[-1] + info['height'] = int(info['height']) + info['width'] = int(info['width']) + data_infos.append(info) + return data_infos + + def _filter_images(self, min_size): + return [i for i, info in enumerate(self.data_infos) if min(info['width'], info['height']) >= min_size] + + def _parse_annotations(self, img_info, ann_info): + annotations = {'bboxes': [], 'labels': [], 'bboxes_ignore': [], 'masks_ann': []} + self.cat_ids = self.coco.getCatIds(catNms=self.CLASSES) + self.id_cap_dict = dict() + self.id_cap_dict[img_info['file_name']] = img_info['caption'] + + for ann in ann_info: + if ann.get('ignore', False) or ann['area'] <= 0 or ann['bbox'][2] < 1 or ann['bbox'][3] < 1: + continue + bbox = self._get_valid_bbox(ann['bbox'], img_info['width'], img_info['height']) + if bbox: + if ann.get('iscrowd', False): + annotations['bboxes_ignore'].append(bbox) + else: + annotations['bboxes'].append(bbox) + gt_list = [img_info['caption'][atp[0]:atp[1]] for atp in ann['tokens_positive']] + annotations['labels'].append(gt_list[0]) + annotations['masks_ann'].append(ann.get('segmentation', None)) + + annotations['bboxes'] = np.array(annotations['bboxes'], dtype=np.float32) if annotations[ + 'bboxes'] else np.zeros((0, 4), dtype=np.float32) + annotations['bboxes_ignore'] = np.zeros((0, 4), dtype=np.float32) + return annotations + + def _get_valid_bbox(self, bbox, img_width, img_height): + x1, y1, w, h = bbox + inter_w = max(0, min(x1 + w, img_width) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_height) - max(y1, 0)) + if inter_w * inter_h == 0: + return None + return [x1, y1, x1 + w, y1 + h] + + def __getitem__(self, index): + img_info = self.data_infos[index] if (self.validation or not self.random_sampling) \ + else self.data_infos[random.randint(0, len(self.data_infos) - 1)] + ann_info = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_info['id'])) + ann = self._parse_annotations(img_info, ann_info) + + data_item = { + "image_path": os.path.join(self.image_folder, img_info['file_name']), + "filename": img_info['file_name'], + "width": img_info['width'], + "height": img_info['height'], + "bbox": ann['bboxes'], + "caption": img_info['caption'], + "labels": ann['labels'], + } + + return self.process_data(data_item) + + def __len__(self): + return len(self.coco.imgs) + + 
def grounding_enc_processor(self, x: torch.Tensor) -> torch.Tensor: + x = (x - self.IMG_MEAN) / self.IMG_STD + h, w = x.shape[-2:] + x = F.pad(x, (0, self.IMG_SIZE - w, 0, self.IMG_SIZE - h)) + return x + + def region_enc_processor(self, orig_size, post_size, bboxes, labels, device): + orig_h, orig_w = orig_size + post_h, post_w = post_size + y_scale = post_h / orig_h + x_scale = post_w / orig_w + shuffle_ids = torch.randperm(len(labels)) + if len(shuffle_ids) > self.max_gt_per_img: + shuffle_ids_reg_question = shuffle_ids[:self.max_gt_per_img] + selected_labels = [labels[i] for i in shuffle_ids_reg_question] + else: + selected_labels = [labels[i] for i in shuffle_ids] + selected_bboxes = bboxes[shuffle_ids] + # Ensure selected_bboxes is two-dimensional + if len(selected_bboxes.shape) == 1: + selected_bboxes = np.expand_dims(selected_bboxes, axis=0) + + selected_bboxes[:, [0, 2]] *= x_scale + selected_bboxes[:, [1, 3]] *= y_scale + selected_bboxes = torch.tensor(selected_bboxes, device=device, dtype=torch.float32) / post_h + return selected_bboxes, selected_labels + + def create_conversations(self, labels, caption): + # DETAILED QUESTION (About all objects - answer is caption + # (bbox order does not matter because all objects are used) + questions = [] + question = random.choice(REGION_GROUP_QUESTIONS).strip() + region_string = '' + for i in range(len(labels)): + region_string = region_string + f'region{i + 1} ,' + detailed_question = question.replace('', region_string) + questions.append(detailed_question) + detailed_answer = caption + + conversations = [] + conv = conversation_lib.default_conversation.copy() + conv.messages = [] + + # Start with question of all regions - Create message with roles: + conv.append_message(conv.roles[0], self.begin_str + detailed_question) + conv.append_message(conv.roles[1], detailed_answer) + for i, reg_answer in enumerate(labels): + reg_question = random.choice(REGION_QUESTIONS).replace('', f'region{i + 1} ').strip() + conv.append_message(conv.roles[0], reg_question) + conv.append_message(conv.roles[1], reg_answer) + conversations.append(conv.get_prompt()) + return questions, conversations + + def process_data(self, data_item): + data_labels = data_item['labels'] + data_bboxes = data_item['bbox'] + caption = data_item['caption'] + image_path = data_item['image_path'] + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + orig_h, orig_w = image.shape[:2] + # Prepare input for Global Image Encoder + global_enc_image = self.global_enc_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + post_h, post_w = global_enc_image.shape[1:3] + # Skip input for Grounding Image Encoder + grounding_enc_image = None + image_resize = None + # Prepare input for Region Image Encoder + bboxes, selected_labels = self.region_enc_processor( + (orig_h, orig_w), (post_h, post_w), data_bboxes, data_labels, global_enc_image.device + ) + masks = None + + questions, conversations = self.create_conversations(selected_labels, caption) + label = None + + return (image_path, global_enc_image, grounding_enc_image, bboxes, conversations, masks, label, image_resize, + questions, selected_labels) diff --git a/groundingLMM/dataset/region_datasets/GranD_ReferringRegion_ds.py b/groundingLMM/dataset/region_datasets/GranD_ReferringRegion_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..5c186f1be1bc8622f58405493f18a0ec4cf53212 --- /dev/null +++ b/groundingLMM/dataset/region_datasets/GranD_ReferringRegion_ds.py @@ -0,0 +1,162 @@ 
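+# GranD referring-expression region dataset: annotations are read from an LMDB store and each region bounding box is paired with its attribute description to build region-level conversations.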
+import os +import cv2 +import lmdb +import json +import numpy as np +import random +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor +from model.llava import conversation as conversation_lib +from model.SAM.utils.transforms import ResizeLongestSide +from tools.utils import DEFAULT_IMAGE_TOKEN +from dataset.utils.utils import REGION_QUESTIONS + + +class GrandReferRegDataset(torch.utils.data.Dataset): + CLASSES = ('object',) + IMG_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1) + IMG_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1) + IMG_SIZE = 1024 + IGNORE_LABEL = 255 + + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, max_gt_per_img=10, validation=False, random_sampling=True): + self.epoch_samples = epoch_samples + self.num_classes_per_sample = num_classes_per_sample + self.dataset_dir = dataset_dir + self.image_size = image_size + self.tokenizer = tokenizer + self.precision = precision + self.transform = ResizeLongestSide(image_size) + self.global_enc_processor = CLIPImageProcessor.from_pretrained(global_image_encoder) + self.max_gt_per_img = max_gt_per_img + self.validation = validation + self.random_sampling = random_sampling + + # Defining paths + self.base_dir = os.path.join(dataset_dir, "GranD_Data") + self.image_folder = os.path.join(self.base_dir, "images") + ann_file_name = "Grand_Referring_Expression_lmdb" + ann_path = os.path.join(self.base_dir, ann_file_name) + self.annos = lmdb.open(ann_path, readonly=True, max_readers=1, lock=False, readahead=False, meminit=False) + mode = "Val" if validation else "Train" + self.data_infos = self._load_annotations( + os.path.join(self.base_dir, ann_file_name, f'{ann_file_name}_{mode}.txt') + ) + self.begin_str = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + self.question_templates = REGION_QUESTIONS + print('\033[92m' + "----REGION-{}: GranD Referring Region dataset initialized----".format(mode) + '\033[0m') + + def _load_annotations(self, ann_file): + with open(ann_file, 'r') as f: + data_infos = [line.strip() for line in f if line.strip()] + data_infos = data_infos[0: 1000] if self.validation else data_infos + return data_infos + + def _parse_annotations(self, ann_info): + annotations = {'bboxes': [], 'labels': []} + for ann in ann_info: + bbox = ann['bbox'] + if bbox: + annotations['bboxes'].append(bbox) + annotations['labels'].append(ann['attribute']) + + annotations['bboxes'] = np.array(annotations['bboxes'], dtype=np.float32) if annotations[ + 'bboxes'] else np.zeros((0, 4), dtype=np.float32) + return annotations + + def __getitem__(self, index): + image_name = self.data_infos[index] if (self.validation or not self.random_sampling) \ + else self.data_infos[random.randint(0, len(self.data_infos) - 1)] + image_path = os.path.join(self.image_folder, image_name) + # Get the annotation from lmdb + with self.annos.begin() as txn: + json_contents = txn.get(image_name.encode()) + json_contents = json.loads(json_contents.decode('utf-8')) + ann_info = json_contents[image_name] + ann = self._parse_annotations(ann_info) + + data_item = { + "image_path": image_path, + "filename": image_name, + "bbox": ann['bboxes'], + "labels": ann['labels'], + } + + return self.process_data(data_item) + + def __len__(self): + return len(self.coco.imgs) + + def grounding_enc_processor(self, x: torch.Tensor) -> torch.Tensor: + x = (x - self.IMG_MEAN) / self.IMG_STD + h, w = 
x.shape[-2:] + x = F.pad(x, (0, self.IMG_SIZE - w, 0, self.IMG_SIZE - h)) + return x + + def region_enc_processor(self, orig_size, post_size, bboxes, labels, device): + orig_h, orig_w = orig_size + post_h, post_w = post_size + y_scale = post_h / orig_h + x_scale = post_w / orig_w + shuffle_ids = torch.randperm(len(labels)) + if len(shuffle_ids) > self.max_gt_per_img: + shuffle_ids_reg_question = shuffle_ids[:self.max_gt_per_img] + selected_labels = [labels[i] for i in shuffle_ids_reg_question] + else: + selected_labels = [labels[i] for i in shuffle_ids] + selected_bboxes = bboxes[shuffle_ids] + # Ensure selected_bboxes is two-dimensional + if len(selected_bboxes.shape) == 1: + selected_bboxes = np.expand_dims(selected_bboxes, axis=0) + + selected_bboxes[:, [0, 2]] *= x_scale + selected_bboxes[:, [1, 3]] *= y_scale + selected_bboxes = torch.tensor(selected_bboxes, device=device, dtype=torch.float32) / post_h + return selected_bboxes, selected_labels + + def create_conversations(self, labels, question_templates): + questions = [] + answers = [] + for i, label in enumerate(labels): + question = random.choice(question_templates).strip().replace('', f'region{i + 1} ') + questions.append(question) + answers.append(label) + + conversations = [] + conv = conversation_lib.default_conversation.copy() + conv.messages = [] + for i, (question, answer) in enumerate(zip(questions, answers)): + if i == 0: + question = self.begin_str + question + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], answer) + conversations.append(conv.get_prompt()) + return questions, conversations + + def process_data(self, data_item): + data_labels = data_item['labels'] + data_bboxes = data_item['bbox'] + + image_path = data_item['image_path'] + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + orig_h, orig_w = image.shape[:2] + # Prepare input for Global Image Encoder + global_enc_image = self.global_enc_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + post_h, post_w = global_enc_image.shape[1:3] + # Skip input for Grounding Image Encoder + grounding_enc_image = None + image_resize = None + # Prepare input for Region Image Encoder + bboxes, selected_labels = self.region_enc_processor( + (orig_h, orig_w), (post_h, post_w), data_bboxes, data_labels, global_enc_image.device + ) + masks = None + + questions, conversations = self.create_conversations(selected_labels, question_templates=self.question_templates) + label = None + + return (image_path, global_enc_image, grounding_enc_image, bboxes, conversations, masks, label, image_resize, + questions, selected_labels) diff --git a/groundingLMM/dataset/region_datasets/RefCOCO_VG_Region_ds.py b/groundingLMM/dataset/region_datasets/RefCOCO_VG_Region_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..69ad3e49ff8216504eabc193f314bb406f7be5b8 --- /dev/null +++ b/groundingLMM/dataset/region_datasets/RefCOCO_VG_Region_ds.py @@ -0,0 +1,300 @@ +import os +import cv2 +import random +import numpy as np +import torch +from pycocotools.coco import COCO +import torch.nn.functional as F +from transformers import CLIPImageProcessor +from model.llava import conversation as conversation_lib +from model.SAM.utils.transforms import ResizeLongestSide +from tools.utils import DEFAULT_IMAGE_TOKEN +from dataset.utils.utils import REGION_QUESTIONS + + +class RegionBaseDataset(torch.utils.data.Dataset): + CLASSES = ('object',) + IMG_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1) + 
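The grounding_enc_processor used throughout these datasets normalizes with fixed per-channel statistics and right/bottom-pads to a square 1024x1024 input, in the style of SAM preprocessing. A standalone sketch of the same idea, with illustrative names only:

import torch
import torch.nn.functional as F

IMG_MEAN = torch.tensor([123.675, 116.28, 103.53]).view(-1, 1, 1)
IMG_STD = torch.tensor([58.395, 57.12, 57.375]).view(-1, 1, 1)
IMG_SIZE = 1024

def pad_and_normalize(x: torch.Tensor) -> torch.Tensor:
    # x is a CxHxW float tensor whose longest side is already <= IMG_SIZE.
    x = (x - IMG_MEAN) / IMG_STD
    h, w = x.shape[-2:]
    # F.pad takes (left, right, top, bottom); only the right and bottom grow.
    return F.pad(x, (0, IMG_SIZE - w, 0, IMG_SIZE - h))

out = pad_and_normalize(torch.zeros(3, 768, 1024))
assert out.shape == (3, 1024, 1024)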
IMG_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1) + IMG_SIZE = 1024 + IGNORE_LABEL = 255 + + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, max_gt_per_img=10, validation=False, dataset_name='', + image_dir='', json_path='', intro_string='', question_templates=None, random_sampling=True): + self.epoch_samples = epoch_samples + self.num_classes_per_sample = num_classes_per_sample + self.dataset_dir = dataset_dir + self.image_size = image_size + self.tokenizer = tokenizer + self.precision = precision + self.transform = ResizeLongestSide(image_size) + self.global_enc_processor = CLIPImageProcessor.from_pretrained(global_image_encoder) + self.max_gt_per_img = max_gt_per_img + self.validation = validation + self.random_sampling = random_sampling + + # Dataset type specific + self.begin_str = intro_string + self.base_dir = os.path.join(dataset_dir, dataset_name) + self.ann_file = os.path.join(self.base_dir, json_path) + self.question_templates = question_templates + self.image_folder = os.path.join(self.base_dir, image_dir) + + self.data_infos = self._load_annotations(self.ann_file) + self.data_infos = [self.data_infos[i] for i in self._filter_images(min_size=32)] + + def _load_annotations(self, ann_file): + self.coco = COCO(ann_file) + img_ids = self.coco.getImgIds() + data_infos = [] + for img_id in img_ids: + if self.validation and len(data_infos) == 1000: + # limited images during validation + break + info = self.coco.loadImgs([img_id])[0] + info['filename'] = info['file_name'].split('_')[-1] + info['height'] = int(info['height']) + info['width'] = int(info['width']) + data_infos.append(info) + return data_infos + + def _filter_images(self, min_size): + return [i for i, info in enumerate(self.data_infos) if min(info['width'], info['height']) >= min_size] + + def _parse_annotations(self, img_info, ann_info): + annotations = {'bboxes': [], 'labels': [], 'bboxes_ignore': [], 'masks_ann': [], + 'seg_map': img_info['file_name'].replace('jpg', 'png')} + + for ann in ann_info: + if ann.get('ignore', False) or ann['area'] <= 0 or ann['bbox'][2] < 1 or ann['bbox'][3] < 1: + continue + bbox = self._get_valid_bbox(ann['bbox'], img_info['width'], img_info['height']) + if bbox: + annotations['bboxes'].append(bbox) + annotations['labels'].append(img_info['caption'].strip()) + + annotations['bboxes'] = np.array(annotations['bboxes'], dtype=np.float32) if annotations[ + 'bboxes'] else np.zeros((0, 4), dtype=np.float32) + annotations['bboxes_ignore'] = np.zeros((0, 4), dtype=np.float32) + return annotations + + def _get_valid_bbox(self, bbox, img_width, img_height): + x1, y1, w, h = bbox + inter_w = max(0, min(x1 + w, img_width) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_height) - max(y1, 0)) + if inter_w * inter_h == 0: + return None + return [x1, y1, x1 + w, y1 + h] + + def __getitem__(self, index): + img_info = self.data_infos[index] if (self.validation or not self.random_sampling) \ + else self.data_infos[random.randint(0, len(self.data_infos) - 1)] + ann_info = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_info['id'])) + ann = self._parse_annotations(img_info, ann_info) + + data_item = { + "image_path": os.path.join(self.image_folder, img_info['file_name']), + "width": img_info['width'], + "height": img_info['height'], + "bbox": ann['bboxes'], + "caption": img_info['caption'], + "labels": ann['labels'], + "seg_map": ann['seg_map'], + } + + return self.process_data(data_item) + + def 
__len__(self): + return len(self.data_infos) + + def grounding_enc_processor(self, x: torch.Tensor) -> torch.Tensor: + x = (x - self.IMG_MEAN) / self.IMG_STD + h, w = x.shape[-2:] + x = F.pad(x, (0, self.IMG_SIZE - w, 0, self.IMG_SIZE - h)) + return x + + def region_enc_processor(self, orig_size, post_size, bboxes, labels, device): + orig_h, orig_w = orig_size + post_h, post_w = post_size + y_scale = post_h / orig_h + x_scale = post_w / orig_w + shuffle_ids = torch.randperm(len(labels))[:self.max_gt_per_img] + selected_bboxes = bboxes[shuffle_ids] + + # Ensure selected_bboxes is two-dimensional + if len(selected_bboxes.shape) == 1: + selected_bboxes = np.expand_dims(selected_bboxes, axis=0) + + selected_labels = [labels[i] for i in shuffle_ids] + selected_bboxes[:, [0, 2]] *= x_scale + selected_bboxes[:, [1, 3]] *= y_scale + selected_bboxes = torch.tensor(selected_bboxes, device=device, dtype=torch.float32) / post_h + return selected_bboxes, selected_labels + + def create_conversations(self, labels, question_templates): + questions = [] + answers = [] + for i, label in enumerate(labels): + question = random.choice(question_templates).strip().replace('', f'region{i + 1} ') + questions.append(question) + answers.append(label) + + conversations = [] + conv = conversation_lib.default_conversation.copy() + conv.messages = [] + for i, (question, answer) in enumerate(zip(questions, answers)): + if i == 0: + question = self.begin_str + question + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], answer) + conversations.append(conv.get_prompt()) + return questions, conversations + + def process_data(self, data_item): + data_labels = data_item['labels'] + data_bboxes = data_item['bbox'] + + image_path = data_item['image_path'] + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + orig_h, orig_w = image.shape[:2] + # Prepare input for Global Image Encoder + global_enc_image = self.global_enc_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + post_h, post_w = global_enc_image.shape[1:3] + # Skip input for Grounding Image Encoder + grounding_enc_image = None + image_resize = None + # Prepare input for Region Image Encoder + bboxes, selected_labels = self.region_enc_processor((orig_h, orig_w), (post_h, post_w), data_bboxes, data_labels, + global_enc_image.device) + masks = None + + questions, conversations = self.create_conversations( + selected_labels, question_templates=self.question_templates + ) + label = None + + return (image_path, global_enc_image, grounding_enc_image, bboxes, conversations, masks, label, image_resize, + questions, selected_labels) + + +class RefCocoRegDataset(RegionBaseDataset): + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, max_gt_per_img=10, validation=False, random_sampling=True): + intro_string = DEFAULT_IMAGE_TOKEN + "\n" + ("I will provide you with only one region containing only one " + "object, although there may be other objects present in the " + "image. 
It is recommended that you describe the object's " + "relative position with respect to other objects in the image, " + "as well as its position within the image and its basic " + "attributes.") + json_path = os.path.join("mdetr_annotations", "finetune_refcoco_train.json") + dataset_name = "RefCoco_Reg" + image_dir = "coco_2014" + question_templates = ['',] + mode = "Val" if validation else "Train" + + super().__init__( + dataset_dir, tokenizer, global_image_encoder, epoch_samples, precision, image_size, num_classes_per_sample, + max_gt_per_img, validation, dataset_name, image_dir, json_path, + intro_string, question_templates, random_sampling + ) + print('\033[92m' + "----REGION-{}: Loaded RefCOCO dataset ----".format(mode) + '\033[0m') + + +class RefCocoGRegDataset(RegionBaseDataset): + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, max_gt_per_img=10, validation=False, random_sampling=True): + intro_string = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + dataset_name = "RefCoco_Reg" + json_files = {'validation': "finetune_refcocog_val.json", 'training': "finetune_refcocog_train.json"} + json_path = os.path.join("mdetr_annotations", json_files['validation'] if validation else json_files['training']) + image_dir = "coco_2014" + question_templates = REGION_QUESTIONS + mode = "Val" if validation else "Train" + + super().__init__( + dataset_dir, tokenizer, global_image_encoder, epoch_samples, precision, image_size, num_classes_per_sample, + max_gt_per_img, validation, dataset_name, image_dir, json_path, + intro_string, question_templates, random_sampling + ) + print('\033[92m' + "----REGION-{}: Loaded RefCOCO-G dataset ----".format(mode) + '\033[0m') + + +class RefCocoPRegDataset(RegionBaseDataset): + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, max_gt_per_img=10, validation=False, random_sampling=True): + intro_string = DEFAULT_IMAGE_TOKEN + "\n" + ("I will provide you with only one region containing only one " + "object, although there may be other objects present in the " + "image. 
It is recommended that you describe the object's " + "relative position with respect to other objects in the image, " + "as well as its position within the image and its basic " + "attributes.") + dataset_name = "RefCoco_Reg" + json_files = {'validation': "finetune_refcoco+_val.json", 'training': "finetune_refcoco+_train.json"} + json_path = os.path.join( + "mdetr_annotations", json_files['validation'] if validation else json_files['training'] + ) + image_dir = "coco_2014" + question_templates = ['', ] + mode = "Val" if validation else "Train" + + super().__init__( + dataset_dir, tokenizer, global_image_encoder, epoch_samples, precision, image_size, num_classes_per_sample, + max_gt_per_img, validation, dataset_name, image_dir, json_path, + intro_string, question_templates, random_sampling + ) + print('\033[92m' + "----REGION-{}: Loaded RefCOCO-P dataset ----".format(mode) + '\033[0m') + + +class VisualGenomeRegDataset(RegionBaseDataset): + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=8000, precision="fp32", + image_size=224, num_classes_per_sample=3, max_gt_per_img=10, validation=False, random_sampling=True): + intro_string = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + dataset_name = "visual_genome" + json_files = {'validation': "test_caption.json", 'training': "train.json"} + json_path = json_files['validation'] if validation else json_files['training'] + image_dir = "images" + question_templates = REGION_QUESTIONS + mode = "Val" if validation else "Train" + + super().__init__( + dataset_dir, tokenizer, global_image_encoder, epoch_samples, precision, image_size, num_classes_per_sample, + max_gt_per_img, validation, dataset_name, image_dir, json_path, + intro_string, question_templates, random_sampling + ) + print('\033[92m' + "----REGION-{}: Loaded VisualGenome dataset ----".format(mode) + '\033[0m') + + def _parse_annotations(self, img_info, ann_info): + annotations = {'bboxes': [], 'labels': [], } + + for ann in ann_info: + if ann.get('ignore', False): + continue + # Check for valid area and dimensions + if ann['area'] <= 0 or ann['bbox'][2] < 1 or ann['bbox'][3] < 1: + continue + bbox = self._get_valid_bbox(ann['bbox'], img_info['width'], img_info['height']) + if bbox: + annotations['bboxes'].append(bbox) + annotations['labels'].append(ann['caption'].strip()) + + annotations['bboxes'] = np.array(annotations['bboxes'], dtype=np.float32) if annotations[ + 'bboxes'] else np.zeros((0, 4), dtype=np.float32) + return annotations + + def __getitem__(self, index): + img_info = self.data_infos[index] if (self.validation or not self.random_sampling) \ + else self.data_infos[random.randint(0, len(self.data_infos) - 1)] + ann_info = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_info['id'])) + ann = self._parse_annotations(img_info, ann_info) + + data_item = { + "image_path": os.path.join(self.image_folder, img_info['file_name']), + "width": img_info['width'], + "height": img_info['height'], + "bbox": ann['bboxes'], + "labels": ann['labels'], + } + + return self.process_data(data_item) diff --git a/groundingLMM/dataset/segm_datasets/GranD_ReferringSegm_ds.py b/groundingLMM/dataset/segm_datasets/GranD_ReferringSegm_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..57c8a8a2797d2089a0dbdbdb3f30adf3b1589274 --- /dev/null +++ b/groundingLMM/dataset/segm_datasets/GranD_ReferringSegm_ds.py @@ -0,0 +1,156 @@ +import os +import cv2 +import random +import lmdb +import json +import numpy as np +import torch +import 
torch.nn.functional as F +from pycocotools import mask +from transformers import CLIPImageProcessor +from model.llava import conversation as conversation_lib +from model.SAM.utils.transforms import ResizeLongestSide +from tools.utils import DEFAULT_IMAGE_TOKEN +from dataset.utils.utils import ANSWER_LIST, SEG_QUESTIONS + + +class GrandReferSegmDataset(torch.utils.data.Dataset): + CLASSES = ('object',) + IMG_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1) + IMG_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1) + IMG_SIZE = 1024 + IGNORE_LABEL = 255 + + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=500 * 8 * 2 * 10, + precision: str = "fp32", image_size: int = 224, num_classes_per_sample: int = 3, + validation=False, split='train', random_sampling=True, inference=False): + self.epoch_samples = epoch_samples + self.num_classes_per_sample = num_classes_per_sample + + self.dataset_dir = dataset_dir + self.image_size = image_size + self.tokenizer = tokenizer + self.precision = precision + self.transform = ResizeLongestSide(image_size) + self.global_enc_processor = CLIPImageProcessor.from_pretrained(global_image_encoder) + + self.question_templates = SEG_QUESTIONS + self.answer_list = ANSWER_LIST + self.begin_str = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + self.validation = validation + self.random_sampling = random_sampling + # Defining paths + self.base_dir = os.path.join(dataset_dir, "GranD_Data") + self.image_folder = os.path.join(self.base_dir, "images") + ann_file_name = "Grand_Referring_Expression_lmdb" + ann_path = os.path.join(self.base_dir, ann_file_name) + self.annos = lmdb.open(ann_path, readonly=True, max_readers=1, lock=False, readahead=False, meminit=False) + mode = "Val" if validation else "Train" + self.data_infos = self._load_annotations( + os.path.join(self.base_dir, ann_file_name, f'{ann_file_name}_{mode}.txt') + ) + print('\033[92m' + "----SEGM-{}: GranD Referring Segm dataset initialized----".format(mode) + '\033[0m') + + def _load_annotations(self, ann_file): + with open(ann_file, 'r') as f: + data_infos = [line.strip() for line in f if line.strip()] + data_infos = data_infos[0: 1000] if self.validation else data_infos + return data_infos + + def __len__(self): + return len(self.data_infos) + + def grounding_enc_processor(self, x: torch.Tensor) -> torch.Tensor: + x = (x - self.IMG_MEAN) / self.IMG_STD + h, w = x.shape[-2:] + x = F.pad(x, (0, self.IMG_SIZE - w, 0, self.IMG_SIZE - h)) + return x + + def create_conversations(self, labels, questions): + questions = [] + answers = [] + for i, label in enumerate(labels): + question = random.choice(questions) + questions.append(question) + answers.append(label) + + conversations = [] + conv = conversation_lib.default_conversation.copy() + conv.messages = [] + for i, (question, answer) in enumerate(zip(questions, answers)): + if i == 0: + question = self.begin_str + question + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], answer) + conversations.append(conv.get_prompt()) + return questions, conversations + + def _parse_annotations(self, ann_info): + annotations = {'masks': [], 'labels': []} + for ann in ann_info: + rle = ann.get("segmentation") + if rle: + m = mask.decode(rle) + m = m.astype(np.uint8) + annotations['masks'].append(m) + annotations['labels'].append(ann['attribute']) + + annotations['bboxes'] = np.array(annotations['masks'], dtype=np.float32) if annotations[ + 'bboxes'] else np.zeros((0, 4), 
dtype=np.float32) + return annotations + + def __getitem__(self, idx): + image_name = self.data_infos[idx] if (self.validation or not self.random_sampling) else self.data_infos[ + random.randint(0, len(self.data_infos) - 1)] + image_path = os.path.join(self.image_folder, image_name) + # Get the annotation from lmdb + with self.annos.begin() as txn: + json_contents = txn.get(image_name.encode()) + json_contents = json.loads(json_contents.decode('utf-8')) + ann_info = json_contents[image_name] + print(image_path) + ann = self._parse_annotations(ann_info) + data_item = {"image_path": image_path, + "filename": image_name, + "bbox": ann['bboxes'], + "labels": ann['labels'], } + + return self.process_data(data_item) + + def process_data(self, data_item): + data_labels = data_item['labels'] + data_masks = data_item['maks'] + + image_path = data_item['image_path'] + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + # Prepare input for Global Image Encoder + global_enc_image = self.global_enc_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + # Prepare input for Grounding Image Encoder + image = self.transform.apply_image(image) + image_resize = image.shape[:2] + grounding_enc_image = self.grounding_enc_processor(torch.from_numpy(image).permute(2, 0, 1).contiguous()) + + # Prepare input for Segmentation module + shuffle_ids = torch.randperm(len(data_labels)) + if len(shuffle_ids) > self.max_gt_per_img: + shuffle_ids_segm_question = shuffle_ids[:self.max_gt_per_img] + selected_labels = [data_labels[i] for i in shuffle_ids_segm_question] + else: + selected_labels = [data_labels[i] for i in shuffle_ids] + selected_masks = data_masks[shuffle_ids] + + masks = np.stack(selected_masks, axis=0) + masks = torch.from_numpy(masks) + + if len(data_labels) == 0: + print(image_path) + + questions, conversations = self.create_conversations( + selected_labels, self.question_templates) + label = torch.ones(grounding_enc_image.shape[1], grounding_enc_image.shape[2]) * self.IGNORE_LABEL + bboxes = None + + return ( + image_path, global_enc_image, grounding_enc_image, bboxes, conversations, masks, label, image_resize, + questions, selected_labels) diff --git a/groundingLMM/dataset/segm_datasets/RefCOCO_Segm_ds.py b/groundingLMM/dataset/segm_datasets/RefCOCO_Segm_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..fc35d0a07e69eb555f571d3c4def5bb36b0c4579 --- /dev/null +++ b/groundingLMM/dataset/segm_datasets/RefCOCO_Segm_ds.py @@ -0,0 +1,242 @@ +import os +import cv2 +import random +import numpy as np +import torch +import torch.nn.functional as F +from pycocotools import mask +from transformers import CLIPImageProcessor +from model.llava import conversation as conversation_lib +from model.SAM.utils.transforms import ResizeLongestSide +from dataset.utils.grefer import G_REFER +from dataset.utils.refcoco_refer import REFER +from tools.utils import DEFAULT_IMAGE_TOKEN +from dataset.utils.utils import ANSWER_LIST, SEG_QUESTIONS + + +class ReferSegmDataset(torch.utils.data.Dataset): + CLASSES = ('object',) + IMG_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1) + IMG_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1) + IMG_SIZE = 1024 + IGNORE_LABEL = 255 + + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=500 * 8 * 2 * 10, + precision: str = "fp32", image_size: int = 224, num_classes_per_sample: int = 3, + refer_segm_data="refcoco||refcoco+||refcocog||refclef", validation=False, split='train', + 
random_sampling=True, inference=False): + self.epoch_samples = epoch_samples + self.num_classes_per_sample = num_classes_per_sample + + self.dataset_dir = dataset_dir + self.image_size = image_size + self.tokenizer = tokenizer + self.precision = precision + self.transform = ResizeLongestSide(image_size) + self.global_enc_processor = CLIPImageProcessor.from_pretrained(global_image_encoder) + + self.question_templates = SEG_QUESTIONS + self.answer_list = ANSWER_LIST + self.begin_str = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + self.validation = validation + self.split = split + self.initialize_refer_segm_data(refer_segm_data, inference) + self.random_sampling = random_sampling + + def initialize_refer_segm_data(self, refer_segm_data, inference=False): + + dataset_dir = os.path.join(self.dataset_dir, "Refer_Segm") + self.refer_seg_ds_list = refer_segm_data.split("||") + # ['refclef', 'refcoco', 'refcoco+', 'refcocog'] + self.refer_segm_data = {} + + for dataset_name in self.refer_seg_ds_list: + splitBy = "umd" if dataset_name == "refcocog" else "unc" + refer_api = G_REFER(dataset_dir, dataset_name, splitBy) if dataset_name == "grefcoco" else\ + REFER(dataset_dir, dataset_name, splitBy) + ref_ids_train = refer_api.getRefIds(split=self.split) + images_ids_train = refer_api.getImgIds(ref_ids=ref_ids_train) + refs_train = refer_api.loadRefs(ref_ids=ref_ids_train) + refer_seg_ds = { + "images": self.load_images(refer_api, images_ids_train, dataset_dir, dataset_name, inference=inference), + "annotations": refer_api.Anns, + "img2refs": self.create_img_to_refs_mapping(refs_train) + } + + print(f"dataset {dataset_name} (refs {splitBy}) ({self.split} split) has {len(refer_seg_ds['images'])} " + f"images and {len(refer_seg_ds['annotations'])} annotations.") + print(f'\033[92m----SEG-{"Val" if self.validation else "Train"}:' + f' Loaded ReferSeg - {dataset_name} dataset ----\033[0m') + + self.refer_segm_data[dataset_name] = refer_seg_ds + + def load_images(self, refer_api, images_ids_train, dataset_dir, dataset_name, inference=False): + images = [] + loaded_images = refer_api.loadImgs(image_ids=images_ids_train) + # Limiting images to 1000(optional) for validation + loaded_images = loaded_images[:1000] if (self.validation and not inference) else loaded_images + for item in loaded_images: + item = item.copy() + if dataset_name == 'refclef': + item["file_name"] = os.path.join(dataset_dir, "images", "saiapr_tc-12", item["file_name"]) + else: + item["file_name"] = os.path.join(dataset_dir.replace("Refer_Segm/", ""), "coco_2014/train2014", + item["file_name"]) + images.append(item) + return images + + def create_img_to_refs_mapping(self, refs_train): + img2refs = {} + for ref in refs_train: + img2refs[ref["image_id"]] = img2refs.get(ref["image_id"], []) + [ref, ] + return img2refs + + def __len__(self): + return self.epoch_samples + + def _set_len(self, length): + self.epoch_samples = length + + def grounding_enc_processor(self, x: torch.Tensor) -> torch.Tensor: + x = (x - self.IMG_MEAN) / self.IMG_STD + h, w = x.shape[-2:] + x = F.pad(x, (0, self.IMG_SIZE - w, 0, self.IMG_SIZE - h)) + return x + + def create_conversations(self, labels): + questions = [] + answers = [] + for i, label in enumerate(labels): + label = label.strip() + assert len(label.split("||")) == 1 + question_template = random.choice(self.question_templates) + questions.append(question_template.format(class_name=label.lower())) + answers.append(random.choice(self.answer_list)) + + conversations = [] + conv = 
conversation_lib.default_conversation.copy() + conv.messages = [] + for i, (question, answer) in enumerate(zip(questions, answers)): + if i == 0: + question = self.begin_str + question + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], answer) + conversations.append(conv.get_prompt()) + return questions, conversations + + def __getitem__(self, idx): + dataset_idx = random.randint(0, len(self.refer_seg_ds_list) - 1) + dataset_name = self.refer_seg_ds_list[dataset_idx] + refer_seg_ds = self.refer_segm_data[dataset_name] + images = refer_seg_ds["images"] + annotations = refer_seg_ds["annotations"] + img2refs = refer_seg_ds["img2refs"] + idx = idx if (self.validation or not self.random_sampling) else random.randint(0, len(images) - 1) + image_info = images[idx] + image_id = image_info["id"] + refs = img2refs[image_id] + if len(refs) == 0: + return self.__getitem__(0) + + sents = [] + ann_ids = [] + for ref in refs: + for sent in ref["sentences"]: + text = sent["sent"] + sents.append(text) + ann_ids.append(ref["ann_id"]) + if len(sents) >= self.num_classes_per_sample: + sampled_inds = np.random.choice( + list(range(len(sents))), size=self.num_classes_per_sample, replace=False + ) + else: + sampled_inds = list(range(len(sents))) + sampled_sents = np.vectorize(sents.__getitem__)(sampled_inds).tolist() + # sampled_ann_ids = np.vectorize(ann_ids.__getitem__)(sampled_inds).tolist() + sampled_ann_ids = [ann_ids[ind] for ind in sampled_inds] + selected_labels = sampled_sents + + # Load and process the image + image_path = image_info["file_name"] + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + global_enc_img = self.global_enc_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + image = self.transform.apply_image(image) + image_resize = image.shape[:2] + grounding_enc_img = self.grounding_enc_processor(torch.from_numpy(image).permute(2, 0, 1).contiguous()) + + # Generate questions and answers + questions, conversations = self.create_conversations(selected_labels) + + flag = False + masks = [] + for ann_id in sampled_ann_ids: + if isinstance(ann_id, list): + flag = True + if -1 in ann_id: + assert len(ann_id) == 1 + m = np.zeros((image_info["height"], image_info["width"])).astype( + np.uint8 + ) + else: + m_final = np.zeros( + (image_info["height"], image_info["width"]) + ).astype(np.uint8) + for ann_id_i in ann_id: + ann = annotations[ann_id_i] + + if len(ann["segmentation"]) == 0: + m = np.zeros( + (image_info["height"], image_info["width"]) + ).astype(np.uint8) + else: + if type(ann["segmentation"][0]) == list: # polygon + rle = mask.frPyObjects( + ann["segmentation"], image_info["height"], image_info["width"], ) + else: + rle = ann["segmentation"] + for i in range(len(rle)): + if not isinstance(rle[i]["counts"], bytes): + rle[i]["counts"] = rle[i]["counts"].encode() + m = mask.decode(rle) + m = np.sum( + m, axis=2 + ) # sometimes there are multiple binary map (corresponding to multiple segs) + m = m.astype(np.uint8) # convert to np.uint8 + m_final = m_final | m + m = m_final + masks.append(m) + continue + + ann = annotations[ann_id] + + if len(ann["segmentation"]) == 0: + m = np.zeros((image_info["height"], image_info["width"])).astype( + np.uint8 + ) + masks.append(m) + continue + + if type(ann["segmentation"][0]) == list: # polygon + rle = mask.frPyObjects( + ann["segmentation"], image_info["height"], image_info["width"] + ) + else: + rle = ann["segmentation"] + for i in range(len(rle)): + if not 
isinstance(rle[i]["counts"], bytes): + rle[i]["counts"] = rle[i]["counts"].encode() + m = mask.decode(rle) + m = np.sum(m, axis=2) # sometimes there are multiple binary map (corresponding to multiple segs) + m = m.astype(np.uint8) # convert to np.uint8 + masks.append(m) + + masks = np.stack(masks, axis=0) + + masks = torch.from_numpy(masks) + label = torch.ones(masks.shape[1], masks.shape[2]) * self.IGNORE_LABEL + # set bboxes to None for segmentation datasets + bboxes = None + + return (image_path, global_enc_img, grounding_enc_img, bboxes, conversations, masks, label, + image_resize, questions, selected_labels) diff --git a/groundingLMM/dataset/segm_datasets/Semantic_Segm_ds.py b/groundingLMM/dataset/segm_datasets/Semantic_Segm_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..b50ff6d113ee57b7d02a39b149c5d86348d32eb8 --- /dev/null +++ b/groundingLMM/dataset/segm_datasets/Semantic_Segm_ds.py @@ -0,0 +1,248 @@ +import os +import cv2 +import glob +import json +import random +import numpy as np +from PIL import Image +import torch +import torch.nn.functional as F +from pycocotools.coco import COCO +from transformers import CLIPImageProcessor +from model.llava import conversation as conversation_lib +from model.SAM.utils.transforms import ResizeLongestSide +from tools.utils import DEFAULT_IMAGE_TOKEN +from dataset.utils.utils import ANSWER_LIST, SEG_QUESTIONS + + +def load_json_file(file_path): + with open(file_path, 'r') as file: + return json.load(file) + + +def init_ade20k(dataset_dir): + ade20k_classes = load_json_file("dataset/utils/ade20k_classes.json") + ade20k_image_dir = os.path.join(dataset_dir, "ade20k", "images", "training") + ade20k_images = [os.path.join(ade20k_image_dir, img) for img in os.listdir(ade20k_image_dir) if + img.endswith('.jpg')] + ade20k_labels = [img.replace(".jpg", ".png").replace("images", "annotations") for img in ade20k_images] + return np.array(ade20k_classes), ade20k_images, ade20k_labels + + +def init_cocostuff(dataset_dir): + with open("dataset/utils/cocostuff_classes.txt") as file: + cocostuff_classes = [line.strip().split(": ")[-1] for line in file.readlines()[1:]] + # Annotations + cocostuff_labels = glob.glob(os.path.join(dataset_dir, "cocostuff", "train2017", "*.png")) + # Images are obtained from COCO 2017 images + cocostuff_images = [label.replace(".png", ".jpg").replace("cocostuff", "coco_2017").replace("Semantic_Segm/", "") for + label in cocostuff_labels] + return np.array(cocostuff_classes), cocostuff_images, cocostuff_labels + + +def init_paco_lvis(dataset_dir): + paco_lvis_api = COCO(os.path.join(dataset_dir, "paco_lvis", "annotations", "paco_lvis_v1_train.json")) + all_classes = paco_lvis_api.loadCats(paco_lvis_api.getCatIds()) + class_map_paco_lvis = {} + + for cat in all_classes: + cat_split = cat["name"].strip().split(":") + if len(cat_split) == 1: + name = cat_split[0].split("_(")[0] + else: + assert len(cat_split) == 2 + obj, part = cat_split + obj = obj.split("_(")[0] + part = part.split("_(")[0] + name = (obj, part) + class_map_paco_lvis[cat["id"]] = name + + img_ids = paco_lvis_api.getImgIds() + return class_map_paco_lvis, img_ids, paco_lvis_api + + +def init_pascal_part(dataset_dir): + pascal_part_api = COCO(os.path.join(dataset_dir, "pascal_part", "train.json")) + all_classes = pascal_part_api.loadCats(pascal_part_api.getCatIds()) + class_map_pascal_part = {} + for cat in all_classes: + cat_main, cat_part = cat["name"].strip().split(":") + name = (cat_main, cat_part) + class_map_pascal_part[cat["id"]] = name 
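The referring-segmentation loader above turns COCO-style annotations into binary masks, handling both polygon and RLE encodings and collapsing multi-part masks. A condensed sketch of that conversion, assuming pycocotools is available; the helper name is illustrative:

import numpy as np
from pycocotools import mask as mask_utils

def ann_to_binary_mask(ann: dict, height: int, width: int) -> np.ndarray:
    # An empty segmentation yields an all-zero mask.
    if len(ann["segmentation"]) == 0:
        return np.zeros((height, width), dtype=np.uint8)
    if isinstance(ann["segmentation"][0], list):          # polygon format
        rles = mask_utils.frPyObjects(ann["segmentation"], height, width)
    else:                                                  # already RLE
        rles = ann["segmentation"]
        for rle in rles:
            if not isinstance(rle["counts"], bytes):
                rle["counts"] = rle["counts"].encode()
    m = mask_utils.decode(rles)        # HxWxN, one channel per part
    return (np.sum(m, axis=2) > 0).astype(np.uint8)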
+ img_ids = pascal_part_api.getImgIds() + return class_map_pascal_part, img_ids, pascal_part_api + + +def init_mapillary(dataset_dir): + mapillary_path = os.path.join(dataset_dir, "mapillary") + mapillary_classes = [cls["readable"].lower() for cls in + load_json_file(os.path.join(mapillary_path, "config_v2.0.json"))["labels"]] + mapillary_labels = sorted(glob.glob(os.path.join(mapillary_path, "training", "v2.0", "labels", "*.png"))) + mapillary_images = [label.replace(".png", ".jpg").replace("v2.0/labels", "images") for label in mapillary_labels] + return np.array(mapillary_classes), mapillary_images, mapillary_labels + + +class SemanticSegmDataset(torch.utils.data.Dataset): + CLASSES = ('object',) + IMG_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1) + IMG_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1) + IMG_SIZE = 1024 + IGNORE_LABEL = 255 + + def __init__(self, dataset_dir, tokenizer, global_image_encoder, epoch_samples=500 * 8 * 2 * 10, + precision: str = "fp32", image_size: int = 224, num_classes_per_sample: int = 3, + semantic_segm_data="ade20k||cocostuff||pascal_part||paco_lvis||mapillary", validation=False, + random_sampling=True): + self.epoch_samples = epoch_samples + self.num_classes_per_sample = num_classes_per_sample + + self.image_size = image_size + self.tokenizer = tokenizer + self.precision = precision + self.transform = ResizeLongestSide(image_size) + self.global_enc_processor = CLIPImageProcessor.from_pretrained(global_image_encoder) + + self.question_templates = SEG_QUESTIONS + self.answer_list = ANSWER_LIST + self.begin_str = f"""The {DEFAULT_IMAGE_TOKEN} provides an overview of the picture.\n""" + self.validation = validation + self.random_sampling = random_sampling + + self.data2list = {} + self.data2classes = {} + self.dataset_dir = os.path.join(dataset_dir, "Semantic_Segm") + self.semantic_seg_ds_list = semantic_segm_data.split("||") + for ds in self.semantic_seg_ds_list: + classes, images, labels = eval("init_{}".format(ds))(self.dataset_dir) + self.data2list[ds] = (images, labels) + self.data2classes[ds] = classes + print(f'\033[92m----SEG-{"Val" if validation else "Train"}: Loaded ReferSeg - {ds} dataset ----\033[0m') + + if "cocostuff" in self.semantic_seg_ds_list: + self.cocostuff_class2index = {c: i for i, c in enumerate(self.data2classes["cocostuff"])} + + def __len__(self): + return self.epoch_samples + + def _set_len(self, length): + self.epoch_samples = length + + def grounding_enc_processor(self, x: torch.Tensor) -> torch.Tensor: + x = (x - self.IMG_MEAN) / self.IMG_STD + h, w = x.shape[-2:] + x = F.pad(x, (0, self.IMG_SIZE - w, 0, self.IMG_SIZE - h)) + return x + + def create_conversations(self, labels, dataset_name): + questions = [] + answers = [] + class_ids = [] + for i, label in enumerate(labels): + label = label.strip() + assert len(label.split("||")) == 1 + question_template = random.choice(self.question_templates) + questions.append(question_template.format(class_name=label.lower())) + answers.append(random.choice(self.answer_list)) + + if dataset_name in ["paco_lvis", "pascal_part"]: + continue + class_id = self.data2classes[dataset_name].tolist().index(label) + class_ids.append(class_id) + + conversations = [] + conv = conversation_lib.default_conversation.copy() + conv.messages = [] + for i, (question, answer) in enumerate(zip(questions, answers)): + if i == 0: + question = self.begin_str + question + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], answer) + 
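Each dataset builds its training text by pairing one question per target with a templated answer and rendering the turns through the conversation template. A minimal stand-alone sketch of that pattern; the template strings, the "<image>" placeholder, and the role formatting are simplified stand-ins for the repository's conversation_lib, not its actual API:

import random

SEG_QUESTIONS = ["Can you segment the {class_name} in this image?"]
ANSWER_LIST = ["Sure, it is [SEG]."]

def build_conversation(labels, begin_str="<image>\n"):
    # One (question, answer) turn per label; the image token is prepended
    # only to the first question, mirroring the datasets above.
    turns = []
    for i, label in enumerate(labels):
        q = random.choice(SEG_QUESTIONS).format(class_name=label.lower())
        if i == 0:
            q = begin_str + q
        turns.append(("USER", q))
        turns.append(("ASSISTANT", random.choice(ANSWER_LIST)))
    return "\n".join(f"{role}: {text}" for role, text in turns)

print(build_conversation(["dog", "frisbee"]))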
conversations.append(conv.get_prompt()) + return questions, conversations, class_ids + + def __getitem__(self, idx): + dataset_idx = random.randint(0, len(self.semantic_seg_ds_list) - 1) + dataset_name = self.semantic_seg_ds_list[dataset_idx] + + if dataset_name in ["paco_lvis", "pascal_part"]: + class_map = self.data2classes[dataset_name] + img_ids, coco_api = self.data2list[dataset_name] + random_idx = random.randint(0, len(img_ids) - 1) + img_info = coco_api.loadImgs([img_ids[random_idx]])[0] + file_name = img_info["file_name"] + image_path = (os.path.join( + self.dataset_dir, dataset_name, "VOCdevkit", "VOC2010", "JPEGImages", file_name + ) if dataset_name == "pascal_part" else self.dataset_dir.replace("Semantic_Segm/", ""), + "coco_2017", file_name) + + annotation_ids = coco_api.getAnnIds(imgIds=img_info["id"]) + annotations = coco_api.loadAnns(annotation_ids) + if not annotations: + return self.__getitem__(0) + + sampled_anns = np.random.choice(annotations, self.num_classes_per_sample, replace=False) if len( + annotations + ) >= self.num_classes_per_sample else annotations + selected_labels = [] + for ann in sampled_anns: + category_id = ann["category_id"] + sampled_cls = class_map[category_id] + if isinstance(sampled_cls, tuple): + obj, part = sampled_cls + name = f"{obj} {part}" if random.random() < 0.5 else f"the {part} of the {obj}" + else: + name = sampled_cls + selected_labels.append(name) + + elif dataset_name in ["ade20k", "cocostuff", "mapillary"]: + images, labels = self.data2list[dataset_name] + idx = idx if (self.validation or not self.random_sampling) else random.randint(0, len(images) - 1) + image_path, label_path = images[idx], labels[idx] + label = np.array(Image.open(label_path)) + if dataset_name == "ade20k": + label = np.where(label == 0, 255, label - 1) + elif dataset_name == "cocostuff": + ignored_classes = [index for class_name, index in self.cocostuff_class2index.items() if + "-" in class_name] + label = np.where(np.isin(label, ignored_classes), 255, label) + + unique_labels = [lbl for lbl in np.unique(label) if lbl != 255] + if not unique_labels: + return self.__getitem__(0) + + classes = [self.data2classes[dataset_name][lbl] for lbl in unique_labels] + selected_labels = np.random.choice( + classes, min(len(classes), self.num_classes_per_sample), replace=False + ) if len(classes) >= self.num_classes_per_sample else classes + + # Load and process the image + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + global_enc_img = self.global_enc_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + image = self.transform.apply_image(image) + image_resize = image.shape[:2] + grounding_enc_img = self.grounding_enc_processor(torch.from_numpy(image).permute(2, 0, 1).contiguous()) + + # Generate questions and answers + questions, conversations, class_ids = self.create_conversations(selected_labels, dataset_name) + if dataset_name in ["paco_lvis", "pascal_part"]: + try: + masks = [coco_api.annToMask(ann) for ann in sampled_anns] + except Exception as e: + print(f"Error generating mask: {e}") + return self.__getitem__(0) + + masks = np.stack(masks, axis=0) + masks = torch.from_numpy(masks) + label = torch.ones(masks.shape[1], masks.shape[2]) * self.IGNORE_LABEL + else: + label = torch.from_numpy(label).long() + masks = torch.stack([label == class_id for class_id in class_ids], dim=0) + + assert len(conversations) == 1 + assert conversations[0].count("[SEG]") == masks.shape[0] + # set bboxes to None for segmentation datasets + 
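For ade20k, cocostuff, and mapillary the targets come from a semantic label map rather than instance annotations: each sampled class id is converted into its own binary mask by comparing against the label image. A small sketch of that step, with made-up shapes and class ids for illustration:

import torch

label_map = torch.tensor([[0, 0, 7],
                          [7, 255, 7],
                          [3, 3, 255]])   # 255 marks ignored pixels
class_ids = [7, 3]

# One boolean HxW mask per selected class, stacked into (num_classes, H, W).
masks = torch.stack([label_map == cid for cid in class_ids], dim=0)
print(masks.shape)       # torch.Size([2, 3, 3])
print(masks[0].int())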
bboxes = None + + return (image_path, global_enc_img, grounding_enc_img, bboxes, conversations, masks, label, + image_resize, questions, selected_labels) diff --git a/groundingLMM/dataset/utils/ade20k_classes.json b/groundingLMM/dataset/utils/ade20k_classes.json new file mode 100644 index 0000000000000000000000000000000000000000..1f96e616bc3fd2f8c0ec4caea975d77c680f44bb --- /dev/null +++ b/groundingLMM/dataset/utils/ade20k_classes.json @@ -0,0 +1,30 @@ +[ + "wall", "building", "sky", "floor", "tree", "ceiling", "road", + "bed", "windowpane", "grass", "cabinet", "sidewalk", + "person", "earth", "door", "table", "mountain", "plant", + "curtain", "chair", "car", "water", "painting", "sofa", + "shelf", "house", "sea", "mirror", "rug", "field", "armchair", + "seat", "fence", "desk", "rock", "wardrobe", "lamp", + "bathtub", "railing", "cushion", "base", "box", "column", + "signboard", "chest of drawers", "counter", "sand", "sink", + "skyscraper", "fireplace", "refrigerator", "grandstand", + "path", "stairs", "runway", "case", "pool table", "pillow", + "screen door", "stairway", "river", "bridge", "bookcase", + "blind", "coffee table", "toilet", "flower", "book", "hill", + "bench", "countertop", "stove", "palm", "kitchen island", + "computer", "swivel chair", "boat", "bar", "arcade machine", + "hovel", "bus", "towel", "light", "truck", "tower", + "chandelier", "awning", "streetlight", "booth", + "television receiver", "airplane", "dirt track", "apparel", + "pole", "land", "bannister", "escalator", "ottoman", "bottle", + "buffet", "poster", "stage", "van", "ship", "fountain", + "conveyer belt", "canopy", "washer", "plaything", + "swimming pool", "stool", "barrel", "basket", "waterfall", + "tent", "bag", "minibike", "cradle", "oven", "ball", "food", + "step", "tank", "trade name", "microwave", "pot", "animal", + "bicycle", "lake", "dishwasher", "screen", "blanket", + "sculpture", "hood", "sconce", "vase", "traffic light", + "tray", "ashcan", "fan", "pier", "crt screen", "plate", + "monitor", "bulletin board", "shower", "radiator", "glass", + "clock", "flag" +] \ No newline at end of file diff --git a/groundingLMM/dataset/utils/cocostuff_classes.txt b/groundingLMM/dataset/utils/cocostuff_classes.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d5a692b83ac8eead2bfffa805e1115cef737bae --- /dev/null +++ b/groundingLMM/dataset/utils/cocostuff_classes.txt @@ -0,0 +1,183 @@ +0: unlabeled +1: person +2: bicycle +3: car +4: motorcycle +5: airplane +6: bus +7: train +8: truck +9: boat +10: traffic light +11: fire hydrant +12: street sign +13: stop sign +14: parking meter +15: bench +16: bird +17: cat +18: dog +19: horse +20: sheep +21: cow +22: elephant +23: bear +24: zebra +25: giraffe +26: hat +27: backpack +28: umbrella +29: shoe +30: eye glasses +31: handbag +32: tie +33: suitcase +34: frisbee +35: skis +36: snowboard +37: sports ball +38: kite +39: baseball bat +40: baseball glove +41: skateboard +42: surfboard +43: tennis racket +44: bottle +45: plate +46: wine glass +47: cup +48: fork +49: knife +50: spoon +51: bowl +52: banana +53: apple +54: sandwich +55: orange +56: broccoli +57: carrot +58: hot dog +59: pizza +60: donut +61: cake +62: chair +63: couch +64: potted plant +65: bed +66: mirror +67: dining table +68: window +69: desk +70: toilet +71: door +72: tv +73: laptop +74: mouse +75: remote +76: keyboard +77: cell phone +78: microwave +79: oven +80: toaster +81: sink +82: refrigerator +83: blender +84: book +85: clock +86: vase +87: scissors +88: teddy bear +89: hair 
drier +90: toothbrush +91: hair brush +92: banner +93: blanket +94: branch +95: bridge +96: building-other +97: bush +98: cabinet +99: cage +100: cardboard +101: carpet +102: ceiling-other +103: ceiling-tile +104: cloth +105: clothes +106: clouds +107: counter +108: cupboard +109: curtain +110: desk-stuff +111: dirt +112: door-stuff +113: fence +114: floor-marble +115: floor-other +116: floor-stone +117: floor-tile +118: floor-wood +119: flower +120: fog +121: food-other +122: fruit +123: furniture-other +124: grass +125: gravel +126: ground-other +127: hill +128: house +129: leaves +130: light +131: mat +132: metal +133: mirror-stuff +134: moss +135: mountain +136: mud +137: napkin +138: net +139: paper +140: pavement +141: pillow +142: plant-other +143: plastic +144: platform +145: playingfield +146: railing +147: railroad +148: river +149: road +150: rock +151: roof +152: rug +153: salad +154: sand +155: sea +156: shelf +157: sky +158: skyscraper +159: snow +160: solid-other +161: stairs +162: stone +163: straw +164: structural-other +165: table +166: tent +167: textile-other +168: towel +169: tree +170: vegetable +171: wall-brick +172: wall-concrete +173: wall-other +174: wall-panel +175: wall-stone +176: wall-tile +177: wall-wood +178: water-other +179: waterdrops +180: window-blind +181: window-other +182: wood diff --git a/groundingLMM/dataset/utils/grefer.py b/groundingLMM/dataset/utils/grefer.py new file mode 100644 index 0000000000000000000000000000000000000000..3c881c5860a2bbfc89eb91b8fcf91cc32c27fbbf --- /dev/null +++ b/groundingLMM/dataset/utils/grefer.py @@ -0,0 +1,352 @@ +""" +grefer v0.1 +This interface provides access to gRefCOCO. + +The following API functions are defined: +G_REFER - REFER api class +getRefIds - get ref ids that satisfy given filter conditions. +getAnnIds - get ann ids that satisfy given filter conditions. +getImgIds - get image ids that satisfy given filter conditions. +getCatIds - get category ids that satisfy given filter conditions. +loadRefs - load refs with the specified ref ids. +loadAnns - load anns with the specified ann ids. +loadImgs - load images with the specified image ids. +loadCats - load category names with the specified category ids. +getRefBox - get ref's bounding box [x, y, w, h] given the ref_id +showRef - show image, segmentation or box of the referred object with the ref +getMaskByRef - get mask and area of the referred object given ref or ref ids +getMask - get mask and area of the referred object given ref +showMask - show mask of the referred object given ref +""" + +import itertools +import json +import os.path as osp +import pickle +import time + +import matplotlib.pyplot as plt +import numpy as np +import skimage.io as io +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon, Rectangle +from pycocotools import mask + + +class G_REFER: + def __init__(self, data_root, dataset="grefcoco", splitBy="unc"): + # provide data_root folder which contains grefcoco + print("loading dataset %s into memory..." 
% dataset) + self.ROOT_DIR = osp.abspath(osp.dirname(__file__)) + self.DATA_DIR = osp.join(data_root, dataset) + if dataset in ["grefcoco"]: + self.IMAGE_DIR = osp.join(data_root, "images/train2014") + else: + raise KeyError("No refer dataset is called [%s]" % dataset) + + tic = time.time() + + # load refs from data/dataset/refs(dataset).json + self.data = {} + self.data["dataset"] = dataset + + ref_file = osp.join(self.DATA_DIR, f"grefs({splitBy}).p") + if osp.exists(ref_file): + self.data["refs"] = pickle.load(open(ref_file, "rb"), fix_imports=True) + else: + ref_file = osp.join(self.DATA_DIR, f"grefs({splitBy}).json") + if osp.exists(ref_file): + self.data["refs"] = json.load(open(ref_file, "rb")) + else: + raise FileNotFoundError("JSON file not found") + + # load annotations from data/dataset/instances.json + instances_file = osp.join(self.DATA_DIR, "instances.json") + instances = json.load(open(instances_file, "r")) + self.data["images"] = instances["images"] + self.data["annotations"] = instances["annotations"] + self.data["categories"] = instances["categories"] + + # create index + self.createIndex() + print("DONE (t=%.2fs)" % (time.time() - tic)) + + @staticmethod + def _toList(x): + return x if isinstance(x, list) else [x] + + @staticmethod + def match_any(a, b): + a = a if isinstance(a, list) else [a] + b = b if isinstance(b, list) else [b] + return set(a) & set(b) + + def createIndex(self): + # create sets of mapping + # 1) Refs: {ref_id: ref} + # 2) Anns: {ann_id: ann} + # 3) Imgs: {image_id: image} + # 4) Cats: {category_id: category_name} + # 5) Sents: {sent_id: sent} + # 6) imgToRefs: {image_id: refs} + # 7) imgToAnns: {image_id: anns} + # 8) refToAnn: {ref_id: ann} + # 9) annToRef: {ann_id: ref} + # 10) catToRefs: {category_id: refs} + # 11) sentToRef: {sent_id: ref} + # 12) sentToTokens: {sent_id: tokens} + print("creating index...") + # fetch info from instances + Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {} + Anns[-1] = None + for ann in self.data["annotations"]: + Anns[ann["id"]] = ann + imgToAnns[ann["image_id"]] = imgToAnns.get(ann["image_id"], []) + [ann] + for img in self.data["images"]: + Imgs[img["id"]] = img + for cat in self.data["categories"]: + Cats[cat["id"]] = cat["name"] + + # fetch info from refs + Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {} + Sents, sentToRef, sentToTokens = {}, {}, {} + availableSplits = [] + for ref in self.data["refs"]: + # ids + ref_id = ref["ref_id"] + ann_id = ref["ann_id"] + category_id = ref["category_id"] + image_id = ref["image_id"] + + if ref["split"] not in availableSplits: + availableSplits.append(ref["split"]) + + # add mapping related to ref + if ref_id in Refs: + print("Duplicate ref id") + Refs[ref_id] = ref + imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref] + + category_id = self._toList(category_id) + added_cats = [] + for cat in category_id: + if cat not in added_cats: + added_cats.append(cat) + catToRefs[cat] = catToRefs.get(cat, []) + [ref] + + ann_id = self._toList(ann_id) + refToAnn[ref_id] = [Anns[ann] for ann in ann_id] + for ann_id_n in ann_id: + annToRef[ann_id_n] = annToRef.get(ann_id_n, []) + [ref] + + # add mapping of sent + for sent in ref["sentences"]: + Sents[sent["sent_id"]] = sent + sentToRef[sent["sent_id"]] = ref + sentToTokens[sent["sent_id"]] = sent["tokens"] + + # create class members + self.Refs = Refs + self.Anns = Anns + self.Imgs = Imgs + self.Cats = Cats + self.Sents = Sents + self.imgToRefs = imgToRefs + self.imgToAnns = imgToAnns + self.refToAnn = refToAnn + 
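createIndex builds several inverted maps (image to refs, ref to anns, sentence to ref) so that later lookups are dictionary reads instead of list scans. The core pattern, reduced to a few lines over toy data; the field names follow the refs structure used above and the values are illustrative:

refs = [
    {"ref_id": 1, "image_id": 10, "ann_id": [100], "sentences": [{"sent_id": 5, "sent": "left dog"}]},
    {"ref_id": 2, "image_id": 10, "ann_id": [101], "sentences": [{"sent_id": 6, "sent": "right dog"}]},
]

img_to_refs, sent_to_ref = {}, {}
for ref in refs:
    # Append-style inverted index: one image can own many refs.
    img_to_refs.setdefault(ref["image_id"], []).append(ref)
    for sent in ref["sentences"]:
        sent_to_ref[sent["sent_id"]] = ref

print([r["ref_id"] for r in img_to_refs[10]])   # [1, 2]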
self.annToRef = annToRef + self.catToRefs = catToRefs + self.sentToRef = sentToRef + self.sentToTokens = sentToTokens + self.availableSplits = availableSplits + print("index created.") + + def getRefIds(self, image_ids=[], cat_ids=[], split=[]): + image_ids = self._toList(image_ids) + cat_ids = self._toList(cat_ids) + split = self._toList(split) + + for s in split: + if s not in self.availableSplits: + raise ValueError(f"Invalid split name: {s}") + + refs = self.data["refs"] + + if len(image_ids) > 0: + lists = [self.imgToRefs[image_id] for image_id in image_ids] + refs = list(itertools.chain.from_iterable(lists)) + if len(cat_ids) > 0: + refs = [ref for ref in refs if self.match_any(ref["category_id"], cat_ids)] + if len(split) > 0: + refs = [ref for ref in refs if ref["split"] in split] + + ref_ids = [ref["ref_id"] for ref in refs] + return ref_ids + + def getAnnIds(self, image_ids=[], ref_ids=[]): + image_ids = self._toList(image_ids) + ref_ids = self._toList(ref_ids) + + if any([len(image_ids), len(ref_ids)]): + if len(image_ids) > 0: + lists = [ + self.imgToAnns[image_id] + for image_id in image_ids + if image_id in self.imgToAnns + ] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.data["annotations"] + ann_ids = [ann["id"] for ann in anns] + if len(ref_ids) > 0: + lists = [self.Refs[ref_id]["ann_id"] for ref_id in ref_ids] + anns_by_ref_id = list(itertools.chain.from_iterable(lists)) + ann_ids = list(set(ann_ids).intersection(set(anns_by_ref_id))) + else: + ann_ids = [ann["id"] for ann in self.data["annotations"]] + + return ann_ids + + def getImgIds(self, ref_ids=[]): + ref_ids = self._toList(ref_ids) + + if len(ref_ids) > 0: + image_ids = list(set([self.Refs[ref_id]["image_id"] for ref_id in ref_ids])) + else: + image_ids = self.Imgs.keys() + return image_ids + + def getCatIds(self): + return self.Cats.keys() + + def loadRefs(self, ref_ids=[]): + return [self.Refs[ref_id] for ref_id in self._toList(ref_ids)] + + def loadAnns(self, ann_ids=[]): + if isinstance(ann_ids, str): + ann_ids = int(ann_ids) + return [self.Anns[ann_id] for ann_id in self._toList(ann_ids)] + + def loadImgs(self, image_ids=[]): + return [self.Imgs[image_id] for image_id in self._toList(image_ids)] + + def loadCats(self, cat_ids=[]): + return [self.Cats[cat_id] for cat_id in self._toList(cat_ids)] + + def getRefBox(self, ref_id): + anns = self.refToAnn[ref_id] + return [ann["bbox"] for ann in anns] # [x, y, w, h] + + def showRef(self, ref, seg_box="seg"): + ax = plt.gca() + # show image + image = self.Imgs[ref["image_id"]] + I = io.imread(osp.join(self.IMAGE_DIR, image["file_name"])) + ax.imshow(I) + # show refer expression + for sid, sent in enumerate(ref["sentences"]): + print("%s. 
%s" % (sid + 1, sent["sent"])) + # show segmentations + if seg_box == "seg": + ann_id = ref["ann_id"] + ann = self.Anns[ann_id] + polygons = [] + color = [] + c = "none" + if type(ann["segmentation"][0]) == list: + # polygon used for refcoco* + for seg in ann["segmentation"]: + poly = np.array(seg).reshape((len(seg) / 2, 2)) + polygons.append(Polygon(poly, True, alpha=0.4)) + color.append(c) + p = PatchCollection( + polygons, + facecolors=color, + edgecolors=(1, 1, 0, 0), + linewidths=3, + alpha=1, + ) + ax.add_collection(p) # thick yellow polygon + p = PatchCollection( + polygons, + facecolors=color, + edgecolors=(1, 0, 0, 0), + linewidths=1, + alpha=1, + ) + ax.add_collection(p) # thin red polygon + else: + # mask used for refclef + rle = ann["segmentation"] + m = mask.decode(rle) + img = np.ones((m.shape[0], m.shape[1], 3)) + color_mask = np.array([2.0, 166.0, 101.0]) / 255 + for i in range(3): + img[:, :, i] = color_mask[i] + ax.imshow(np.dstack((img, m * 0.5))) + # show bounding-box + elif seg_box == "box": + ann_id = ref["ann_id"] + ann = self.Anns[ann_id] + bbox = self.getRefBox(ref["ref_id"]) + box_plot = Rectangle( + (bbox[0], bbox[1]), + bbox[2], + bbox[3], + fill=False, + edgecolor="green", + linewidth=3, + ) + ax.add_patch(box_plot) + + def getMask(self, ann): + if not ann: + return None + if ann["iscrowd"]: + raise ValueError("Crowd object") + image = self.Imgs[ann["image_id"]] + if type(ann["segmentation"][0]) == list: # polygon + rle = mask.frPyObjects(ann["segmentation"], image["height"], image["width"]) + else: + rle = ann["segmentation"] + + m = mask.decode(rle) + m = np.sum( + m, axis=2 + ) # sometimes there are multiple binary map (corresponding to multiple segs) + m = m.astype(np.uint8) # convert to np.uint8 + # compute area + area = sum(mask.area(rle)) # should be close to ann['area'] + return {"mask": m, "area": area} + + def getMaskByRef(self, ref=None, ref_id=None, merge=False): + if not ref and not ref_id: + raise ValueError + if ref: + ann_ids = ref["ann_id"] + ref_id = ref["ref_id"] + else: + ann_ids = self.getAnnIds(ref_ids=ref_id) + + if ann_ids == [-1]: + img = self.Imgs[self.Refs[ref_id]["image_id"]] + return { + "mask": np.zeros([img["height"], img["width"]], dtype=np.uint8), + "empty": True, + } + + anns = self.loadAnns(ann_ids) + mask_list = [self.getMask(ann) for ann in anns if not ann["iscrowd"]] + + if merge: + merged_masks = sum([mask["mask"] for mask in mask_list]) + merged_masks[np.where(merged_masks > 1)] = 1 + return {"mask": merged_masks, "empty": False} + else: + return mask_list + + def showMask(self, ref): + M = self.getMask(ref) + msk = M["mask"] + ax = plt.gca() + ax.imshow(msk) diff --git a/groundingLMM/dataset/utils/refcoco_refer.py b/groundingLMM/dataset/utils/refcoco_refer.py new file mode 100644 index 0000000000000000000000000000000000000000..3b4cea716e40e73d0b5aa118143eb076392f5eb1 --- /dev/null +++ b/groundingLMM/dataset/utils/refcoco_refer.py @@ -0,0 +1,391 @@ +__author__ = "licheng" + +""" +This interface provides access to four datasets: +1) refclef +2) refcoco +3) refcoco+ +4) refcocog +split by unc and google + +The following API functions are defined: +REFER - REFER api class +getRefIds - get ref ids that satisfy given filter conditions. +getAnnIds - get ann ids that satisfy given filter conditions. +getImgIds - get image ids that satisfy given filter conditions. +getCatIds - get category ids that satisfy given filter conditions. +loadRefs - load refs with the specified ref ids. +loadAnns - load anns with the specified ann ids. 
+loadImgs - load images with the specified image ids. +loadCats - load category names with the specified category ids. +getRefBox - get ref's bounding box [x, y, w, h] given the ref_id +showRef - show image, segmentation or box of the referred object with the ref +getMask - get mask and area of the referred object given ref +showMask - show mask of the referred object given ref +""" + +import itertools +import json +import os.path as osp +import pickle +import sys +import time +from pprint import pprint + +import matplotlib.pyplot as plt +import numpy as np +import skimage.io as io +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon, Rectangle +from pycocotools import mask + + +class REFER: + def __init__(self, data_root, dataset="refcoco", splitBy="unc"): + # provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog + # also provide dataset name and splitBy information + # e.g., dataset = 'refcoco', splitBy = 'unc' + print("loading dataset %s into memory..." % dataset) + self.ROOT_DIR = osp.abspath(osp.dirname(__file__)) + self.DATA_DIR = osp.join(data_root, dataset) + if dataset in ["refcoco", "refcoco+", "refcocog"]: + self.IMAGE_DIR = osp.join(data_root, "images/mscoco/images/train2014") + elif dataset == "refclef": + self.IMAGE_DIR = osp.join(data_root, "images/saiapr_tc-12") + else: + print("No refer dataset is called [%s]" % dataset) + sys.exit() + + self.dataset = dataset + + # load refs from data/dataset/refs(dataset).json + tic = time.time() + + ref_file = osp.join(self.DATA_DIR, "refs(" + splitBy + ").p") + print("ref_file: ", ref_file) + self.data = {} + self.data["dataset"] = dataset + self.data["refs"] = pickle.load(open(ref_file, "rb")) + + # load annotations from data/dataset/instances.json + instances_file = osp.join(self.DATA_DIR, "instances.json") + instances = json.load(open(instances_file, "rb")) + self.data["images"] = instances["images"] + self.data["annotations"] = instances["annotations"] + self.data["categories"] = instances["categories"] + + # create index + self.createIndex() + print("DONE (t=%.2fs)" % (time.time() - tic)) + + def createIndex(self): + # create sets of mapping + # 1) Refs: {ref_id: ref} + # 2) Anns: {ann_id: ann} + # 3) Imgs: {image_id: image} + # 4) Cats: {category_id: category_name} + # 5) Sents: {sent_id: sent} + # 6) imgToRefs: {image_id: refs} + # 7) imgToAnns: {image_id: anns} + # 8) refToAnn: {ref_id: ann} + # 9) annToRef: {ann_id: ref} + # 10) catToRefs: {category_id: refs} + # 11) sentToRef: {sent_id: ref} + # 12) sentToTokens: {sent_id: tokens} + print("creating index...") + # fetch info from instances + Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {} + for ann in self.data["annotations"]: + Anns[ann["id"]] = ann + imgToAnns[ann["image_id"]] = imgToAnns.get(ann["image_id"], []) + [ann] + for img in self.data["images"]: + Imgs[img["id"]] = img + for cat in self.data["categories"]: + Cats[cat["id"]] = cat["name"] + + # fetch info from refs + Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {} + Sents, sentToRef, sentToTokens = {}, {}, {} + for ref in self.data["refs"]: + # ids + ref_id = ref["ref_id"] + ann_id = ref["ann_id"] + category_id = ref["category_id"] + image_id = ref["image_id"] + + # add mapping related to ref + Refs[ref_id] = ref + imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref] + catToRefs[category_id] = catToRefs.get(category_id, []) + [ref] + refToAnn[ref_id] = Anns[ann_id] + annToRef[ann_id] = ref + + # add mapping of sent + for sent in 
ref["sentences"]: + Sents[sent["sent_id"]] = sent + sentToRef[sent["sent_id"]] = ref + sentToTokens[sent["sent_id"]] = sent["tokens"] + + # create class members + self.Refs = Refs + self.Anns = Anns + self.Imgs = Imgs + self.Cats = Cats + self.Sents = Sents + self.imgToRefs = imgToRefs + self.imgToAnns = imgToAnns + self.refToAnn = refToAnn + self.annToRef = annToRef + self.catToRefs = catToRefs + self.sentToRef = sentToRef + self.sentToTokens = sentToTokens + print("index created.") + + def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=""): + image_ids = image_ids if type(image_ids) == list else [image_ids] + cat_ids = cat_ids if type(cat_ids) == list else [cat_ids] + ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] + + if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0: + refs = self.data["refs"] + else: + if not len(image_ids) == 0: + refs = [self.imgToRefs[image_id] for image_id in image_ids] + else: + refs = self.data["refs"] + if not len(cat_ids) == 0: + refs = [ref for ref in refs if ref["category_id"] in cat_ids] + if not len(ref_ids) == 0: + refs = [ref for ref in refs if ref["ref_id"] in ref_ids] + if not len(split) == 0: + if split in ["testA", "testB", "testC"]: + refs = [ + ref for ref in refs if split[-1] in ref["split"] + ] # we also consider testAB, testBC, ... + elif split in ["testAB", "testBC", "testAC"]: + refs = [ + ref for ref in refs if ref["split"] == split + ] # rarely used I guess... + elif split == "test": + refs = [ref for ref in refs if "test" in ref["split"]] + elif split == "train" or split == "val": + refs = [ref for ref in refs if ref["split"] == split] + else: + print("No such split [%s]" % split) + sys.exit() + ref_ids = [ref["ref_id"] for ref in refs] + return ref_ids + + def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]): + image_ids = image_ids if type(image_ids) == list else [image_ids] + cat_ids = cat_ids if type(cat_ids) == list else [cat_ids] + ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] + + if len(image_ids) == len(cat_ids) == len(ref_ids) == 0: + ann_ids = [ann["id"] for ann in self.data["annotations"]] + else: + if not len(image_ids) == 0: + lists = [ + self.imgToAnns[image_id] + for image_id in image_ids + if image_id in self.imgToAnns + ] # list of [anns] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.data["annotations"] + if not len(cat_ids) == 0: + anns = [ann for ann in anns if ann["category_id"] in cat_ids] + ann_ids = [ann["id"] for ann in anns] + if not len(ref_ids) == 0: + ids = set(ann_ids).intersection( + set([self.Refs[ref_id]["ann_id"] for ref_id in ref_ids]) + ) + return ann_ids + + def getImgIds(self, ref_ids=[]): + ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] + + if not len(ref_ids) == 0: + image_ids = list(set([self.Refs[ref_id]["image_id"] for ref_id in ref_ids])) + else: + image_ids = self.Imgs.keys() + return image_ids + + def getCatIds(self): + return self.Cats.keys() + + def loadRefs(self, ref_ids=[]): + if type(ref_ids) == list: + return [self.Refs[ref_id] for ref_id in ref_ids] + elif type(ref_ids) == int: + return [self.Refs[ref_ids]] + + def loadAnns(self, ann_ids=[]): + if type(ann_ids) == list: + return [self.Anns[ann_id] for ann_id in ann_ids] + elif type(ann_ids) == int or type(ann_ids) == unicode: + return [self.Anns[ann_ids]] + + def loadImgs(self, image_ids=[]): + if type(image_ids) == list: + return [self.Imgs[image_id] for image_id in image_ids] + elif type(image_ids) == int: + return [self.Imgs[image_ids]] + + 
def loadCats(self, cat_ids=[]): + if type(cat_ids) == list: + return [self.Cats[cat_id] for cat_id in cat_ids] + elif type(cat_ids) == int: + return [self.Cats[cat_ids]] + + def getRefBox(self, ref_id): + ref = self.Refs[ref_id] + ann = self.refToAnn[ref_id] + return ann["bbox"] # [x, y, w, h] + + def showRef(self, ref, seg_box="seg"): + ax = plt.gca() + # show image + image = self.Imgs[ref["image_id"]] + I = io.imread(osp.join(self.IMAGE_DIR, image["file_name"])) + ax.imshow(I) + # show refer expression + for sid, sent in enumerate(ref["sentences"]): + print("%s. %s" % (sid + 1, sent["sent"])) + # show segmentations + if seg_box == "seg": + ann_id = ref["ann_id"] + ann = self.Anns[ann_id] + polygons = [] + color = [] + c = "none" + if type(ann["segmentation"][0]) == list: + # polygon used for refcoco* + for seg in ann["segmentation"]: + poly = np.array(seg).reshape((len(seg) // 2, 2)) # integer division for Python 3 + polygons.append(Polygon(poly, True, alpha=0.4)) + color.append(c) + p = PatchCollection( + polygons, + facecolors=color, + edgecolors=(1, 1, 0, 0), + linewidths=3, + alpha=1, + ) + ax.add_collection(p) # thick yellow polygon + p = PatchCollection( + polygons, + facecolors=color, + edgecolors=(1, 0, 0, 0), + linewidths=1, + alpha=1, + ) + ax.add_collection(p) # thin red polygon + else: + # mask used for refclef + rle = ann["segmentation"] + m = mask.decode(rle) + img = np.ones((m.shape[0], m.shape[1], 3)) + color_mask = np.array([2.0, 166.0, 101.0]) / 255 + for i in range(3): + img[:, :, i] = color_mask[i] + ax.imshow(np.dstack((img, m * 0.5))) + # show bounding-box + elif seg_box == "box": + ann_id = ref["ann_id"] + ann = self.Anns[ann_id] + bbox = self.getRefBox(ref["ref_id"]) + box_plot = Rectangle( + (bbox[0], bbox[1]), + bbox[2], + bbox[3], + fill=False, + edgecolor="green", + linewidth=3, + ) + ax.add_patch(box_plot) + + def getMask(self, ref): + # return mask, area and mask-center + ann = self.refToAnn[ref["ref_id"]] + image = self.Imgs[ref["image_id"]] + if type(ann["segmentation"][0]) == list: # polygon + rle = mask.frPyObjects(ann["segmentation"], image["height"], image["width"]) + else: + rle = ann["segmentation"] + m = mask.decode(rle) + m = np.sum( + m, axis=2 + ) # sometimes there are multiple binary map (corresponding to multiple segs) + m = m.astype(np.uint8) # convert to np.uint8 + # compute area + area = sum(mask.area(rle)) # should be close to ann['area'] + return {"mask": m, "area": area} + # # position + # position_x = np.mean(np.where(m==1)[1]) # [1] means columns (matlab style) -> x (c style) + # position_y = np.mean(np.where(m==1)[0]) # [0] means rows (matlab style) -> y (c style) + # # mass position (if there were multiple regions, we use the largest one.)
+ # label_m = label(m, connectivity=m.ndim) + # regions = regionprops(label_m) + # if len(regions) > 0: + # largest_id = np.argmax(np.array([props.filled_area for props in regions])) + # largest_props = regions[largest_id] + # mass_y, mass_x = largest_props.centroid + # else: + # mass_x, mass_y = position_x, position_y + # # if centroid is not in mask, we find the closest point to it from mask + # if m[mass_y, mass_x] != 1: + # print('Finding closes mask point ...') + # kernel = np.ones((10, 10),np.uint8) + # me = cv2.erode(m, kernel, iterations = 1) + # points = zip(np.where(me == 1)[0].tolist(), np.where(me == 1)[1].tolist()) # row, col style + # points = np.array(points) + # dist = np.sum((points - (mass_y, mass_x))**2, axis=1) + # id = np.argsort(dist)[0] + # mass_y, mass_x = points[id] + # # return + # return {'mask': m, 'area': area, 'position_x': position_x, 'position_y': position_y, 'mass_x': mass_x, 'mass_y': mass_y} + # # show image and mask + # I = io.imread(osp.join(self.IMAGE_DIR, image['file_name'])) + # plt.figure() + # plt.imshow(I) + # ax = plt.gca() + # img = np.ones( (m.shape[0], m.shape[1], 3) ) + # color_mask = np.array([2.0,166.0,101.0])/255 + # for i in range(3): + # img[:,:,i] = color_mask[i] + # ax.imshow(np.dstack( (img, m*0.5) )) + # plt.show() + + def showMask(self, ref): + M = self.getMask(ref) + msk = M["mask"] + ax = plt.gca() + ax.imshow(msk) + + +if __name__ == "__main__": + refer = REFER(dataset="refcocog", splitBy="google") + ref_ids = refer.getRefIds() + print(len(ref_ids)) + + print(len(refer.Imgs)) + print(len(refer.imgToRefs)) + + ref_ids = refer.getRefIds(split="train") + print("There are %s training referred objects." % len(ref_ids)) + + for ref_id in ref_ids: + ref = refer.loadRefs(ref_id)[0] + if len(ref["sentences"]) < 2: + continue + + pprint(ref) + print("The label is %s." 
% refer.Cats[ref["category_id"]]) + plt.figure() + refer.showRef(ref, seg_box="box") + plt.show() + + # plt.figure() + # refer.showMask(ref) + # plt.show() diff --git a/groundingLMM/dataset/utils/utils.py b/groundingLMM/dataset/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f8b23ca90ec2986761bf8b5317c3247588e55c --- /dev/null +++ b/groundingLMM/dataset/utils/utils.py @@ -0,0 +1,115 @@ + +CAPTION_QUESTIONS = [ + 'Could you please give me a detailed description of the image?', + 'Can you provide a thorough description of the this image?', + 'Please provide a thorough description of the this image', + 'Please provide a thorough description of the this image.', + 'Please describe in detail the contents of the image.', + 'Please describe in detail the contents of the image', + 'Could you give a comprehensive explanation of what can be found within this picture?', + 'Could you give me an elaborate explanation of this picture?', + 'Could you provide me with a detailed analysis of this photo?', + 'Could you please give me a detailed description of the image?', + 'Can you provide a thorough description of the this image?', + 'Please describe in detail the contents of the image', + 'Please describe in detail the contents of the image.', + 'Can you give a comprehensive explanation of this photo', + 'Please provide an elaborate explanation of this picture.', + 'Please provide an elaborate explanation of this picture', + 'Could you provide me with a detailed analysis of this photo', +] + +REGION_QUESTIONS = [ + 'Can you provide me with a detailed description of the region in the picture marked by ?', + "I'm curious about the region represented by in the picture. Could you describe it in detail?", + 'What can you tell me about the region indicated by in the image?', + "I'd like to know more about the area in the photo labeled . Can you give me a detailed description?", + 'Could you describe the region shown as in the picture in great detail?', + 'What details can you give me about the region outlined by in the photo?', + 'Please provide me with a comprehensive description of the region marked with in the image.', + 'Can you give me a detailed account of the region labeled as in the picture?', + "I'm interested in learning more about the region represented by in the photo. Can you describe it in detail?", + 'What is the region outlined by in the picture like? Could you give me a detailed description?', + 'Can you provide me with a detailed description of the region in the picture marked by , please?', + "I'm curious about the region represented by in the picture. Could you describe it in detail, please?", + 'What can you tell me about the region indicated by in the image, exactly?', + "I'd like to know more about the area in the photo labeled , please. Can you give me a detailed description?", + 'Could you describe the region shown as in the picture in great detail, please?', + 'What details can you give me about the region outlined by in the photo, please?', + 'Please provide me with a comprehensive description of the region marked with in the image, please.', + 'Can you give me a detailed account of the region labeled as in the picture, please?', + "I'm interested in learning more about the region represented by in the photo. Can you describe it in detail, please?", + 'What is the region outlined by in the picture like, please? 
Could you give me a detailed description?', +] + +REGION_GROUP_QUESTIONS = [ + 'Could you please give me a detailed description of these areas ?', + 'Can you provide a thorough description of the regions in this image?', + 'Please describe in detail the contents of the boxed areas .', + 'Could you give a comprehensive explanation of what can be found within in the picture?', + 'Could you give me an elaborate explanation of the regions in this picture?', + 'Can you provide a comprehensive description of the areas identified by in this photo?', + 'Help me understand the specific locations labeled in this picture in detail, please.', + 'What is the detailed information about the areas marked by in this image?', + 'Could you provide me with a detailed analysis of the regions designated in this photo?', + 'What are the specific features of the areas marked in this picture that you can describe in detail?', + 'Could you elaborate on the regions identified by in this image?', + 'What can you tell me about the areas labeled in this picture?', + 'Can you provide a thorough analysis of the specific locations designated in this photo?', + 'I am interested in learning more about the regions marked in this image. Can you provide me with more information?', + 'Could you please provide a detailed description of the areas identified by in this photo?', + 'What is the significance of the regions labeled in this picture?', + 'I would like to know more about the specific locations designated in this image. Can you provide me with more information?', + 'Can you provide a detailed breakdown of the regions marked in this photo?', + 'What specific features can you tell me about the areas identified by in this picture?', + 'Could you please provide a comprehensive explanation of the locations labeled in this image?', + 'Can you provide a detailed account of the regions designated in this photo?', + 'I am curious about the areas marked in this picture. Can you provide me with a detailed analysis?', + 'What important details can you tell me about the specific locations identified by in this image?', + 'Could you please provide a detailed description of the regions labeled in this photo?', + 'What can you tell me about the features of the areas designated in this picture?', + 'Can you provide a comprehensive overview of the regions marked in this image?', + 'I would like to know more about the specific locations identified by in this photo. Can you provide me with more information?', + 'What is the detailed information you have on the areas labeled in this picture?', + 'Could you provide me with a thorough analysis of the regions designated in this image?', + 'Can you provide a detailed explanation of the specific locations marked by in this photo?' +] + +GCG_QUESTIONS = [ + 'Could you please give me a detailed description of the image? Please respond with interleaved segmentation masks for the corresponding parts of the answer.', + 'Can you provide a thorough description of the this image? Please output with interleaved segmentation masks for the corresponding phrases.', + 'Please describe in detail the contents of the image. Please respond with interleaved segmentation masks for the corresponding parts of the answer.', + 'Could you give a comprehensive explanation of what can be found within this picture? Please output with interleaved segmentation masks for the corresponding phrases.', + 'Could you give me an elaborate explanation of this picture? 
Please respond with interleaved segmentation masks for the corresponding phrases.', + 'Could you provide me with a detailed analysis of this photo? Please output with interleaved segmentation masks for the corresponding parts of the answer.', +] + +SEG_QUESTIONS = [ + "Can you segment the {class_name} in this image?", + "Please segment {class_name} in this image.", + "What is {class_name} in this image? Please respond with segmentation mask.", + "What is {class_name} in this image? Please output segmentation mask.", + + "Can you segment the {class_name} in this image", + "Please segment {class_name} in this image", + "What is {class_name} in this image? Please respond with segmentation mask", + "What is {class_name} in this image? Please output segmentation mask", + + "Could you provide a segmentation mask for the {class_name} in this image?", + "Please identify and segment the {class_name} in this image.", + "Where is the {class_name} in this picture? Please respond with a segmentation mask.", + "Can you highlight the {class_name} in this image with a segmentation mask?", + + "Could you provide a segmentation mask for the {class_name} in this image", + "Please identify and segment the {class_name} in this image", + "Where is the {class_name} in this picture? Please respond with a segmentation mask", + "Can you highlight the {class_name} in this image with a segmentation mask", +] + +ANSWER_LIST = [ + "It is [SEG].", + "Sure, [SEG].", + "Sure, it is [SEG].", + "Sure, the segmentation result is [SEG].", + "[SEG].", +] \ No newline at end of file diff --git a/groundingLMM/mmcv/tests/data/config/a.py b/groundingLMM/mmcv/tests/data/config/a.py new file mode 100644 index 0000000000000000000000000000000000000000..2364e1d10b054e99c2e1e5780cf8d0e007d659c2 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/a.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +item1 = [1, 2] +item2 = {'a': 0} +item3 = True +item4 = 'test' diff --git a/groundingLMM/mmcv/tests/data/config/b.json b/groundingLMM/mmcv/tests/data/config/b.json new file mode 100644 index 0000000000000000000000000000000000000000..4bbbd09e8edebb1e8b93b9727a6ad5faab88e71e --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/b.json @@ -0,0 +1,8 @@ +{ + "item1": [1, 2], + "item2": { + "a": 0 + }, + "item3": true, + "item4": "test" +} \ No newline at end of file diff --git a/groundingLMM/mmcv/tests/data/config/base.py b/groundingLMM/mmcv/tests/data/config/base.py new file mode 100644 index 0000000000000000000000000000000000000000..2364e1d10b054e99c2e1e5780cf8d0e007d659c2 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/base.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +item1 = [1, 2] +item2 = {'a': 0} +item3 = True +item4 = 'test' diff --git a/groundingLMM/mmcv/tests/data/config/c.yaml b/groundingLMM/mmcv/tests/data/config/c.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5365b7142fa06524678f3fd2502a97f4080c1d6c --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/c.yaml @@ -0,0 +1,4 @@ +item1: [1, 2] +item2: {'a': 0} +item3: True +item4: 'test' diff --git a/groundingLMM/mmcv/tests/data/config/d.py b/groundingLMM/mmcv/tests/data/config/d.py new file mode 100644 index 0000000000000000000000000000000000000000..19edcf82d0c9a40c007ba6a1eca03153f7056ce0 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/d.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+_base_ = './base.py' +item1 = [2, 3] +item2 = {'a': 1} +item3 = False +item4 = 'test_base' diff --git a/groundingLMM/mmcv/tests/data/config/delete.py b/groundingLMM/mmcv/tests/data/config/delete.py new file mode 100644 index 0000000000000000000000000000000000000000..f8a1eaf64c46d301f47a90d4ac907d1a0362e84e --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/delete.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = './base.py' +item1 = {'a': 0, '_delete_': True} +item2 = {'b': 0} diff --git a/groundingLMM/mmcv/tests/data/config/deprecated.py b/groundingLMM/mmcv/tests/data/config/deprecated.py new file mode 100644 index 0000000000000000000000000000000000000000..791b0f6ad8c41dbe14c4dd373beee1d8613b859a --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/deprecated.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = './expected.py' + +_deprecation_ = dict( + expected='tests/data/config/expected.py', + reference='https://github.com/open-mmlab/mmcv/pull/1275') diff --git a/groundingLMM/mmcv/tests/data/config/deprecated_as_base.py b/groundingLMM/mmcv/tests/data/config/deprecated_as_base.py new file mode 100644 index 0000000000000000000000000000000000000000..406964d102ef0bfe1a6ab7513cee9e32052621cc --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/deprecated_as_base.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = './deprecated.py' \ No newline at end of file diff --git a/groundingLMM/mmcv/tests/data/config/e.py b/groundingLMM/mmcv/tests/data/config/e.py new file mode 100644 index 0000000000000000000000000000000000000000..1340e4bd27198e3d3ef82dbf516f22d8daf236f2 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/e.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = './base.py' +item3 = {'a': 1} diff --git a/groundingLMM/mmcv/tests/data/config/expected.py b/groundingLMM/mmcv/tests/data/config/expected.py new file mode 100644 index 0000000000000000000000000000000000000000..7f6b729171a5b0c6158514bc500390c4ddbbbc76 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/expected.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +item1 = 'expected' diff --git a/groundingLMM/mmcv/tests/data/config/f.py b/groundingLMM/mmcv/tests/data/config/f.py new file mode 100644 index 0000000000000000000000000000000000000000..b6ed109bdeb01c0fede98d01d7f5e308113f7591 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/f.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = './d.py' +item4 = 'test_recursive_bases' diff --git a/groundingLMM/mmcv/tests/data/config/g.py b/groundingLMM/mmcv/tests/data/config/g.py new file mode 100644 index 0000000000000000000000000000000000000000..34d4ebe2f898a01ee8aa11a51f0383040213dc7f --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/g.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +filename = 'reserved.py' diff --git a/groundingLMM/mmcv/tests/data/config/h.py b/groundingLMM/mmcv/tests/data/config/h.py new file mode 100644 index 0000000000000000000000000000000000000000..82594590cf4a73bed123e92ad8c392f3d4723148 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/h.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+item1 = '{{fileBasename}}' +item2 = '{{ fileDirname}}' +item3 = 'abc_{{ fileBasenameNoExtension }}' diff --git a/groundingLMM/mmcv/tests/data/config/i_base.py b/groundingLMM/mmcv/tests/data/config/i_base.py new file mode 100644 index 0000000000000000000000000000000000000000..f31a46a15de9d84191e25e8117d84a50fc967474 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/i_base.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +item1 = [1, 2] +item2 = {'a': 0} +item3 = True +item4 = 'test' +item_cfg = {'b': 1} +item5 = {'cfg': item_cfg} +item6 = {'cfg': item_cfg} diff --git a/groundingLMM/mmcv/tests/data/config/l.py b/groundingLMM/mmcv/tests/data/config/l.py new file mode 100644 index 0000000000000000000000000000000000000000..85736f96e0b75e641d3520d4acfc4a179d3da422 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/l.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = ['./l1.py', './l2.yaml', './l3.json', './l4.py'] +item3 = False +item4 = 'test' diff --git a/groundingLMM/mmcv/tests/data/config/l1.py b/groundingLMM/mmcv/tests/data/config/l1.py new file mode 100644 index 0000000000000000000000000000000000000000..13db1375e71095d4295bde140bceaad9db9e1c31 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/l1.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +item1 = [1, 2] diff --git a/groundingLMM/mmcv/tests/data/config/l3.json b/groundingLMM/mmcv/tests/data/config/l3.json new file mode 100644 index 0000000000000000000000000000000000000000..3251c5d6395974fa788eb27a368b03150eabd72c --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/l3.json @@ -0,0 +1,3 @@ +{ + "item3": true +} diff --git a/groundingLMM/mmcv/tests/data/config/l4.py b/groundingLMM/mmcv/tests/data/config/l4.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7b4365ec3674339d3de106bee06c451d4d09ee --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/l4.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +item5 = dict(a=0, b=1) +item6 = [dict(a=0), dict(b=1)] +item7 = dict(a=[0, 1, 2], b=dict(c=[3.1, 4.2, 5.3])) diff --git a/groundingLMM/mmcv/tests/data/config/m.py b/groundingLMM/mmcv/tests/data/config/m.py new file mode 100644 index 0000000000000000000000000000000000000000..af81ca35ca5086e5288a823f7c60269d8e751e99 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/m.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +_base_ = ['./l1.py', './l2.yaml', './l3.json', 'a.py'] +item3 = False +item4 = 'test' diff --git a/groundingLMM/mmcv/tests/data/config/n.py b/groundingLMM/mmcv/tests/data/config/n.py new file mode 100644 index 0000000000000000000000000000000000000000..fa7aae266e9068f9c85ffdf4d268ddfa01923ebb --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/n.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+test_item1 = [1, 2] +bool_item2 = True +str_item3 = 'test' +dict_item4 = dict( + a={ + 'c/d': 'path/d', + 'f': 's3//f', + 6: '2333', + '2333': 'number' + }, + b={'8': 543}, + c={9: 678}, + d={'a': 0}, + f=dict(a='69')) +dict_item5 = {'x/x': {'a.0': 233}} +dict_list_item6 = {'x/x': [{'a.0': 1., 'b.0': 2.}, {'c/3': 3.}]} diff --git a/groundingLMM/mmcv/tests/data/config/o.json b/groundingLMM/mmcv/tests/data/config/o.json new file mode 100644 index 0000000000000000000000000000000000000000..84c5e3ed33ffb4365385c4af9b83196f9d28008d --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/o.json @@ -0,0 +1,3 @@ +{ + "item1": "{{ fileDirname }}" +} diff --git a/groundingLMM/mmcv/tests/data/config/p.yaml b/groundingLMM/mmcv/tests/data/config/p.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b3e46e81a0b44a8c029e034d7008fa68fdf1c7f --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/p.yaml @@ -0,0 +1 @@ +item1: '{{ fileDirname }}' diff --git a/groundingLMM/mmcv/tests/data/config/q.py b/groundingLMM/mmcv/tests/data/config/q.py new file mode 100644 index 0000000000000000000000000000000000000000..f7ca0a70bb381f5b7249fec97b5ab8630f5dd57c --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/q.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +custom_imports = dict(imports=['r'], allow_failed_imports=False) diff --git a/groundingLMM/mmcv/tests/data/config/t.json b/groundingLMM/mmcv/tests/data/config/t.json new file mode 100644 index 0000000000000000000000000000000000000000..8f7b9b4a171e1fc617872c01026cebd09e6bea22 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/t.json @@ -0,0 +1,13 @@ +{ + "_base_": [ + "./l1.py", + "./l2.yaml", + "./l3.json", + "./l4.py" + ], + "item3": false, + "item4": "test", + "item8": "{{fileBasename}}", + "item9": {{ _base_.item2 }}, + "item10": {{ _base_.item7.b.c }} +} diff --git a/groundingLMM/mmcv/tests/data/config/t.py b/groundingLMM/mmcv/tests/data/config/t.py new file mode 100644 index 0000000000000000000000000000000000000000..1df57cb5ad2343c3791925b00a53a0bfe726c626 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/t.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+_base_ = ['./l1.py', './l2.yaml', './l3.json', './l4.py'] +item3 = False +item4 = 'test' +item8 = '{{fileBasename}}' +item9 = {{ _base_.item2 }} +item10 = {{ _base_.item7.b.c }} diff --git a/groundingLMM/mmcv/tests/data/config/t.yaml b/groundingLMM/mmcv/tests/data/config/t.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab42859ec92af833e33fe757cf1f8ca116662b09 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/t.yaml @@ -0,0 +1,6 @@ +_base_ : ['./l1.py', './l2.yaml', './l3.json', './l4.py'] +item3 : False +item4 : 'test' +item8 : '{{fileBasename}}' +item9 : {{ _base_.item2 }} +item10 : {{ _base_.item7.b.c }} diff --git a/groundingLMM/mmcv/tests/data/config/u.json b/groundingLMM/mmcv/tests/data/config/u.json new file mode 100644 index 0000000000000000000000000000000000000000..f6a01e3c08f383802b4bd87de03760417efffb2d --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/u.json @@ -0,0 +1,26 @@ +{ + "_base_": [ + "./t.py" + ], + "base": "_base_.item8", + "item11": {{ _base_.item8 }}, + "item12": {{ _base_.item9 }}, + "item13": {{ _base_.item10 }}, + "item14": {{ _base_.item1 }}, + "item15": { + "a": { + "b": {{ _base_.item2 }} + }, + "b": [ + {{ _base_.item3 }} + ], + "c": [{{ _base_.item4 }}], + "d": [[ + { + "e": {{ _base_.item5.a }} + } + ], + {{ _base_.item6 }}], + "e": {{ _base_.item1 }} + } +} diff --git a/groundingLMM/mmcv/tests/data/config/u.yaml b/groundingLMM/mmcv/tests/data/config/u.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d201cb926dc948e292b22057c89b8d6734285b72 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/config/u.yaml @@ -0,0 +1,15 @@ +_base_: ["./t.py"] +base: "_base_.item8" +item11: {{ _base_.item8 }} +item12: {{ _base_.item9 }} +item13: {{ _base_.item10 }} +item14: {{ _base_.item1 }} +item15: + a: + b: {{ _base_.item2 }} + b: [{{ _base_.item3 }}] + c: [{{ _base_.item4 }}] + d: + - [e: {{ _base_.item5.a }}] + - {{ _base_.item6 }} + e: {{ _base_.item1 }} diff --git a/groundingLMM/mmcv/tests/data/for_scan/sub/1.json b/groundingLMM/mmcv/tests/data/for_scan/sub/1.json new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/groundingLMM/mmcv/tests/data/for_scan/sub/1.txt b/groundingLMM/mmcv/tests/data/for_scan/sub/1.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/groundingLMM/mmcv/tests/data/model_zoo/deprecated.json b/groundingLMM/mmcv/tests/data/model_zoo/deprecated.json new file mode 100644 index 0000000000000000000000000000000000000000..7c2d3e4584d516a1791b8324e5cdddcbb155c307 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/model_zoo/deprecated.json @@ -0,0 +1,4 @@ +{ + "train_old": "train", + "test_old": "test" +} \ No newline at end of file diff --git a/groundingLMM/mmcv/tests/data/model_zoo/mmcv_home/open_mmlab.json b/groundingLMM/mmcv/tests/data/model_zoo/mmcv_home/open_mmlab.json new file mode 100644 index 0000000000000000000000000000000000000000..c9ee238373053d310ab4ff8213a7c9e56765341d --- /dev/null +++ b/groundingLMM/mmcv/tests/data/model_zoo/mmcv_home/open_mmlab.json @@ -0,0 +1,5 @@ +{ + "test": "test.pth", + "val": "val.pth", + "train_empty": "train.pth" +} \ No newline at end of file diff --git a/groundingLMM/mmcv/tests/data/model_zoo/open_mmlab.json b/groundingLMM/mmcv/tests/data/model_zoo/open_mmlab.json new file mode 100644 index 0000000000000000000000000000000000000000..4f5ada7ac511927fcb916baaf9bb04a8979b4fc4 --- /dev/null +++ 
b/groundingLMM/mmcv/tests/data/model_zoo/open_mmlab.json @@ -0,0 +1,4 @@ +{ + "train": "https://localhost/train.pth", + "test": "https://localhost/test.pth" +} \ No newline at end of file diff --git a/groundingLMM/mmcv/tests/data/scripts/hello.py b/groundingLMM/mmcv/tests/data/scripts/hello.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed1a1e319fa36eb11ed3f0fcd365eb43a382d01 --- /dev/null +++ b/groundingLMM/mmcv/tests/data/scripts/hello.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +#!/usr/bin/env python + +import argparse +import warnings + + +def parse_args(): + parser = argparse.ArgumentParser(description='Say hello.') + parser.add_argument('name', help='To whom.') + + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + print(f'hello {args.name}!') + if args.name == 'agent': + warnings.warn('I have a secret!') + + +if __name__ == '__main__': + main()
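For reference, a minimal usage sketch of the prompt templates added in groundingLMM/dataset/utils/utils.py. This is not part of the diff above; the import path assumes the groundingLMM repository root is on PYTHONPATH, and build_seg_prompt is a hypothetical helper name.

# Sketch only: pairs a random SEG question template with a random [SEG] answer.
import random

from dataset.utils.utils import ANSWER_LIST, SEG_QUESTIONS  # assumed import path


def build_seg_prompt(class_name):
    """Fill a SEG question template and pick a matching [SEG] answer."""
    question = random.choice(SEG_QUESTIONS).format(class_name=class_name)
    answer = random.choice(ANSWER_LIST)
    return question, answer


question, answer = build_seg_prompt("dog")
print(question)  # e.g. "Please segment dog in this image."
print(answer)    # e.g. "It is [SEG]."

Likewise, the mmcv test configs above exercise _base_ inheritance; a minimal sketch, assuming mmcv 1.x (mmcv.Config) and paths relative to the repository root, of how f.py resolves against d.py and base.py:

from mmcv import Config

# f.py sets _base_ = './d.py', and d.py sets _base_ = './base.py', so the merged
# config keeps d.py's overrides plus f.py's own item4.
cfg = Config.fromfile('groundingLMM/mmcv/tests/data/config/f.py')
print(cfg.item1)  # [2, 3] (overridden in d.py)
print(cfg.item4)  # 'test_recursive_bases' (set in f.py)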