tuandunghcmut committed
Commit 89fb2cb · verified · 1 Parent(s): e3320cd

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +1 -0
  2. PaddleMIX/.travis/codestyle/clang_format.hook +4 -0
  3. PaddleMIX/.travis/codestyle/cpplint_pre_commit.hook +27 -0
  4. PaddleMIX/.travis/codestyle/pylint_pre_commit.hook +25 -0
  5. PaddleMIX/applications/Audio2Caption/README.md +49 -0
  6. PaddleMIX/applications/Audio2Img/README.md +106 -0
  7. PaddleMIX/applications/Audio2Img/audio2img_imagebind.py +176 -0
  8. PaddleMIX/applications/Audio2Img/gradio_demo.py +135 -0
  9. PaddleMIX/applications/AudioChat/README.md +36 -0
  10. PaddleMIX/applications/Automatic_label/README.md +19 -0
  11. PaddleMIX/applications/Automatic_label/automatic_label.py +61 -0
  12. PaddleMIX/applications/CVinW/README.md +19 -0
  13. PaddleMIX/applications/CVinW/grounded_sam.py +46 -0
  14. PaddleMIX/applications/Inpainting/README.md +87 -0
  15. PaddleMIX/applications/Inpainting/grounded_sam_chatglm.py +256 -0
  16. PaddleMIX/applications/Inpainting/grounded_sam_inpainting.py +234 -0
  17. PaddleMIX/applications/MusicGeneration/README.md +89 -0
  18. PaddleMIX/applications/VLChat/README.md +44 -0
  19. PaddleMIX/applications/image2image/README.md +92 -0
  20. PaddleMIX/applications/image2text/README.md +66 -0
  21. PaddleMIX/applications/text2image/README.md +27 -0
  22. PaddleMIX/applications/text2video/README.md +23 -0
  23. PaddleMIX/deploy/llava/README.md +83 -0
  24. PaddleMIX/deploy/llava/export_model.py +98 -0
  25. PaddleMIX/deploy/llava/llama_inference_model.py +127 -0
  26. PaddleMIX/deploy/llava/run_static_predict.py +403 -0
  27. PaddleMIX/deploy/llava/utils.py +83 -0
  28. PaddleMIX/deploy/qwen2_vl/README.md +50 -0
  29. PaddleMIX/deploy/qwen2_vl/single_image_infer.py +276 -0
  30. PaddleMIX/deploy/qwen_vl/run_static_predict.py +203 -0
  31. PaddleMIX/deploy/sam/README.md +37 -0
  32. PaddleMIX/deploy/sam/export.py +106 -0
  33. PaddleMIX/deploy/sam/predict.py +374 -0
  34. PaddleMIX/docs/hardware_support/ascend_usage.md +222 -0
  35. PaddleMIX/paddlemix/datasets/__init__.py +37 -0
  36. PaddleMIX/paddlemix/datasets/caption_dataset.py +109 -0
  37. PaddleMIX/paddlemix/datasets/cc_sbu_dataset.py +93 -0
  38. PaddleMIX/paddlemix/datasets/chatml_dataset.py +50 -0
  39. PaddleMIX/paddlemix/datasets/coco_caption.py +17 -0
  40. PaddleMIX/paddlemix/datasets/coco_vqa.py +138 -0
  41. PaddleMIX/paddlemix/datasets/collator.py +362 -0
  42. PaddleMIX/paddlemix/datasets/dataset.py +1169 -0
  43. PaddleMIX/paddlemix/datasets/got_dataset.py +439 -0
  44. PaddleMIX/paddlemix/datasets/internvl_dataset.py +688 -0
  45. PaddleMIX/paddlemix/datasets/laiondata.py +139 -0
  46. PaddleMIX/paddlemix/datasets/mixtoken_dataset.py +131 -0
  47. PaddleMIX/paddlemix/datasets/vg_caption.py +37 -0
  48. PaddleMIX/paddlemix/demo_images/critic_img_seven.png +0 -0
  49. PaddleMIX/paddlemix/external_ops/setup.py +107 -0
  50. PaddleMIX/paddlemix/metrics/clip_zero_shot.py +146 -0
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  DeepSeek-VL2/vg.jpg filter=lfs diff=lfs merge=lfs -text
  Ovis/temp.png filter=lfs diff=lfs merge=lfs -text
+ VLM2Vec/figures/vlm2vec_results.png filter=lfs diff=lfs merge=lfs -text
PaddleMIX/.travis/codestyle/clang_format.hook ADDED
@@ -0,0 +1,4 @@
+ #!/bin/bash
+ set -e
+
+ clang-format $@
PaddleMIX/.travis/codestyle/cpplint_pre_commit.hook ADDED
@@ -0,0 +1,27 @@
+ #!/bin/bash
+
+ TOTAL_ERRORS=0
+ if [[ ! $TRAVIS_BRANCH ]]; then
+     # install cpplint on local machine.
+     if [[ ! $(which cpplint) ]]; then
+         pip install cpplint
+     fi
+     # diff files on local machine.
+     files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}')
+ else
+     # diff files between PR and latest commit on Travis CI.
+     branch_ref=$(git rev-parse "$TRAVIS_BRANCH")
+     head_ref=$(git rev-parse HEAD)
+     files=$(git diff --name-status $branch_ref $head_ref | awk '$1 != "D" {print $2}')
+ fi
+ # The trick to remove deleted files: https://stackoverflow.com/a/2413151
+ for file in $files; do
+     if [[ $file =~ ^(patches/.*) ]]; then
+         continue;
+     else
+         cpplint --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11 $file;
+         TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+     fi
+ done
+
+ exit $TOTAL_ERRORS
PaddleMIX/.travis/codestyle/pylint_pre_commit.hook ADDED
@@ -0,0 +1,25 @@
+ #!/bin/bash
+
+ TOTAL_ERRORS=0
+
+
+ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+ export PYTHONPATH=$DIR:$PYTHONPATH
+
+ readonly VERSION="2.12.0"
+ version=$(pylint --version | grep 'pylint')
+
+ if ! [[ $version == *"$VERSION"* ]]; then
+     pip install pylint==2.12.0
+ fi
+
+ # The trick to remove deleted files: https://stackoverflow.com/a/2413151
+ for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
+     pylint --disable=all --load-plugins=docstring_checker \
+         --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
+     TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+ done
+
+ exit $TOTAL_ERRORS
+ # For now, just warning:
+ # exit 0
PaddleMIX/applications/Audio2Caption/README.md ADDED
@@ -0,0 +1,49 @@
+ ### Audio Captioning (Audio-to-Caption Generation)
+
+
+
+ #### 1. Application introduction
+
+ Enter an audio clip and a prompt for question answering.
+
+ *****
+ - No training is needed.
+ - Integrates the [conformer_u2pp_online_wenetspeech]() and [chatglm]() models.
+
+ ----
+
+ #### 2. Demo
+ *****
+ example:
+
+ <!-- ```python
+ python applications/AudioChat/audiochat.py \
+     --chatglm_question_prompt "please describe this passage." \
+     --input_audio_file "./zh.wav" \
+     --chatglm_model_name_or_path "THUDM/chatglm-6b"
+ ``` -->
+ ```python
+ # audio2caption -- audio-to-caption converter
+
+ from paddlemix.appflow import Appflow
+ import paddle
+ paddle.seed(1024)
+ task = Appflow(app="audio2caption", models=["conformer_u2pp_online_wenetspeech", "THUDM/chatglm-6b"])
+ audio_file = "./zh.wav"
+ prompt = (
+     "描述这段话:{}."
+ )
+ result = task(audio=audio_file, prompt=prompt)['prompt']
+ print(result)
+ # 这段话表达了作者认为跑步最重要的好处之一是身体健康。作者认为,通过跑步,身体得到了良好的锻炼,身体健康得到了改善。作者还强调了跑步对身体健康的重要性,并认为这是最值得投资的运动之一。
+ ```
+
+ <div align="center">
+
+ | Input Audio | Input Prompt | Output ASR | Output Text |
+ | --- | --- | --- | --- |
+ |[zh.wav](https://github.com/luyao-cv/file_download/blob/main/assets/zh.wav) | "描述这段话." |"我认为跑步最重要的就是给我带来了身体健康" |这段话表达了作者认为跑步最重要的好处之一是身体健康。作者认为,通过跑步,身体得到了良好的锻炼,身体健康得到了改善。作者还强调了跑步对身体健康的重要性,并认为这是最值得投资的运动之一。 |
+
+ </div>
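For quick reference, a minimal batching sketch built on the same `Appflow` call as the demo above; the file list and the printed output handling are illustrative assumptions, not part of the committed example.

```python
# Hypothetical batch variant of the audio2caption demo above (assumes local .wav files).
from paddlemix.appflow import Appflow

task = Appflow(app="audio2caption", models=["conformer_u2pp_online_wenetspeech", "THUDM/chatglm-6b"])
prompt = "描述这段话:{}."

audio_files = ["./zh.wav"]  # replace with your own recordings (assumption)
for audio_file in audio_files:
    # the 'prompt' key of the result holds the generated caption, as in the demo above
    caption = task(audio=audio_file, prompt=prompt)["prompt"]
    print(audio_file, "->", caption)
```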
PaddleMIX/applications/Audio2Img/README.md ADDED
@@ -0,0 +1,106 @@
+ ### Audio-to-Image Generation
+
+ #### 1. Application introduction
+
+ *****
+
+ Generate an image from audio (optionally with a prompt or a reference image) using [ImageBind](https://facebookresearch.github.io/ImageBind/paper)'s unified latent space and stable-diffusion-2-1-unclip.
+
+ - No training is needed.
+ - Integration with [ppdiffusers](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers).
+
+ ----
+
+ **Supported Tasks**
+
+ - [Audio To Image](#audio-to-image)
+   - [1. Application Introduction](#1-Application)
+   - [2. Run](#2-Run)
+   - [3. Visualization](#3-Visualization)
+     - [Audio to Image](#audio-to-image-1)
+       - [3.1.1 Instruction](#311-Instruction)
+       - [3.1.2 Result](#312-Result)
+     - [Audio+Text to Image](#audiotext-to-image)
+       - [3.2.1 Instruction](#321-Instruction)
+       - [3.2.2 Result](#322-Result)
+     - [Audio+Image to Image](#audioimage-to-image)
+       - [3.3.1 Instruction](#331-Instruction)
+       - [3.3.2 Result](#332-Result)
+
+ ----
+
+ **Update**
+
+ [2023/8/15]:
+ - [v0.0]: Support fusing audio, text (prompt) and image in the ImageBind latent space.
+
+
+ #### 2. Run
+ *****
+
+ example: generate an image from audio across modalities (image, text and audio) with ImageBind and StableUnCLIPImg2ImgPipeline.
+
+ ```bash
+ cd applications/Audio2Img
+
+ python audio2img_imagebind.py \
+     --model_name_or_path imagebind-1.2b/ \
+     --stable_unclip_model_name_or_path stabilityai/stable-diffusion-2-1-unclip \
+     --input_audio https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/bird_audio.wav
+ ```
+
+ ----
+ #### 3. Visualization
+ ----
+
+ #### Audio to Image
+ #### 3.1.1 Instruction
+
+ ```bash
+ cd applications/Audio2Img
+
+ python audio2img_imagebind.py \
+     --model_name_or_path imagebind-1.2b/ \
+     --stable_unclip_model_name_or_path stabilityai/stable-diffusion-2-1-unclip \
+     --input_audio https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/bird_audio.wav
+ ```
+ #### 3.1.2 Result
+ | Input Audio | Output Image |
+ | --- | --- |
+ |[bird_audio.wav](https://github.com/luyao-cv/file_download/blob/main/assets/bird_audio.wav)| ![audio2img_output_bird](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/audio2img_output_bird.jpg) |
+
+
+ #### Audio+Text to Image
+ #### 3.2.1 Instruction
+ ```bash
+ cd applications/Audio2Img
+
+ python audio2img_imagebind.py \
+     --model_name_or_path imagebind-1.2b/ \
+     --stable_unclip_model_name_or_path stabilityai/stable-diffusion-2-1-unclip \
+     --input_audio https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/bird_audio.wav \
+     --input_text 'A photo.'
+ ```
+ #### 3.2.2 Result
+ | Input Audio | Input Text | Output Image |
+ | --- | --- | --- |
+ |[bird_audio.wav](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/bird_audio.wav) | 'A photo.' | ![audio_text_to_img_output_bird_a_photo](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/audio_text_to_img_output_bird_a_photo.jpg) |
+
+
+ #### Audio+Image to Image
+ #### 3.3.1 Instruction
+ ```bash
+ cd applications/Audio2Img
+
+ python audio2img_imagebind.py \
+     --model_name_or_path imagebind-1.2b/ \
+     --stable_unclip_model_name_or_path stabilityai/stable-diffusion-2-1-unclip \
+     --input_audio https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/wave.wav \
+     --input_image https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/dog_image.jpg
+ ```
+
+ #### 3.3.2 Result
+ | Input Audio | Input Image | Output Image |
+ | --- | --- | --- |
+ |[wave.wav](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/wave.wav) | ![input_dog_image](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/dog_image.jpg) | ![audio_img_to_img_output_wave_dog](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/audio_img_to_img_output_wave_dog.jpg) |
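The full script appears as `audio2img_imagebind.py` later in this diff; the sketch below condenses its core steps as a rough illustration only: ImageBind embeds the audio (and, when given, an image), the embeddings are summed into one conditioning vector, and `StableUnCLIPImg2ImgPipeline` decodes it into an image. The local audio path and output filename are assumptions.

```python
import paddle
from paddlemix import ImageBindModel, ImageBindProcessor
from ppdiffusers import StableUnCLIPImg2ImgPipeline

# assumption: ImageBind weights already downloaded to the default "imagebind-1.2b/" directory
processor = ImageBindProcessor.from_pretrained("imagebind-1.2b/")
model = ImageBindModel.from_pretrained("imagebind-1.2b/")
model.eval()

# embed the audio clip (a local bird_audio.wav is assumed)
encoding = processor(images=None, text="", audios="bird_audio.wav", return_tensors="pd")
with paddle.no_grad():
    embeddings = model({"audio": encoding["audio_values"]})

# fused conditioning vector; a vision embedding would be added here when an input image is supplied
image_proj_embeds = embeddings["audio"]

pipe = StableUnCLIPImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-unclip")
image = pipe(image_embeds=image_proj_embeds, prompt="A photo.").images[0]
image.save("audio2img_output.jpg")
```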
PaddleMIX/applications/Audio2Img/audio2img_imagebind.py ADDED
@@ -0,0 +1,176 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse # noqa: F401
16
+ import os
17
+ import sys # noqa: F401
18
+ from dataclasses import dataclass, field
19
+ from types import SimpleNamespace
20
+
21
+ import numpy as np # noqa: F401
22
+ import paddle
23
+ import requests # noqa: F401
24
+ from paddlenlp.trainer import PdArgumentParser
25
+ from PIL import Image
26
+
27
+ import paddlemix.models.imagebind as ib # noqa: F401
28
+ from paddlemix import ImageBindModel, ImageBindProcessor
29
+ from paddlemix.datasets import * # noqa: F401,F403
30
+ from paddlemix.models import * # noqa: F401,F403
31
+ from paddlemix.models.imagebind.modeling import ImageBindModel # noqa: F811
32
+ from paddlemix.models.imagebind.utils import * # noqa: F401, F403
33
+ from paddlemix.utils.log import logger
34
+ from ppdiffusers import StableUnCLIPImg2ImgPipeline
35
+ from ppdiffusers.utils import load_image
36
+
37
+ # from paddlemix.models.imagebind.utils.resample import *
38
+ # from paddlemix.models.imagebind.utils.paddle_aux import *
39
+
40
+
41
+ ModalityType = SimpleNamespace(
42
+ VISION="vision",
43
+ TEXT="text",
44
+ AUDIO="audio",
45
+ THERMAL="thermal",
46
+ DEPTH="depth",
47
+ IMU="imu",
48
+ )
49
+
50
+
51
+ class Predictor:
52
+ def __init__(self, model_args):
53
+ self.processor = ImageBindProcessor.from_pretrained(model_args.model_name_or_path)
54
+ self.predictor = ImageBindModel.from_pretrained(model_args.model_name_or_path)
55
+ self.predictor.eval()
56
+
57
+ def run(self, inputs):
58
+ with paddle.no_grad():
59
+ embeddings = self.predictor(inputs)
60
+
61
+ return embeddings
62
+
63
+
64
+ def main(model_args, data_args):
65
+
66
+ # build model
67
+ logger.info("imagebind_model: {}".format(model_args.model_name_or_path))
68
+ url = data_args.input_image
69
+ if os.path.isfile(url):
70
+ # read image
71
+ image_pil = Image.open(data_args.input_image).convert("RGB")
72
+ elif url:
73
+ image_pil = load_image(url)
74
+ else:
75
+ image_pil = None
76
+
77
+ url = data_args.input_audio
78
+ if os.path.isfile(url):
79
+ # read image
80
+ input_audio = data_args.input_audio
81
+ elif url:
82
+ os.system("wget {}".format(url))
83
+ input_audio = os.path.basename(data_args.input_audio)
84
+ else:
85
+ input_audio = None
86
+
87
+ predictor = Predictor(model_args)
88
+
89
+ encoding = predictor.processor(images=image_pil, text="", audios=input_audio, return_tensors="pd")
90
+ inputs = {}
91
+
92
+ if image_pil:
93
+ image_processor = encoding["pixel_values"]
94
+ inputs.update({ModalityType.VISION: image_processor})
95
+ if data_args.input_audio:
96
+ audio_processor = encoding["audio_values"]
97
+ inputs.update({ModalityType.AUDIO: audio_processor})
98
+
99
+ embeddings = predictor.run(inputs)
100
+ image_proj_embeds = embeddings[ModalityType.AUDIO]
101
+
102
+ if image_pil:
103
+ logger.info("Generate vision embedding: {}".format(embeddings[ModalityType.VISION]))
104
+ image_proj_embeds += embeddings[ModalityType.VISION]
105
+
106
+ if data_args.input_audio:
107
+ logger.info("Generate audio embedding: {}".format(embeddings[ModalityType.AUDIO]))
108
+
109
+ prompt = data_args.input_text
110
+
111
+ pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(model_args.stable_unclip_model_name_or_path)
112
+ pipe.set_progress_bar_config(disable=None)
113
+
114
+ output = pipe(image_embeds=image_proj_embeds, prompt=prompt)
115
+ os.makedirs(model_args.output_dir, exist_ok=True)
116
+
117
+ save_path = os.path.join(model_args.output_dir, "audio2img_imagebind_output.jpg")
118
+ logger.info("Generate image to: {}".format(save_path))
119
+ output.images[0].save(save_path)
120
+
121
+
122
+ @dataclass
123
+ class DataArguments:
124
+ """
125
+ Arguments pertaining to what data we are going to input our model for training and eval.
126
+ Using `PdArgumentParser` we can turn this class
127
+ into argparse arguments to be able to specify them on
128
+ the command line.
129
+ """
130
+
131
+ input_text: str = field(default="", metadata={"help": "The name of prompt input."})
132
+ input_image: str = field(
133
+ default="",
134
+ # wget https://github.com/facebookresearch/ImageBind/blob/main/.assets/bird_image.jpg
135
+ metadata={"help": "The name of image input."},
136
+ )
137
+ input_audio: str = field(
138
+ default="",
139
+ # wget https://github.com/facebookresearch/ImageBind/blob/main/.assets/bird_audio.wav
140
+ metadata={"help": "The name of audio input."},
141
+ )
142
+
143
+
144
+ @dataclass
145
+ class ModelArguments:
146
+ """
147
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
148
+ """
149
+
150
+ model_name_or_path: str = field(
151
+ default="imagebind-1.2b/",
152
+ metadata={"help": "Path to pretrained model or model identifier"},
153
+ )
154
+
155
+ stable_unclip_model_name_or_path: str = field(
156
+ default="stabilityai/stable-diffusion-2-1-unclip",
157
+ metadata={"help": "Path to pretrained model or model identifier in stable_unclip_model_name_or_path"},
158
+ )
159
+
160
+ output_dir: str = field(default="vis_audio2img", metadata={"help": "The name of imagebind audio input."})
161
+
162
+ device: str = field(
163
+ default="GPU",
164
+ metadata={"help": "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."},
165
+ )
166
+
167
+
168
+ if __name__ == "__main__":
169
+
170
+ parser = PdArgumentParser((ModelArguments, DataArguments))
171
+ model_args, data_args = parser.parse_args_into_dataclasses()
172
+
173
+ model_args.device = model_args.device.upper()
174
+ assert model_args.device in ["CPU", "GPU", "XPU", "NPU"], "device should be CPU, GPU, XPU or NPU"
175
+
176
+ main(model_args, data_args)
PaddleMIX/applications/Audio2Img/gradio_demo.py ADDED
@@ -0,0 +1,135 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ from types import SimpleNamespace
17
+
18
+ import gradio as gr
19
+ import paddle
20
+
21
+ from paddlemix import ImageBindModel, ImageBindProcessor
22
+ from paddlemix.utils.log import logger
23
+ from ppdiffusers import StableUnCLIPImg2ImgPipeline
24
+
25
+ ModalityType = SimpleNamespace(
26
+ VISION="vision",
27
+ TEXT="text",
28
+ AUDIO="audio",
29
+ THERMAL="thermal",
30
+ DEPTH="depth",
31
+ IMU="imu",
32
+ )
33
+
34
+
35
+ class Predictor:
36
+ def __init__(self, model_args):
37
+ self.processor = ImageBindProcessor.from_pretrained(model_args.model_name_or_path)
38
+ self.predictor = ImageBindModel.from_pretrained(model_args.model_name_or_path)
39
+ self.predictor.eval()
40
+
41
+ def run(self, inputs):
42
+ with paddle.no_grad():
43
+ embeddings = self.predictor(inputs)
44
+ return embeddings
45
+
46
+
47
+ def model_init(model_args):
48
+ predictor = Predictor(model_args)
49
+ return predictor
50
+
51
+
52
+ def infer(input_image, input_audio, input_text):
53
+
54
+ global predictor
55
+ image_pil = input_image
56
+
57
+ encoding = predictor.processor(images=image_pil, text="", audios=input_audio, return_tensors="pd")
58
+ inputs = {}
59
+
60
+ if image_pil is not None:
61
+ image_processor = encoding["pixel_values"]
62
+ inputs.update({ModalityType.VISION: image_processor})
63
+
64
+ if input_audio is not None:
65
+ audio_processor = encoding["audio_values"]
66
+ inputs.update({ModalityType.AUDIO: audio_processor})
67
+ else:
68
+ pass
69
+
70
+ embeddings = predictor.run(inputs)
71
+ image_proj_embeds = embeddings[ModalityType.AUDIO]
72
+
73
+ if image_pil is not None:
74
+ logger.info("Generate vision embedding: {}".format(embeddings[ModalityType.VISION]))
75
+ image_proj_embeds += embeddings[ModalityType.VISION]
76
+
77
+ logger.info("Generate audio embedding: {}".format(embeddings[ModalityType.AUDIO]))
78
+
79
+ if input_text is not None:
80
+ prompt = input_text
81
+ else:
82
+ prompt = ""
83
+
84
+ pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(model_args.stable_unclip_model_name_or_path)
85
+ pipe.set_progress_bar_config(disable=None)
86
+ output = pipe(image_embeds=image_proj_embeds, prompt=prompt)
87
+
88
+ return output.images[0]
89
+
90
+
91
+ def parse_arguments():
92
+ parser = argparse.ArgumentParser()
93
+ parser.add_argument(
94
+ "--model_name_or_path",
95
+ type=str,
96
+ default="imagebind-1.2b/",
97
+ help="Path to pretrained model or model identifier",
98
+ )
99
+ parser.add_argument(
100
+ "--stable_unclip_model_name_or_path",
101
+ type=str,
102
+ default="stabilityai/stable-diffusion-2-1-unclip",
103
+ help="Path to pretrained model or model identifier in stable_unclip_model_name_or_path",
104
+ )
105
+ parser.add_argument(
106
+ "--device",
107
+ type=str,
108
+ default="GPU",
109
+ choices=["CPU", "GPU", "XPU"],
110
+ help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU.",
111
+ )
112
+ return parser.parse_args()
113
+
114
+
115
+ with gr.Blocks() as demo:
116
+ gr.Markdown("音频生成图像(Audio-to-Image Generation)")
117
+ with gr.Row():
118
+ with gr.Column():
119
+ input_audio = gr.Audio(label="input audio", type="filepath")
120
+ with gr.Tab(label="input text(可选)") as txttab:
121
+ input_text = gr.Textbox(label="input text")
122
+ with gr.Tab(label="input image(可选)") as imgtab:
123
+ input_image = gr.Image(label="input image")
124
+ infer_button = gr.Button("推理")
125
+ output_image = gr.Image(label="result")
126
+ txttab.select(fn=lambda: None, outputs=input_image)
127
+ imgtab.select(fn=lambda: None, outputs=input_text)
128
+ infer_button.click(fn=infer, inputs=[input_image, input_audio, input_text], outputs=[output_image])
129
+ if __name__ == "__main__":
130
+
131
+ model_args = parse_arguments()
132
+ assert model_args.device in ["CPU", "GPU", "XPU", "NPU"], "device should be CPU, GPU, XPU or NPU"
133
+ predictor = model_init(model_args)
134
+
135
+ demo.launch()
PaddleMIX/applications/AudioChat/README.md ADDED
@@ -0,0 +1,36 @@
+ ### Audio Chat (Audio-to-Chat Generation)
+
+ #### 1. Application introduction
+
+ Enter an audio clip and a prompt for question answering.
+
+ *****
+ - No training is needed.
+ - Integrates the [conformer_u2pp_online_wenetspeech](), [chatglm]() and [fastspeech2]() models.
+
+ ----
+
+ #### 2. Demo
+ *****
+ example:
+
+ ```python
+ # audio_chat
+ from paddlemix.appflow import Appflow
+ import paddle
+ paddle.seed(1024)
+ task = Appflow(app="audio_chat", models=["conformer_u2pp_online_wenetspeech", "THUDM/chatglm-6b", "speech"])
+ audio_file = "./zh.wav"
+ prompt = (
+     "描述这段话:{}."
+ )
+ output_path = "tmp.wav"
+ result = task(audio=audio_file, prompt=prompt, output=output_path)
+
+ # 这段话表达了作者认为跑步最重要的好处之一是身体健康。作者认为,通过跑步,身体得到了良好的锻炼,身体健康得到了改善。作者还强调了跑步对身体健康的重要性,并认为这是最值得投资的运动之一。
+ ```
+
+ | Input Audio | Input Prompt | Output Text | Output Audio |
+ | --- | --- | --- | --- |
+ |[zh.wav](https://github.com/luyao-cv/file_download/blob/main/assets/zh.wav) | "描述这段话." |"这段话表达了作者认为跑步最重要的好处之一是身体健康。作者认为,通过跑步,身体得到了良好的锻炼,身体健康得到了改善。作者还强调了跑步对身体健康的重要性,并认为这是最值得投资的运动之一。" |[audiochat-result.wav](https://github.com/luyao-cv/file_download/blob/main/assets/audiochat-result.wav)|
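The demo writes the synthesized reply to `output_path`; a small follow-up check, assuming the demo above has already produced `tmp.wav`:

```python
# Read back the synthesized reply and report its duration (sanity check only).
import scipy.io.wavfile as wavfile

rate, data = wavfile.read("tmp.wav")
print(f"synthesized reply: {len(data) / rate:.2f}s at {rate} Hz")
```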
PaddleMIX/applications/Automatic_label/README.md ADDED
@@ -0,0 +1,19 @@
+
+
+ ### Automatic Labeling (AutoLabel)
+
+ `automatic_label` example:
+
+ ```bash
+ python applications/Automatic_label/automatic_label.py
+ ```
+
+ Results
+
+ <div align="center">
+
+ | Input Image | Prompt | Generated Description | Annotated Image |
+ |:----:|:----:|:----:|:----:|
+ |![dog](https://github.com/LokeZhou/PaddleMIX/assets/13300429/badcfbdc-6b5a-40c4-9128-65259b3d1995) |describe the image| of the dog sitting on the bench in the field | ![dog_mask](https://github.com/LokeZhou/PaddleMIX/assets/13300429/6a1bd63e-6253-4354-8828-b4f45301fb30)|
+ |![horse](https://github.com/LokeZhou/PaddleMIX/assets/13300429/2c68bf76-a402-4b7e-992a-20b9d19b017c) |describe the image| of the horse in the field with the mountains in the background |![horse_mask](https://github.com/LokeZhou/PaddleMIX/assets/13300429/f1188dce-457c-4116-9a34-cd95ec459cd6) |
+ </div>
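The script below hard-codes a sample URL; here is a sketch of pointing the same `auto_label` Appflow call at a local photo instead (the local path is an assumption):

```python
from PIL import Image
from paddlemix.appflow import Appflow

task = Appflow(
    app="auto_label",
    models=["paddlemix/blip2-caption-opt2.7b", "GroundingDino/groundingdino-swint-ogc", "Sam/SamVitH-1024"],
)
image_pil = Image.open("my_photo.jpg").convert("RGB")  # hypothetical local image
result = task(image=image_pil, blip2_prompt="describe the image")
print(result["labels"])  # detected phrases, the same labels drawn by automatic_label.py below
```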
PaddleMIX/applications/Automatic_label/automatic_label.py ADDED
@@ -0,0 +1,61 @@
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ from paddlemix.appflow import Appflow
+ from ppdiffusers.utils import load_image
+
+
+ def show_mask(mask, ax, random_color=False):
+     if random_color:
+         color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+     else:
+         color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
+     h, w = mask.shape[-2:]
+     mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
+     ax.imshow(mask_image)
+
+
+ def show_box(box, ax, label):
+     x0, y0 = box[0], box[1]
+     w, h = box[2] - box[0], box[3] - box[1]
+     ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2))
+     ax.text(x0, y0, label)
+
+
+ task = Appflow(
+     app="auto_label",
+     models=["paddlemix/blip2-caption-opt2.7b", "GroundingDino/groundingdino-swint-ogc", "Sam/SamVitH-1024"],
+ )
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
+ image_pil = load_image(url)
+ blip2_prompt = "describe the image"
+ result = task(image=image_pil, blip2_prompt=blip2_prompt)
+
+ plt.figure(figsize=(10, 10))
+ plt.imshow(result["image"])
+ for mask in result["seg_masks"]:
+     show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
+ for box, label in zip(result["boxes"], result["labels"]):
+     show_box(box, plt.gca(), label)
+
+ plt.axis("off")
+ plt.savefig(
+     "mask_pred.jpg",
+     bbox_inches="tight",
+     dpi=300,
+     pad_inches=0.0,
+ )
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ### 开放世界检测分割(Grounded-SAM: Detect and Segment Everything with Text Prompt)
4
+
5
+ `Grounded-SAM` 示例:
6
+
7
+ ```python
8
+ python applications/CVinW/grounded_sam.py
9
+ ```
10
+
11
+ 效果展示
12
+
13
+ <div align="center">
14
+
15
+ | Text prompt | Input Image | Generated Mask |
16
+ |:----:|:----:|:----:|
17
+ | dog | ![overture-creations](https://github.com/LokeZhou/PaddleMIX/assets/13300429/fe13b5f6-e773-41c2-9660-3b2747575fc1) | ![dog](https://github.com/LokeZhou/PaddleMIX/assets/13300429/f472cbd9-7b68-4699-888c-d4ea87fa8256) |
18
+ | horse,grasses,sky | ![horse](https://github.com/LokeZhou/PaddleMIX/assets/13300429/cae06f3c-a0e3-46cb-8231-6e9eae58bc2b) | ![horse_mask](https://github.com/LokeZhou/PaddleMIX/assets/13300429/3e5e14b9-1089-43d5-8775-1fe678f104b1) |
19
+ </div>
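The results table includes a multi-class prompt (`horse,grasses,sky`); a rough sketch of issuing that prompt through the same Appflow call used in `grounded_sam.py` below (the image URL is reused from that script and is only an illustration):

```python
from paddlemix.appflow import Appflow
from ppdiffusers.utils import load_image

task = Appflow(
    app="openset_det_sam", models=["GroundingDino/groundingdino-swint-ogc", "Sam/SamVitH-1024"], static_mode=False
)
image_pil = load_image("https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png")
# comma-separated phrases query several classes in one pass, as in the results table
result = task(image=image_pil, prompt="horse,grasses,sky")
print(len(result["boxes"]), "boxes,", len(result["seg_masks"]), "masks")
```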
PaddleMIX/applications/CVinW/grounded_sam.py ADDED
@@ -0,0 +1,46 @@
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ from paddlemix.appflow import Appflow
+ from ppdiffusers.utils import load_image
+
+
+ def show_mask(mask, ax, random_color=False):
+     if random_color:
+         color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+     else:
+         color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
+     h, w = mask.shape[-2:]
+     mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
+     ax.imshow(mask_image)
+
+
+ task = Appflow(
+     app="openset_det_sam", models=["GroundingDino/groundingdino-swint-ogc", "Sam/SamVitH-1024"], static_mode=False
+ )  # set static_mode=True to enable static-graph inference; dynamic graph is the default
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
+ image_pil = load_image(url)
+ result = task(image=image_pil, prompt="dog")
+
+ plt.figure(figsize=(10, 10))
+ plt.imshow(image_pil)
+ for mask in result["seg_masks"]:
+     show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
+
+
+ plt.axis("off")
+ plt.savefig("dog.jpg", bbox_inches="tight", dpi=300, pad_inches=0.0)
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ### 检测框引导的图像编辑(Det-Guided-Inpainting)
4
+
5
+ `Grounded-SAM-Inpainting` 示例:
6
+
7
+ ```python
8
+ from paddlemix.appflow import Appflow
9
+ from ppdiffusers.utils import load_image
10
+ import paddle
11
+ task = Appflow(app="inpainting",
12
+ models=["GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024","stabilityai/stable-diffusion-2-inpainting"]
13
+ )
14
+ paddle.seed(1024)
15
+ url = "https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg"
16
+ image_pil = load_image(url)
17
+ result = task(image=image_pil,prompt="bus",inpaint_prompt="a yellow van")
18
+ ```
19
+ <div align="center">
20
+
21
+ | Input Image | Det Prompt | Generated Mask | Inpaint Prompt | Inpaint Image |
22
+ |:----:|:----:|:----:|:----:|:----:|
23
+ | ![bus](https://github.com/LokeZhou/PaddleMIX/assets/13300429/95f73037-097e-4712-95be-17d5ca489f11) | bus | ![text_inapinting_seg](https://github.com/LokeZhou/PaddleMIX/assets/13300429/5b68fc15-aebe-4e05-b420-edd6989a66ef)| a yellow van | ![text_inpainting](https://github.com/LokeZhou/PaddleMIX/assets/13300429/451da53c-3b7d-4a9d-8063-01a92eae0768)|
24
+
25
+ </div>
26
+
27
+
28
+ ### 文本检测框引导的图像编辑(ChatAndDet-Guided-Inpainting)
29
+ `Grounded-SAM-chatglm` 示例:
30
+
31
+ ```python
32
+ import paddle
33
+ from paddlemix.appflow import Appflow
34
+ from ppdiffusers.utils import load_image
35
+ task = Appflow(app="inpainting",
36
+ models=["THUDM/chatglm-6b","GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024","stabilityai/stable-diffusion-2-inpainting"]
37
+ )
38
+ paddle.seed(1024)
39
+ url = "https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg"
40
+ image_pil = load_image(url)
41
+ inpaint_prompt = "bus is changed to A school bus parked on the roadside"
42
+ prompt = "Given caption,extract the main object to be replaced and marked it as 'main_object'," \
43
+ + "Extract the remaining part as 'other prompt', " \
44
+ + "Return main_object, other prompt in English" \
45
+ + "Given caption: {}.".format(inpaint_prompt)
46
+ result = task(image=image_pil,prompt=prompt)
47
+ ```
48
+
49
+ 一些效果展示
50
+
51
+ <div align="center">
52
+
53
+ | Input Image | Prompt | Generated Mask | Inpaint Prompt |
54
+ |:----:|:----:|:----:|:----:|
55
+ | ![bus](https://github.com/LokeZhou/PaddleMIX/assets/13300429/95f73037-097e-4712-95be-17d5ca489f11) | bus is changed to A school bus parked on the roadside | ![chat_inpainting_seg](https://github.com/LokeZhou/PaddleMIX/assets/13300429/dedf9943-6ef2-42df-b4ad-b8336208b283)| ![chat_inpainting](https://github.com/LokeZhou/PaddleMIX/assets/13300429/1e3c2cdb-8202-41ee-acc9-b56e6b53005c)|
56
+
57
+ </div>
58
+
59
+ ### 文本引导的图像编辑(Text-Guided Image Inpainting)
60
+
61
+ ```python
62
+ import paddle
63
+ from paddlemix.appflow import Appflow
64
+ from PIL import Image
65
+ from ppdiffusers.utils import load_image
66
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
67
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
68
+
69
+ image = load_image(img_url)
70
+ mask_image = load_image(mask_url)
71
+ paddle.seed(1024)
72
+
73
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
74
+
75
+ app = Appflow(app='inpainting',models=['stabilityai/stable-diffusion-2-inpainting'])
76
+ image = app(inpaint_prompt=prompt,image=image,seg_masks=mask_image)['result']
77
+
78
+ image.save("a_yellow_cat.png")
79
+ ```
80
+
81
+ <div align="center">
82
+
83
+ | Input Image | Inpaint Prompt | Mask | Inpaint Image |
84
+ |:----:|:----:|:----:|:----:|
85
+ | ![overture-creations](https://github.com/LokeZhou/PaddleMIX/assets/13300429/fe13b5f6-e773-41c2-9660-3b2747575fc1) | Face of a yellow cat, high resolution, sitting on a park bench|![overture-creations-mask](https://github.com/LokeZhou/PaddleMIX/assets/13300429/8c3dbb3a-5a32-4c22-b66e-7b82fcd18b77) |![a_yellow_cat](https://github.com/LokeZhou/PaddleMIX/assets/13300429/094ba90a-35c0-4a50-ac1f-6e0ce91ea931) |
86
+
87
+ </div>
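The first two examples do not show how to retrieve the edited image from `result`; below is a sketch, assuming the detection-guided pipeline also returns it under the `'result'` key as the text-guided example does (that key is an assumption, not confirmed by the README).

```python
import paddle
from paddlemix.appflow import Appflow
from ppdiffusers.utils import load_image

task = Appflow(
    app="inpainting",
    models=["GroundingDino/groundingdino-swint-ogc", "Sam/SamVitH-1024", "stabilityai/stable-diffusion-2-inpainting"],
)
paddle.seed(1024)
image_pil = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg")
result = task(image=image_pil, prompt="bus", inpaint_prompt="a yellow van")
result["result"].save("a_yellow_van.png")  # 'result' key borrowed from the text-guided example (assumption)
```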
PaddleMIX/applications/Inpainting/grounded_sam_chatglm.py ADDED
@@ -0,0 +1,256 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from dataclasses import dataclass, field
17
+
18
+ import matplotlib.pyplot as plt
19
+ import numpy as np
20
+ import paddle
21
+ import paddle.nn.functional as F
22
+ import requests
23
+ from paddlenlp import Taskflow
24
+ from paddlenlp.trainer import PdArgumentParser
25
+ from PIL import Image
26
+
27
+ from paddlemix.models.groundingdino.modeling import GroundingDinoModel
28
+ from paddlemix.models.sam.modeling import SamModel
29
+ from paddlemix.processors.groundingdino_processing import GroundingDinoProcessor
30
+ from paddlemix.processors.sam_processing import SamProcessor
31
+ from paddlemix.utils.log import logger
32
+ from ppdiffusers import StableDiffusionInpaintPipeline
33
+
34
+
35
+ def show_mask(mask, ax, random_color=False):
36
+ if random_color:
37
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
38
+ else:
39
+ color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
40
+ h, w = mask.shape[-2:]
41
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
42
+ ax.imshow(mask_image)
43
+
44
+
45
+ def show_box(box, ax, label):
46
+ x0, y0 = box[0], box[1]
47
+ w, h = box[2] - box[0], box[3] - box[1]
48
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2))
49
+ ax.text(x0, y0, label)
50
+
51
+
52
+ @dataclass
53
+ class DataArguments:
54
+ """
55
+ Arguments pertaining to what data we are going to input our model for training and eval.
56
+ Using `PdArgumentParser` we can turn this class
57
+ into argparse arguments to be able to specify them on
58
+ the command line.
59
+ """
60
+
61
+ input_image: str = field(
62
+ metadata={"help": "The name of input image."},
63
+ )
64
+
65
+ prompt: str = field(
66
+ default=None,
67
+ metadata={"help": "The prompt of the image to be inpaint."},
68
+ )
69
+
70
+
71
+ @dataclass
72
+ class ModelArguments:
73
+ """
74
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
75
+ """
76
+
77
+ stable_diffusion_pipeline_name_or_path: str = field(
78
+ default="stabilityai/stable-diffusion-2-inpainting",
79
+ metadata={"help": "Path to pretrained model or model identifier"},
80
+ )
81
+ dino_model_name_or_path: str = field(
82
+ default="GroundingDino/groundingdino-swint-ogc",
83
+ metadata={"help": "Path to pretrained model or model identifier"},
84
+ )
85
+ sam_model_name_or_path: str = field(
86
+ default="Sam/SamVitH-1024",
87
+ metadata={"help": "Path to pretrained model or model identifier"},
88
+ )
89
+ chatglm_model_name_or_path: str = field(
90
+ default="THUDM/chatglm-6b",
91
+ metadata={"help": "Path to pretrained model or model identifier"},
92
+ )
93
+ box_threshold: float = field(
94
+ default=0.3,
95
+ metadata={"help": "box threshold."},
96
+ )
97
+ text_threshold: float = field(
98
+ default=0.25,
99
+ metadata={"help": "text threshold."},
100
+ )
101
+ output_dir: str = field(
102
+ default="inpainting_output",
103
+ metadata={"help": "output directory."},
104
+ )
105
+ visual: bool = field(
106
+ default=True,
107
+ metadata={"help": "save visual image."},
108
+ )
109
+
110
+
111
+ def filter_prompts_with_chatglm(caption, model_name_or_path="THUDM/chatglm-6b"):
112
+ prompt = (
113
+ "Given caption,extract the main object to be replaced and marked it as 'main_object', "
114
+ + "Extract the remaining part as 'other prompt', "
115
+ + "Return main_object, other prompt in English"
116
+ + "Given caption: {}.".format(caption)
117
+ )
118
+
119
+ logger.info("chatglm: {}".format(model_name_or_path))
120
+ textGen = Taskflow("text2text_generation", model=model_name_or_path)
121
+
122
+ reply = textGen(prompt)["result"][0]
123
+
124
+ det_prompt, inpaint_prompt = (
125
+ reply.split("\n")[0].split(":")[-1].strip(),
126
+ reply.split("\n")[-1].split(":")[-1].strip(),
127
+ )
128
+
129
+ return det_prompt, inpaint_prompt
130
+
131
+
132
+ def main():
133
+ parser = PdArgumentParser((ModelArguments, DataArguments))
134
+ model_args, data_args = parser.parse_args_into_dataclasses()
135
+ url = data_args.input_image
136
+
137
+ logger.info("dino_model: {}".format(model_args.dino_model_name_or_path))
138
+ # build dino processor
139
+ dino_processor = GroundingDinoProcessor.from_pretrained(model_args.dino_model_name_or_path)
140
+ # build dino model
141
+ dino_model = GroundingDinoModel.from_pretrained(model_args.dino_model_name_or_path)
142
+ dino_model.eval()
143
+ logger.info("dino_model build finish!")
144
+
145
+ # build sam processor
146
+ sam_processor = SamProcessor.from_pretrained(model_args.sam_model_name_or_path)
147
+ # build model
148
+ logger.info("SamModel: {}".format(model_args.sam_model_name_or_path))
149
+ sam_model = SamModel.from_pretrained(model_args.sam_model_name_or_path, input_type="boxs")
150
+ logger.info("SamModel build finish!")
151
+
152
+ # read image
153
+ if os.path.isfile(url):
154
+ # read image
155
+ image_pil = Image.open(url)
156
+ else:
157
+ image_pil = Image.open(requests.get(url, stream=True).raw)
158
+
159
+ det_prompt, inpaint_prompt = filter_prompts_with_chatglm(data_args.prompt, model_args.chatglm_model_name_or_path)
160
+ logger.info("det prompt: {}".format(det_prompt))
161
+ logger.info("inpaint prompt: {}".format(inpaint_prompt))
162
+
163
+ image_pil = image_pil.convert("RGB")
164
+
165
+ # preprocess image text_prompt
166
+ image_tensor, mask, tokenized_out = dino_processor(images=image_pil, text=det_prompt)
167
+
168
+ with paddle.no_grad():
169
+ outputs = dino_model(
170
+ image_tensor,
171
+ mask,
172
+ input_ids=tokenized_out["input_ids"],
173
+ attention_mask=tokenized_out["attention_mask"],
174
+ text_self_attention_masks=tokenized_out["text_self_attention_masks"],
175
+ position_ids=tokenized_out["position_ids"],
176
+ )
177
+
178
+ logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256)
179
+ boxes = outputs["pred_boxes"][0] # (nq, 4)
180
+
181
+ # filter output
182
+ logits_filt = logits.clone()
183
+ boxes_filt = boxes.clone()
184
+ filt_mask = logits_filt.max(axis=1) > model_args.box_threshold
185
+ logits_filt = logits_filt[filt_mask] # num_filt, 256
186
+ boxes_filt = boxes_filt[filt_mask] # num_filt, 4
187
+
188
+ # build pred
189
+ pred_phrases = []
190
+ for logit, box in zip(logits_filt, boxes_filt):
191
+ pred_phrase = dino_processor.decode(logit > model_args.text_threshold)
192
+ pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
193
+
194
+ size = image_pil.size
195
+ pred_dict = {
196
+ "boxes": boxes_filt,
197
+ "size": [size[1], size[0]], # H,W
198
+ "labels": pred_phrases,
199
+ }
200
+ logger.info("dino output{}".format(pred_dict))
201
+
202
+ H, W = size[1], size[0]
203
+ boxes = []
204
+ for box in zip(boxes_filt):
205
+ box = box[0] * paddle.to_tensor([W, H, W, H])
206
+ box[:2] -= box[2:] / 2
207
+ box[2:] += box[:2]
208
+ x0, y0, x1, y1 = box.numpy()
209
+ x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
210
+ boxes.append([x0, y0, x1, y1])
211
+ boxes = np.array(boxes)
212
+ image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None)
213
+ seg_masks = sam_model(img=image_seg, prompt=prompt)
214
+ seg_masks = sam_processor.postprocess_masks(seg_masks)
215
+
216
+ logger.info("Sam finish!")
217
+
218
+ if model_args.visual:
219
+ # make dir
220
+ os.makedirs(model_args.output_dir, exist_ok=True)
221
+ # draw output image
222
+ plt.figure(figsize=(10, 10))
223
+ plt.imshow(image_pil)
224
+ for mask in seg_masks:
225
+ show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
226
+ for box, label in zip(boxes, pred_phrases):
227
+ show_box(box, plt.gca(), label)
228
+
229
+ plt.axis("off")
230
+ plt.savefig(
231
+ os.path.join(model_args.output_dir, "mask_pred.jpg"),
232
+ bbox_inches="tight",
233
+ dpi=300,
234
+ pad_inches=0.0,
235
+ )
236
+
237
+ logger.info("stable diffusion pipeline: {}".format(model_args.stable_diffusion_pipeline_name_or_path))
238
+ pipe = StableDiffusionInpaintPipeline.from_pretrained(model_args.stable_diffusion_pipeline_name_or_path)
239
+ logger.info("stable diffusion pipeline build finish!")
240
+
241
+ merge_mask = paddle.sum(seg_masks, axis=0).unsqueeze(0)
242
+ merge_mask = merge_mask > 0
243
+ mask_pil = Image.fromarray(merge_mask[0][0].cpu().numpy())
244
+
245
+ image_pil = image_pil.resize((512, 512))
246
+ mask_pil = mask_pil.resize((512, 512))
247
+
248
+ image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
249
+ image = image.resize(size)
250
+ image.save(os.path.join(model_args.output_dir, "grounded_sam_chatglm_output.jpg"))
251
+
252
+ logger.info("finish!")
253
+
254
+
255
+ if __name__ == "__main__":
256
+ main()
PaddleMIX/applications/Inpainting/grounded_sam_inpainting.py ADDED
@@ -0,0 +1,234 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from dataclasses import dataclass, field
17
+
18
+ import matplotlib.pyplot as plt
19
+ import numpy as np
20
+ import paddle
21
+ import paddle.nn.functional as F
22
+ import requests
23
+ from paddlenlp.trainer import PdArgumentParser
24
+ from PIL import Image
25
+
26
+ from paddlemix.models.groundingdino.modeling import GroundingDinoModel
27
+ from paddlemix.models.sam.modeling import SamModel
28
+ from paddlemix.processors.groundingdino_processing import GroundingDinoProcessor
29
+ from paddlemix.processors.sam_processing import SamProcessor
30
+ from paddlemix.utils.log import logger
31
+ from ppdiffusers import StableDiffusionInpaintPipeline
32
+
33
+
34
+ def show_mask(mask, ax, random_color=False):
35
+ if random_color:
36
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
37
+ else:
38
+ color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
39
+ h, w = mask.shape[-2:]
40
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
41
+ ax.imshow(mask_image)
42
+
43
+
44
+ def show_box(box, ax, label):
45
+ x0, y0 = box[0], box[1]
46
+ w, h = box[2] - box[0], box[3] - box[1]
47
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2))
48
+ ax.text(x0, y0, label)
49
+
50
+
51
+ @dataclass
52
+ class DataArguments:
53
+ """
54
+ Arguments pertaining to what data we are going to input our model for training and eval.
55
+ Using `PdArgumentParser` we can turn this class
56
+ into argparse arguments to be able to specify them on
57
+ the command line.
58
+ """
59
+
60
+ input_image: str = field(
61
+ metadata={"help": "The name of input image."},
62
+ )
63
+
64
+ det_prompt: str = field(
65
+ default=None,
66
+ metadata={"help": "The prompt of the image to be det."},
67
+ )
68
+
69
+ inpaint_prompt: str = field(
70
+ default=None,
71
+ metadata={"help": "The prompt of the image to be inpaint."},
72
+ )
73
+
74
+
75
+ @dataclass
76
+ class ModelArguments:
77
+ """
78
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
79
+ """
80
+
81
+ stable_diffusion_pipeline_name_or_path: str = field(
82
+ default="stabilityai/stable-diffusion-2-inpainting",
83
+ metadata={"help": "Path to pretrained model or model identifier"},
84
+ )
85
+ dino_model_name_or_path: str = field(
86
+ default="GroundingDino/groundingdino-swint-ogc",
87
+ metadata={"help": "Path to pretrained model or model identifier"},
88
+ )
89
+ sam_model_name_or_path: str = field(
90
+ default="Sam/SamVitH-1024",
91
+ metadata={"help": "Path to pretrained model or model identifier"},
92
+ )
93
+ box_threshold: float = field(
94
+ default=0.3,
95
+ metadata={"help": "box threshold."},
96
+ )
97
+ text_threshold: float = field(
98
+ default=0.25,
99
+ metadata={"help": "text threshold."},
100
+ )
101
+ output_dir: str = field(
102
+ default="inpainting_output",
103
+ metadata={"help": "output directory."},
104
+ )
105
+ visual: bool = field(
106
+ default=True,
107
+ metadata={"help": "save visual image."},
108
+ )
109
+
110
+
111
+ def main():
112
+ parser = PdArgumentParser((ModelArguments, DataArguments))
113
+ model_args, data_args = parser.parse_args_into_dataclasses()
114
+ url = data_args.input_image
115
+
116
+ logger.info("stable diffusion pipeline: {}".format(model_args.stable_diffusion_pipeline_name_or_path))
117
+ pipe = StableDiffusionInpaintPipeline.from_pretrained(model_args.stable_diffusion_pipeline_name_or_path)
118
+ logger.info("stable diffusion pipeline build finish!")
119
+
120
+ logger.info("dino_model: {}".format(model_args.dino_model_name_or_path))
121
+ # build dino processor
122
+ dino_processor = GroundingDinoProcessor.from_pretrained(model_args.dino_model_name_or_path)
123
+ # build dino model
124
+ dino_model = GroundingDinoModel.from_pretrained(model_args.dino_model_name_or_path)
125
+ dino_model.eval()
126
+ logger.info("dino_model build finish!")
127
+
128
+ # build sam processor
129
+ sam_processor = SamProcessor.from_pretrained(model_args.sam_model_name_or_path)
130
+ # build model
131
+ logger.info("SamModel: {}".format(model_args.sam_model_name_or_path))
132
+ sam_model = SamModel.from_pretrained(model_args.sam_model_name_or_path, input_type="boxs")
133
+ logger.info("SamModel build finish!")
134
+
135
+ # read image
136
+ if os.path.isfile(url):
137
+ # read image
138
+ image_pil = Image.open(url)
139
+ else:
140
+ image_pil = Image.open(requests.get(url, stream=True).raw)
141
+
142
+ logger.info("det prompt: {}".format(data_args.det_prompt))
143
+ logger.info("inpaint prompt: {}".format(data_args.inpaint_prompt))
144
+
145
+ image_pil = image_pil.convert("RGB")
146
+
147
+ # preprocess image text_prompt
148
+ image_tensor, mask, tokenized_out = dino_processor(images=image_pil, text=data_args.det_prompt)
149
+
150
+ with paddle.no_grad():
151
+ outputs = dino_model(
152
+ image_tensor,
153
+ mask,
154
+ input_ids=tokenized_out["input_ids"],
155
+ attention_mask=tokenized_out["attention_mask"],
156
+ text_self_attention_masks=tokenized_out["text_self_attention_masks"],
157
+ position_ids=tokenized_out["position_ids"],
158
+ )
159
+
160
+ logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256)
161
+ boxes = outputs["pred_boxes"][0] # (nq, 4)
162
+
163
+ # filter output
164
+ logits_filt = logits.clone()
165
+ boxes_filt = boxes.clone()
166
+ filt_mask = logits_filt.max(axis=1) > model_args.box_threshold
167
+ logits_filt = logits_filt[filt_mask] # num_filt, 256
168
+ boxes_filt = boxes_filt[filt_mask] # num_filt, 4
169
+
170
+ # build pred
171
+ pred_phrases = []
172
+ for logit, box in zip(logits_filt, boxes_filt):
173
+ pred_phrase = dino_processor.decode(logit > model_args.text_threshold)
174
+ pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
175
+
176
+ size = image_pil.size
177
+ pred_dict = {
178
+ "boxes": boxes_filt,
179
+ "size": [size[1], size[0]], # H,W
180
+ "labels": pred_phrases,
181
+ }
182
+ logger.info("dino output{}".format(pred_dict))
183
+
184
+ H, W = size[1], size[0]
185
+ boxes = []
186
+ for box in zip(boxes_filt):
187
+ box = box[0] * paddle.to_tensor([W, H, W, H])
188
+ box[:2] -= box[2:] / 2
189
+ box[2:] += box[:2]
190
+ x0, y0, x1, y1 = box.numpy()
191
+ x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
192
+ boxes.append([x0, y0, x1, y1])
193
+ boxes = np.array(boxes)
194
+ image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None)
195
+ seg_masks = sam_model(img=image_seg, prompt=prompt)
196
+ seg_masks = sam_processor.postprocess_masks(seg_masks)
197
+
198
+ logger.info("Sam finish!")
199
+
200
+ if model_args.visual:
201
+ # make dir
202
+ os.makedirs(model_args.output_dir, exist_ok=True)
203
+ # draw output image
204
+ plt.figure(figsize=(10, 10))
205
+ plt.imshow(image_pil)
206
+ for mask in seg_masks:
207
+ show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
208
+ for box, label in zip(boxes, pred_phrases):
209
+ show_box(box, plt.gca(), label)
210
+
211
+ plt.axis("off")
212
+ plt.savefig(
213
+ os.path.join(model_args.output_dir, "mask_pred.jpg"),
214
+ bbox_inches="tight",
215
+ dpi=300,
216
+ pad_inches=0.0,
217
+ )
218
+
219
+ merge_mask = paddle.sum(seg_masks, axis=0).unsqueeze(0)
220
+ merge_mask = merge_mask > 0
221
+ mask_pil = Image.fromarray(merge_mask[0][0].cpu().numpy())
222
+
223
+ image_pil = image_pil.resize((512, 512))
224
+ mask_pil = mask_pil.resize((512, 512))
225
+
226
+ image = pipe(prompt=data_args.inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
227
+ image = image.resize(size)
228
+ image.save(os.path.join(model_args.output_dir, "grounded_sam_inpainting_output.jpg"))
229
+
230
+ logger.info("finish!")
231
+
232
+
233
+ if __name__ == "__main__":
234
+ main()
PaddleMIX/applications/MusicGeneration/README.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### 音乐生成(Music Generation)
2
+
3
+ #### 1. Application introduction
4
+
5
+ Enter audio and prompt words for question and answer.
6
+
7
+ *****
8
+ - No training is need.
9
+ - Integration with the moedel of [minigpt4](), [chatglm](), [audioldm]().
10
+
11
+ ----
12
+
13
+ #### 2. Demo
14
+ *****
15
+ example:
16
+
17
+
18
+ 使用miniGPT4前,需要下载相应权重进行转换,具体可参考[miniGPT4](../../paddlemix/examples/minigpt4/README.md),在完成权重转换后,根据模型权重文件以及配置文件按下存放:
19
+ ```bash
20
+ --PPMIX_HOME #默认路径 /root/.paddlemix 可通过export PPMIX_HOME 设置
21
+ --models
22
+ --miniGPT4
23
+ --MiniGPT4-7B
24
+ config.json
25
+ model_state.pdparams
26
+ special_tokens_map.json
27
+ image_preprocessor_config.json
28
+ preprocessor_config.json
29
+ tokenizer_config.json
30
+ model_config.json
31
+ sentencepiece.bpe.model
32
+ tokenizer.json
33
+ --MiniGPT4-13B
34
+ ...
35
+ ...
36
+ ...
37
+
38
+ ```
39
+ 完成之后,可使用appflow 一键预测
40
+
41
+ ```python
42
+ #music generation
43
+ from paddlemix.appflow import Appflow
44
+ import paddle
45
+ from PIL import Image
46
+ import scipy
47
+ paddle.seed(1024)
48
+
49
+ # Text to music
50
+ task = Appflow(app="music_generation", models=["cvssp/audioldm"])
51
+ prompt = "A classic cocktail lounge vibe with smooth jazz piano and a cool, relaxed atmosphere."
52
+ negative_prompt = 'low quality, average quality, muffled quality, noise interference, poor and low-grade quality, inaudible quality, low-fidelity quality'
53
+ audio_length_in_s = 5
54
+ num_inference_steps = 20
55
+ output_path = "tmp.wav"
56
+ result = task(prompt=prompt, negative_prompt=negative_prompt, num_inference_steps=num_inference_steps, audio_length_in_s=audio_length_in_s, generator = paddle.Generator().manual_seed(120))['result']
57
+ scipy.io.wavfile.write(output_path, rate=16000, data=result)
58
+
59
+ # image to music
60
+ task1 = Appflow(app="music_generation", models=["miniGPT4/MiniGPT4-7B"])
61
+ negative_prompt = 'low quality, average quality, muffled quality, noise interference, poor and low-grade quality, inaudible quality, low-fidelity quality'
62
+ audio_length_in_s = 5
63
+ num_inference_steps = 20
64
+ output_path = "tmp.wav"
65
+ minigpt4_text = 'describe the image, '
66
+ image_pil = Image.open("dance.png").convert("RGB")
67
+ result = task1(image=image_pil, minigpt4_text=minigpt4_text )['result'].split('#')[0]
68
+ paddle.device.cuda.empty_cache()
69
+ # miniGPT4 output: The image shows a crowded nightclub with people dancing on the dance floor. The lights on the dance floor are green and red, and there are several people on the dance floor. The stage is at the back of the room, and there are several people on stage. The walls of the nightclub are decorated with neon lights and there are several people sitting at tables in the background. The atmosphere is lively and energetic.
70
+
71
+ prompt = "Given the scene description in the following paragraph, please create a musical style sentence that fits the scene. Description:{}.".format(result)
72
+ task2 = Appflow(app="music_generation", models=["THUDM/chatglm-6b", "cvssp/audioldm"])
73
+ result = task2(prompt=prompt, negative_prompt=negative_prompt, num_inference_steps=num_inference_steps, audio_length_in_s=audio_length_in_s, generator = paddle.Generator().manual_seed(120))['result']
74
+ scipy.io.wavfile.write(output_path, rate=16000, data=result)
75
+ # chatglm ouptput: The music is playing, and the crowd is dancing like never before. The lights are bright and the atmosphere is electric, with people swaying to the rhythm of the music and the energy of the night. The dance floor is a sea of movement, with people moving to the music and feeling the rhythm of their feet. The stage is a place of magic, with people on it, performing their best. The neon lights of the nightclub are a testament to the energy and excitement of the night, with people's faces lit up as they perform. And as the music continues to play, the crowd continues to dance, never letting up, until the night is over.
76
+ ```
77
+
78
+
79
+ #### Text to music
80
+ | Input Prompt | Output Music |
81
+ | --- | --- |
82
+ |'A classic cocktail lounge vibe with smooth jazz piano and a cool, relaxed atmosphere.'| [jazz_output.wav](https://github.com/luyao-cv/file_download/blob/main/assets/jazz_output.wav)
83
+
84
+ ---
85
+
86
+ #### image to music
87
+ | Input Image | Output Caption | Output Text | Output Music |
88
+ | --- | --- | --- | --- |
89
+ |![dance.png](https://github.com/luyao-cv/file_download/blob/main/vis_music_generation/dance.png) | 'The image shows a crowded nightclub with people dancing on the dance floor. The lights on the dance floor are green and red, and there are several people on the dance floor. The stage is at the back of the room, and there are several people on stage. The walls of the nightclub are decorated with neon lights and there are several people sitting at tables in the background. The atmosphere is lively and energetic.' | 'The music is playing, and the crowd is dancing like never before. The lights are bright and the atmosphere is electric, with people swaying to the rhythm of the music and the energy of the night. The dance floor is a sea of movement, with people moving to the music and feeling the rhythm of their feet. The stage is a place of magic, with people on it, performing their best. The neon lights of the nightclub are a testament to the energy and excitement of the night, with people's faces lit up as they perform. And as the music continues to play, the crowd continues to dance, never letting up, until the night is over.' | [dance_output.wav](https://github.com/luyao-cv/file_download/blob/main/assets/dance_output.wav)
PaddleMIX/applications/VLChat/README.md ADDED
@@ -0,0 +1,44 @@
1
+ ### 视觉语言对话(Vision-Language-Chat)
2
+
3
+ #### 1. 应用介绍
4
+ 输入图像或文字进行多轮对话,支持 caption、grounding、视觉定位等能力。
5
+
6
+
7
+ #### 2. Demo
8
+
9
+ example:
10
+
11
+ ```python
12
+
13
+ import paddle
14
+ from paddlemix.appflow import Appflow
15
+ from ppdiffusers.utils import load_image
16
+ paddle.seed(1234)
17
+ task = Appflow(app="image2text_generation",
18
+ models=["qwen-vl/qwen-vl-chat-7b"])
19
+ image= "https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg"
20
+ prompt = "这是什么?"
21
+ result = task(image=image,prompt=prompt)
22
+
23
+ print(result["result"])
24
+
25
+ prompt2 = "框出图中公交车的位置"
26
+ result = task(prompt=prompt2)
27
+ print(result["result"])
28
+
29
+ ```
30
+
31
+ 输入图片:<center><img src="https://github.com/LokeZhou/PaddleMIX/assets/13300429/95f73037-097e-4712-95be-17d5ca489f11" /></center>
32
+
33
+ prompt:“这是什么?”
34
+
35
+ 输出:
36
+ ```
37
+ 这是一张红色城市公交车的图片,它正在道路上行驶,穿越城市。该区域似乎是一个住宅区,因为可以在背景中看到一些房屋。除了公交车之外,还有其他车辆,包括一辆汽车和一辆卡车,共同构成了交通场景。此外,图片中还显示了一一个人,他站在路边,可能是在等待公交车或进行其他活动。
38
+ ```
39
+ prompt2:“框出图中公交车的位置”
40
+
41
+ 输出:
42
+ ```
43
+ <ref>公交车</ref><box>(178,280),(803,894)</box>
44
+ ```
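+ 
+ 上面 `<box>` 中的坐标一般是按 0~1000 归一化的网格坐标(Qwen-VL 常见约定,具体请以实际模型输出为准)。下面是一个示意性的解析与可视化片段(非官方示例):
+ 
+ ```python
+ import re
+ from PIL import ImageDraw
+ from ppdiffusers.utils import load_image
+ 
+ response = "<ref>公交车</ref><box>(178,280),(803,894)</box>"
+ img = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg")
+ w, h = img.size
+ draw = ImageDraw.Draw(img)
+ for x1, y1, x2, y2 in re.findall(r"\((\d+),(\d+)\),\((\d+),(\d+)\)", response):
+     # 将 0~1000 归一化坐标映射回像素坐标
+     box = (int(x1) / 1000 * w, int(y1) / 1000 * h, int(x2) / 1000 * w, int(y2) / 1000 * h)
+     draw.rectangle(box, outline="red", width=3)
+ img.save("qwen_vl_grounding_vis.png")
+ ```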
PaddleMIX/applications/image2image/README.md ADDED
@@ -0,0 +1,92 @@
1
+ ### 文本引导的图像放大(Text-Guided Image Upscaling)
2
+
3
+ ```python
4
+ from paddlemix.appflow import Appflow
5
+ from PIL import Image
6
+ from ppdiffusers.utils import load_image
7
+
8
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png"
9
+
10
+ low_res_img = load_image(url).resize((128, 128))
11
+
12
+ prompt = "a white cat"
13
+
14
+ app = Appflow(app='image2image_text_guided_upscaling',models=['stabilityai/stable-diffusion-x4-upscaler'])
15
+ image = app(prompt=prompt,image=low_res_img)['result']
16
+
17
+ image.save("upscaled_white_cat.png")
18
+ ```
19
+
20
+ 效果展示
21
+
22
+ <div align="center">
23
+
24
+ | prompt |image | Generated Image |
25
+ |:----:|:----:|:----:|
26
+ | a white cat| ![low_res_cat](https://github.com/LokeZhou/PaddleMIX/assets/13300429/5cc5f2ee-5709-4722-b5f2-3adabe98cbf2) |![upscaled_white_cat](https://github.com/LokeZhou/PaddleMIX/assets/13300429/f5688dd6-b328-4c3f-a9ab-9575b6ee77b2) |
27
+ </div>
28
+
29
+
30
+
31
+
32
+ ### 文本图像双引导图像生成(Dual Text and Image Guided Generation)
33
+
34
+ ```python
35
+ from paddlemix.appflow import Appflow
36
+ from PIL import Image
37
+ from ppdiffusers.utils import load_image
38
+
39
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
40
+ image = load_image(url)
41
+ prompt = "a red car in the sun"
42
+
43
+
44
+ app = Appflow(app='dual_text_and_image_guided_generation',models=['shi-labs/versatile-diffusion'])
45
+ image = app(prompt=prompt,image=image)['result']
46
+ image.save("versatile-diffusion-red_car.png")
47
+
48
+ ```
49
+
50
+ 效果展示
51
+
52
+ <div align="center">
53
+
54
+ | prompt |image | Generated Image |
55
+ |:----:|:----:|:----:|
56
+ | a red car in the sun | ![benz](https://github.com/LokeZhou/PaddleMIX/assets/13300429/2a71f5fd-3dd3-4f3b-a3cb-fe5282eb728b) | ![versatile-diffusion-red_car](https://github.com/LokeZhou/PaddleMIX/assets/13300429/3904d53e-5412-4896-92d0-43c5770d8b39)|
57
+ </div>
58
+
59
+
60
+
61
+ ### 文本引导的图像变换(Image-to-Image Text-Guided Generation)
62
+
63
+ ```python
64
+ from paddlemix.appflow import Appflow
65
+ from PIL import Image
66
+ from ppdiffusers.utils import load_image
67
+ import paddle
68
+
69
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/image_Kurisu.png"
70
+ image = load_image(url).resize((512, 768))
72
+
73
+ paddle.seed(42)
74
+ prompt = "Kurisu Makise, looking at viewer, long hair, standing, 1girl, hair ornament, hair flower, cute, jacket, white flower, white dress"
75
+ negative_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
76
+
77
+
78
+ app = Appflow(app='image2image_text_guided_generation',models=['admruul/anything-v3.0'])
79
+ image = app(prompt=prompt,negative_prompt=negative_prompt,image=image)['result']
80
+
81
+ image.save("image_Kurisu_img2img.png")
82
+
83
+ ```
84
+
85
+ 效果展示
86
+
87
+ <div align="center">
88
+
89
+ | prompt | negative_prompt |image | Generated Image |
90
+ |:----:|:----:|:----:| :----:|
91
+ | Kurisu Makise, looking at viewer, long hair, standing, 1girl, hair ornament, hair flower, cute, jacket, white flower, white dress | lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry | ![image_Kurisu](https://github.com/LokeZhou/PaddleMIX/assets/13300429/9596c6b9-2dea-4a66-9419-b60332a08cd1)|![image_Kurisu_img2img](https://github.com/LokeZhou/PaddleMIX/assets/13300429/f4fa0efe-bce2-4bea-b6f6-19591af7e423) |
92
+ </div>
PaddleMIX/applications/image2text/README.md ADDED
@@ -0,0 +1,66 @@
1
+
2
+
3
+ ### 图文生成(Image-to-Text Generation)
4
+
5
+ ## miniGPT4
6
+ 使用miniGPT4前,需要下载相应权重进行转换,具体可参考[miniGPT4](../../paddlemix/examples/minigpt4/README.md),在完成权重转换后,将模型权重文件以及配置文件按如下方式存放:
7
+ ```bash
8
+ --PPMIX_HOME #默认路径 /root/.paddlemix 可通过export PPMIX_HOME 设置
9
+ --models
10
+ --miniGPT4
11
+ --MiniGPT4-7B
12
+ config.json
13
+ model_state.pdparams
14
+ special_tokens_map.json
15
+ image_preprocessor_config.json
16
+ preprocessor_config.json
17
+ tokenizer_config.json
18
+ model_config.json
19
+ sentencepiece.bpe.model
20
+ tokenizer.json
21
+ --MiniGPT4-13B
22
+ ...
23
+ ...
24
+ ...
25
+
26
+ ```
27
+ 完成之后,可使用appflow 一键预测
28
+ ```python
29
+ from paddlemix.appflow import Appflow
30
+ import requests
+ from PIL import Image
31
+
32
+ task = Appflow(app="image2text_generation",
33
+ models=["miniGPT4/MiniGPT4-7B"])
34
+ url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png"
35
+ image = Image.open(requests.get(url, stream=True).raw)
36
+ minigpt4_text = "describe the image"
37
+ result = task(image=image,minigpt4_text=minigpt4_text)
38
+ ```
39
+
40
+ 效果展示
41
+
42
+ <div align="center">
43
+
44
+ | Image | text | Generated text|
45
+ |:----:|:----:|:----:|
46
+ |![mugs](https://github.com/LokeZhou/PaddleMIX/assets/13300429/b5a95002-bb30-4683-8e62-ed21879f24e1) | describe the image|The image shows two mugs with cats on them, one is black and white and the other is blue and white. The mugs are sitting on a table with a book in the background. The mugs have a whimsical, cartoon-like appearance. The cats on the mugs are looking at each other with a playful expression. The overall style of the image is cute and fun.###|
47
+ </div>
48
+
49
+ ## blip2
50
+
51
+ ```python
52
+ from paddlemix.appflow import Appflow
53
+ from ppdiffusers.utils import load_image
54
+
55
+ task = Appflow(app="image2text_generation",
56
+ models=["paddlemix/blip2-caption-opt2.7b"])
57
+ url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png"
58
+ image_pil = load_image(url)
59
+ blip2_prompt = 'describe the image'
60
+ result = task(image=image_pil,blip2_prompt=blip2_prompt)
61
+ ```
62
+
63
+ | Image | text | Generated text|
64
+ |:----:|:----:|:----:|
65
+ |![mugs](https://github.com/LokeZhou/PaddleMIX/assets/13300429/b5a95002-bb30-4683-8e62-ed21879f24e1) | describe the image|of the two coffee mugs with cats on them|
66
+ </div>
PaddleMIX/applications/text2image/README.md ADDED
@@ -0,0 +1,27 @@
1
+
2
+
3
+ ### 文图生成(Text-to-Image Generation)
4
+
5
+
6
+ ```python
7
+ import paddle
8
+ from paddlemix.appflow import Appflow
9
+
10
+ paddle.seed(42)
11
+ task = Appflow(app="text2image_generation",
12
+ models=["stabilityai/stable-diffusion-xl-base-1.0"]
13
+ )
14
+ prompt = "a photo of an astronaut riding a horse on mars."
15
+ result = task(prompt=prompt)['result']
16
+ ```
17
+
18
+ 效果展示
19
+
20
+ <div align="center">
21
+
22
+ | model| prompt | Generated Image |
23
+ |:----:|:----:|:----:|
24
+ |stabilityai/stable-diffusion-v1-5| a photo of an astronaut riding a horse on mars | ![astronaut_rides_horse_sd](https://github.com/LokeZhou/PaddleMIX/assets/13300429/1622fb1e-c841-4531-ad39-9c5092a2456c)|
25
+ |stabilityai/stable-diffusion-xl-base-1.0| a photo of an astronaut riding a horse on mars |![sdxl_text2image](https://github.com/LokeZhou/PaddleMIX/assets/13300429/9e339d97-18cd-4cfc-89a6-c545e2872f7e) |
26
+ </div>
27
+
PaddleMIX/applications/text2video/README.md ADDED
@@ -0,0 +1,23 @@
1
+ ### 文本条件的视频生成(Text-to-Video Generation)
2
+
3
+ ```python
4
+ from paddlemix.appflow import Appflow
5
+ import imageio
6
+
7
+
8
+ prompt = "An astronaut riding a horse."
9
+
10
+ app = Appflow(app='text_to_video_generation',models=['damo-vilab/text-to-video-ms-1.7b'])
11
+ video_frames = app(prompt=prompt,num_inference_steps=25)['result']
12
+
13
+ imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.gif", video_frames,duration=8)
14
+
15
+ ```
16
+
17
+ <div align="center">
18
+
19
+ | Prompt | video |
20
+ |:----:|:----:|
21
+ | An astronaut riding a horse.|![text_to_video_generation-synth-result-astronaut_riding_a_horse](https://github.com/LokeZhou/PaddleMIX/assets/13300429/21a21062-4ec3-489a-971b-7daa4305106e) |
22
+
23
+ </div>
PaddleMIX/deploy/llava/README.md ADDED
@@ -0,0 +1,83 @@
1
+ # LLaVA
2
+
3
+ ## 1. 模型介绍
4
+
5
+ [LLaVA](https://arxiv.org/pdf/2310.03744.pdf) 是基于大规模语言模型 llama 的视觉语言模型。支持多个多模态任务,包括零样本图像描述生成(Zero-shot Image Caption)、视觉问答(VQA)、细粒度视觉定位(Referring Expression Comprehension)等任务。
6
+
7
+ 其性能优于其他模型,在多个任务上取得了更好的效果。
8
+
9
+ <p align="center">
10
+ <img src="https://github.com/haotian-liu/LLaVA/blob/main/images/llava_v1_5_radar.jpg" align="middle" width = "600" />
11
+ </p>
12
+
13
+ 注:图片引用自[LLaVA](https://github.com/haotian-liu/LLaVA).
14
+
15
+ 本目录提供paddle版本的llava静态图推理部署示例,推荐使用A100进行推理部署。
16
+
17
+
18
+ ## 2. 安装依赖
19
+
20
+ * `paddlenlp_ops`依赖安装
21
+
22
+ ```bash
23
+ git submodule update --init --recursive
24
+ cd PaddleNLP
25
+ git reset --hard 498f70988431be278dac618411fbfb0287853cd9
26
+ pip install -e .
27
+ cd csrc
28
+ python setup_cuda.py install
29
+ ```
30
+ * 如果在V100上安装报错,可屏蔽 /PaddleNLP/csrc/generation/quant_int8.cu 中的以下语句:
31
+
32
+ ```bash
33
+ # template<>
34
+ # __forceinline__ __device__ __nv_bfloat16 add_mul<__nv_bfloat16>(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
35
+ # return __hmul(__hadd(a, b), c);
36
+ # }
37
+ ```
38
+
39
+ * `fused_ln`需要安装 /PaddleNLP/model_zoo/gpt-3/external_ops 下的自定义OP,`python setup.py install`,示例命令见下。
40
+
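+ 示例安装命令(路径为示意,请根据实际环境调整):
+ 
+ ```bash
+ cd /path/to/PaddleNLP/model_zoo/gpt-3/external_ops
+ python setup.py install
+ ```
+ 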
41
+ ## 3. 示例
42
+
43
+ ### 3.1 转出静态图推理所需的视觉模型和语言模型
44
+
45
+ * 在`PaddleMIX`目录下,执行转换脚本,得到视觉模型部分静态图
46
+
47
+ ```bash
48
+ #!/bin/bash
49
+ export PYTHONPATH=/path/to/PaddleNLP/:/path/to/PaddleMIX
50
+ python deploy/llava/export_model.py \
51
+ --model_name_or_path "paddlemix/llava/llava-v1.5-7b" \
52
+ --save_path "./llava_static" \
53
+ --encode_image \
54
+ --fp16
55
+ ```
56
+
57
+ * 在`PaddleMIX`目录下,执行转换脚本,得到语言模型部分静态图
58
+
59
+ ```bash
60
+ #!/bin/bash
61
+ export PYTHONPATH=/path/to/PaddleNLP/:/path/to/PaddleMIX
62
+ python deploy/llava/export_model.py \
63
+ --model_name_or_path "paddlemix/llava/llava-v1.5-7b" \
64
+ --save_path "./llava_static" \
65
+ --encode_text \
66
+ --fp16
67
+ ```
68
+
69
+
70
+ ### 3.2 静态图推理
71
+
72
+ * 在`PaddleMIX`目录下,运行执行脚本,进行静态图推理
73
+
74
+ ```bash
75
+ #!/bin/bash
76
+
77
+ python deploy/llava/run_static_predict.py --model_name_or_path "paddlemix/llava/llava-v1.5-7b" \
78
+ --image_file "https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg" \
79
+ --first_model_path "llava_static/encode_image/clip" \
80
+ --second_model_path "llava_static/encode_text/llama" \
81
+ --fp16
82
+
83
+ ```
PaddleMIX/deploy/llava/export_model.py ADDED
@@ -0,0 +1,98 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+
17
+ import paddle
18
+
19
+ from llama_inference_model import LlamaForClipInferenceModel
20
+ from paddlemix.auto import AutoConfigMIX, AutoModelMIX
21
+ from paddlemix.utils.log import logger
22
+
23
+
24
+ def export_encode_text(model, config, compute_dtype):
25
+
26
+ # save to static model
27
+ save_path = args.save_path + "/encode_text/llama"
28
+ model.to_static(save_path, config, compute_dtype)
29
+ logger.info(f"static model has been to {save_path}")
30
+
31
+
32
+ def export_encode_image(model, compute_dtype):
33
+ paddle.save(model.llama.image_newline,args.save_path + "/encode_image/clip/image_newline.pdparams")
34
+ # convert to static graph with specific input description
35
+ model = paddle.jit.to_static(
36
+ model.encode_images,
37
+ input_spec=[
38
+ paddle.static.InputSpec(shape=[None,3, 336, 336], dtype=compute_dtype), # images
39
+ ]
40
+ )
41
+
42
+ # save to static model
43
+ save_path = args.save_path + "/encode_image/clip"
44
+ paddle.jit.save(model, save_path)
45
+ logger.info(f"static model has been to {save_path}")
46
+
47
+
48
+ if __name__ == "__main__":
49
+ parser = argparse.ArgumentParser()
50
+ parser.add_argument(
51
+ "--model_name_or_path",
52
+ default="paddlemix/llava/llava-v1.5-7b",
53
+ type=str,
54
+ help="The dir name of llava checkpoint.",
55
+ )
56
+ parser.add_argument(
57
+ "--save_path",
58
+ default="./llava_static",
59
+ type=str,
60
+ help="The saving path of static llava vision.",
61
+ )
62
+ parser.add_argument("--encode_image", action="store_true")
63
+ parser.add_argument("--encode_text", action="store_true")
64
+ parser.add_argument("--fp16", action="store_true")
65
+
66
+ args = parser.parse_args()
67
+
68
+ compute_dtype = "float16" if args.fp16 else "bfloat16"
69
+ if not paddle.amp.is_bfloat16_supported() and compute_dtype == "bfloat16":
70
+ logger.warning("bfloat16 is not supported on your device,change to float32")
71
+ compute_dtype = "float32"
72
+
73
+ if args.encode_image:
74
+
75
+ model = AutoModelMIX.from_pretrained(args.model_name_or_path, dtype=compute_dtype)
76
+ vision_tower = model.get_vision_tower()
77
+ vision_tower.load_model()
78
+ model.eval()
79
+
80
+ export_encode_image(model, compute_dtype)
81
+
82
+ elif args.encode_text:
83
+
84
+ config = AutoConfigMIX.from_pretrained(args.model_name_or_path)
85
+ config.tensor_parallel_degree = 1
86
+ config.tensor_parallel_rank = 0
87
+ config.weight_only_quant_bits = -1
88
+ config.quant_type = None
89
+
90
+ model = LlamaForClipInferenceModel.from_pretrained(args.model_name_or_path, config=config)
91
+
92
+ model.to(dtype=compute_dtype)
93
+ model.eval()
94
+
95
+ export_encode_text(model, config, compute_dtype)
96
+
97
+ else:
98
+ logger.info("please specify the task to export,--encode_image or --encode_text")
PaddleMIX/deploy/llava/llama_inference_model.py ADDED
@@ -0,0 +1,127 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+ from paddlenlp.experimental.transformers import LlamaForCausalLMInferenceModel
17
+
18
+
19
+ class LlamaForClipInferenceModel(LlamaForCausalLMInferenceModel):
20
+ """
21
+ This class is 99% like LlamaForCausalLMInferenceModel.
22
+ Used only for llava's second part.
23
+ """
24
+
25
+ @paddle.no_grad()
26
+ def generate_text_with_image_features(
27
+ self,
28
+ input_ids: paddle.Tensor,
29
+ image_features: paddle.Tensor,
30
+ img_pos: paddle.Tensor,
31
+ attention_mask=None,
32
+ position_ids=None,
33
+ penalty_score=None,
34
+ frequency_score=None,
35
+ presence_score=None,
36
+ min_length=None,
37
+ max_length=None,
38
+ temperature=None,
39
+ top_p=None,
40
+ eos_token_id=None,
41
+ seq_len_encoder=None,
42
+ seq_len_decoder=None,
43
+ step_idx=None,
44
+ stop_flags=None,
45
+ tgt_ids=None,
46
+ tgt_pos=None,
47
+ tgt_generation_mask=None,
48
+ pre_ids=None,
49
+ stop_nums=None,
50
+ cache_kvs=[],
51
+ **generate_kwargs
52
+ ) -> paddle.Tensor:
53
+
54
+ inputs_embeds = self.llama.embed_tokens(input_ids)
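+ # Overwrite the placeholder token embeddings with the pre-computed image features; img_pos holds one [start, end) span per image in each batch item.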
55
+ for batch_idx, pos in enumerate(img_pos):
56
+ for idx, p in enumerate(pos):
57
+ index = paddle.arange(p[0], p[1]).unsqueeze(-1)
58
+ inputs_embeds[batch_idx] = paddle.scatter(inputs_embeds[batch_idx], index, image_features[idx])
59
+
60
+ outputs = self.generate(
61
+ inputs_embeds=inputs_embeds,
62
+ attention_mask=attention_mask,
63
+ position_ids=position_ids,
64
+ penalty_score=penalty_score,
65
+ frequency_score=frequency_score,
66
+ presence_score=presence_score,
67
+ min_length=min_length,
68
+ max_length=max_length,
69
+ temperature=temperature,
70
+ top_p=top_p,
71
+ eos_token_id=eos_token_id,
72
+ seq_len_encoder=seq_len_encoder,
73
+ seq_len_decoder=seq_len_decoder,
74
+ step_idx=step_idx,
75
+ stop_flags=stop_flags,
76
+ tgt_ids=tgt_ids,
77
+ tgt_pos=tgt_pos,
78
+ tgt_generation_mask=tgt_generation_mask,
79
+ pre_ids=pre_ids,
80
+ stop_nums=stop_nums,
81
+ cache_kvs=cache_kvs,
82
+ )
83
+ return outputs
84
+
85
+ def to_static(self, output_path: str, config: dict, compute_dtype: str):
86
+
87
+ cache_kvs_shapes = self.get_cache_kvs_shape(config, max_length=config.get("max_length", None))
88
+
89
+ input_spec = [
90
+ paddle.static.InputSpec(shape=[None, None], dtype="int32", name="inputs_ids"),
91
+ paddle.static.InputSpec(
92
+ shape=[None, None, None], dtype=compute_dtype, name="image_features"
93
+ ), # image_features
94
+ paddle.static.InputSpec(shape=[None, None, 2], dtype="int64", name="img_pos"), # img_pos
95
+ paddle.static.InputSpec(
96
+ shape=[None, None, None, None], dtype="int64", name="attention_mask"
97
+ ), # attention_mask
98
+ paddle.static.InputSpec(shape=[None, None], dtype="int64", name="position_ids"), # position_ids
99
+ paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="penalty_score"), # penalty_score
100
+ paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="frequency_score"), # frequency_score
101
+ paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="presence_score"), # presence_score
102
+ paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="min_length"), # min_decode_length
103
+ paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="max_length"), # max_decode_length
104
+ paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="temperature"), # temperature
105
+ paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="top_p"), # top_p
106
+ paddle.static.InputSpec(shape=[None], dtype="int64", name="eos_token_id"), # eos_token_id
107
+ paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_encoder"), # seq_len_encoder
108
+ paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_decoder"), # seq_len_decoder
109
+ paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="step_idx"), # step_idx
110
+ paddle.static.InputSpec(shape=[None, 1], dtype="bool", name="stop_flags"), # stop_flags
111
+ paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_ids"), # tgt_ids
112
+ paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_pos"), # tgt_pos
113
+ paddle.static.InputSpec(shape=[None, 1, 1, None], name="tgt_generation_mask"), # tgt_generation_mask
114
+ paddle.static.InputSpec(shape=[None, None], dtype="int64", name="pre_ids"), # pre_ids
115
+ paddle.static.InputSpec(shape=[1], dtype="int64", name="stop_nums"), # stop_nums
116
+ [
117
+ paddle.static.InputSpec(
118
+ shape=shape,
119
+ dtype=compute_dtype,
120
+ name="cache_kvs_{}".format(i),
121
+ )
122
+ for i, shape in enumerate(cache_kvs_shapes)
123
+ ], # cache_kvs
124
+ ]
125
+
126
+ model = paddle.jit.to_static(self.generate_text_with_image_features, input_spec=input_spec)
127
+ paddle.jit.save(model, output_path, skip_prune_program=True)
PaddleMIX/deploy/llava/run_static_predict.py ADDED
@@ -0,0 +1,403 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+
18
+ import paddle
19
+ from utils import load_real_time_tokens
20
+
21
+ from paddlemix.auto import AutoConfigMIX, AutoProcessorMIX, AutoTokenizerMIX
22
+ from paddlemix.models.llava.constants import (
23
+ DEFAULT_IM_END_TOKEN,
24
+ DEFAULT_IM_START_TOKEN,
25
+ DEFAULT_IMAGE_TOKEN,
26
+ IMAGE_TOKEN_INDEX,
27
+ )
28
+ from paddlemix.models.llava.conversation import conv_templates
29
+ from paddlemix.models.llava.mm_utils import load_image,get_anyres_image_grid_shape
30
+ from paddlemix.models.llava.llava_arch import unpad_image
31
+ from paddlemix.utils.log import logger
32
+
33
+
34
+ class Predictor(object):
35
+ def __init__(self, args):
36
+
37
+ self.compute_dtype = "float16" if args.fp16 else "bfloat16"
38
+ if not paddle.amp.is_bfloat16_supported() and self.compute_dtype == "bfloat16":
39
+ logger.warning("bfloat16 is not supported on your device,change to float32")
40
+ self.compute_dtype = "float32"
41
+
42
+ self.args = args
43
+ self.config = AutoConfigMIX.from_pretrained(args.model_name_or_path)
44
+ self.clip_config = AutoConfigMIX.from_pretrained(self.config.mm_vision_tower)
45
+
46
+
47
+ self.tokenizer = AutoTokenizerMIX.from_pretrained(args.model_name_or_path)
48
+ self.processor, _ = AutoProcessorMIX.from_pretrained(args.model_name_or_path, image_aspect_ratio=self.config.image_aspect_ratio,eval="eval")
49
+
50
+ self.first_predictor = self.create_predictor(args.first_model_path)
51
+ print(f"first_model_path: {args.first_model_path}, {self.first_predictor}")
52
+
53
+ self.second_predictor = self.create_predictor(args.second_model_path)
54
+ print(f"second_model_path: {args.second_model_path}, {self.second_predictor}")
55
+
56
+ self.image_newline = paddle.load(os.path.join(args.first_model_path, "image_newline.pdparams"))
57
+
58
+ def create_predictor(self, model_path):
59
+
60
+ from paddlenlp.utils.import_utils import import_module
61
+
62
+ import_module("paddlenlp_ops.encode_rotary_qk")
63
+ import_module("paddlenlp_ops.get_padding_offset")
64
+ import_module("paddlenlp_ops.qkv_transpose_split")
65
+ import_module("paddlenlp_ops.rebuild_padding")
66
+ import_module("paddlenlp_ops.transpose_remove_padding")
67
+ import_module("paddlenlp_ops.write_cache_kv")
68
+
69
+ model_file = model_path + ".pdmodel"
70
+ params_file = model_path + ".pdiparams"
71
+ if not os.path.exists(model_file):
72
+ raise ValueError("not find model file path {}".format(model_file))
73
+ if not os.path.exists(params_file):
74
+ raise ValueError("not find params file path {}".format(params_file))
75
+ config = paddle.inference.Config(model_file, params_file)
76
+
77
+ config.switch_ir_optim(True)
78
+
79
+ if self.args.device == "gpu":
80
+ config.enable_use_gpu(100, 0)
81
+
82
+ config.switch_use_feed_fetch_ops(False)
83
+ predictor = paddle.inference.create_predictor(config)
84
+ return predictor
85
+
86
+ @paddle.no_grad()
87
+ def encode_images(self, images, image_sizes):
88
+ if type(images) is list or images.ndim == 5:
89
+ if type(images) is list:
90
+ images = [(x.unsqueeze(axis=0) if x.ndim == 3 else x) for x in images]
91
+ concat_images = paddle.concat(x=[image for image in images], axis=0)
92
+
93
+ image_features = self.first_predictor.run(concat_images)[0]
94
+
95
+ split_sizes = [image.shape[0] for image in images]
96
+ image_features = paddle.split(image_features, split_sizes, axis=0)
97
+ mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
98
+ image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
99
+ if mm_patch_merge_type == "flat":
100
+ image_features = [x.flatten(start_axis=0, stop_axis=1) for x in image_features]
101
+ elif mm_patch_merge_type.startswith("spatial"):
102
+ new_image_features = []
103
+ for image_idx, image_feature in enumerate(image_features):
104
+ if image_feature.shape[0] > 1:
105
+ base_image_feature = image_feature[0]
106
+ image_feature = image_feature[1:]
107
+ height = width = self.clip_config.image_resolution // self.clip_config.vision_patch_size
108
+ assert height * width == base_image_feature.shape[0]
109
+ if image_aspect_ratio == "anyres":
110
+ num_patch_width, num_patch_height = get_anyres_image_grid_shape(
111
+ image_sizes[image_idx],
112
+ self.config.image_grid_pinpoints,
113
+ self.clip_config.image_resolution,
114
+ )
115
+
116
+ image_feature = paddle.reshape(
117
+ image_feature, (num_patch_height, num_patch_width, height, width, -1)
118
+ )
119
+ else:
120
+ raise NotImplementedError
121
+ if "unpad" in mm_patch_merge_type:
122
+ image_feature = image_feature.transpose(perm=[4, 0, 2, 1, 3])
123
+ image_feature = image_feature.flatten(start_axis=1, stop_axis=2).flatten(
124
+ start_axis=2, stop_axis=3
125
+ )
126
+ image_feature = unpad_image(image_feature, image_sizes[image_idx])
127
+ image_feature = paddle.concat(
128
+ x=(
129
+ image_feature,
130
+ self.image_newline[:, (None), (None)].expand(
131
+ shape=[*image_feature.shape[:-1], 1]
132
+ ).astype(image_feature.dtype),
133
+ ),
134
+ axis=-1,
135
+ )
136
+ x = image_feature.flatten(start_axis=1, stop_axis=2)
137
+ perm_12 = list(range(x.ndim))
138
+ perm_12[0] = 1
139
+ perm_12[1] = 0
140
+ image_feature = x.transpose(perm=perm_12)
141
+ else:
142
+ image_feature = image_feature.transpose(perm=[0, 2, 1, 3, 4])
143
+ image_feature = image_feature.flatten(start_axis=0, stop_axis=3)
144
+ image_feature = paddle.concat(x=(base_image_feature, image_feature), axis=0)
145
+ else:
146
+ image_feature = image_feature[0]
147
+ if "unpad" in mm_patch_merge_type:
148
+ image_feature = paddle.concat(
149
+ x=(image_feature, self.image_newline[None].to(image_feature.place)), axis=0
150
+ )
151
+ new_image_features.append(image_feature)
152
+ image_features = new_image_features
153
+ image_features = paddle.stack(x=image_features, axis=0)
154
+ else:
155
+ raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
156
+ else:
157
+ image_features = self.first_predictor.run(images)[0]
158
+
159
+ return image_features
160
+
161
+ @paddle.no_grad()
162
+ def generate_with_image_features(self, image_features, input_ids):
163
+ max_len = 2048
164
+ total_max_length = max_len + 1024
165
+ batch, seq, _ = image_features.shape
166
+ seq += input_ids.shape[1] - 1
167
+
168
+ _attention_mask = paddle.ones_like(x=input_ids, dtype="bool")
169
+ input_ids = [
170
+ cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, _attention_mask)
171
+ ]
172
+ cur_image_idx = 0
173
+ new_input_ids = []
174
+ img_pos = []
175
+ for batch_idx, cur_input_ids in enumerate(input_ids):
176
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
177
+
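+ # Locate the IMAGE_TOKEN_INDEX placeholders; together with the -1 / sequence-length sentinels they delimit the text segments between images.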
178
+ image_token_indices = (
179
+ [-1]
180
+ + paddle.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].squeeze(axis=1).tolist()
181
+ + [cur_input_ids.shape[0]]
182
+ )
183
+ cur_input_ids_noim = []
184
+
185
+ for i in range(len(image_token_indices) - 1):
186
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
187
+
188
+ split_sizes = [x.shape[0] for x in cur_input_ids_noim]
189
+
190
+ split_start = 0
191
+ cur_new_input_ids = []
192
+ cur_img_pos = []
193
+
194
+ for i in range(num_images + 1):
195
+ cur_new_input_ids.append(cur_input_ids_noim[i])
196
+
197
+ if i < num_images:
198
+ cur_image_features = image_features[cur_image_idx]
199
+ cur_image_idx += 1
200
+ cur_new_input_ids.append(paddle.full([cur_image_features.shape[0]], 1, dtype="int64"))
201
+ split_start += split_sizes[i - 1] if i > 0 else split_sizes[i]
202
+ cur_img_pos.append([split_start, split_start + cur_image_features.shape[0]])
203
+ split_start += cur_image_features.shape[0]
204
+
205
+ cur_new_input_ids = paddle.concat(x=cur_new_input_ids)
206
+ new_input_ids.append(cur_new_input_ids)
207
+ img_pos.append(cur_img_pos)
208
+
209
+ new_input_ids = paddle.to_tensor(new_input_ids)
210
+ img_pos = paddle.to_tensor(img_pos)
211
+
212
+ tgt_generation_mask = paddle.full([batch, 1, 1, total_max_length], 1)
213
+
214
+ attention_mask = paddle.zeros(
215
+ shape=(batch, 1, total_max_length, total_max_length),
216
+ dtype="int64",
217
+ )
218
+ length = seq
219
+ attention_mask[:, 0, :length, :length] = paddle.tril(paddle.ones(shape=(length, length), dtype="int64"))
220
+
221
+ position_ids = paddle.full([batch, total_max_length], 0, dtype="int64")
222
+ position_ids[:, :seq] = paddle.arange(0, seq)
223
+
224
+ inputs = [
225
+ new_input_ids, # input_ids
226
+ image_features, # image_features
227
+ img_pos,
228
+ attention_mask,
229
+ position_ids,
230
+ paddle.full([batch, 1], 1.0, dtype="float32"), # penalty_score
231
+ paddle.full([batch, 1], 0.0, dtype="float32"), # frequency_score,
232
+ paddle.full([batch, 1], 0.0, dtype="float32"), # presence_score,
233
+ paddle.full([batch, 1], 1, dtype="int64"), # min_length,
234
+ paddle.full([batch, 1], max_len, dtype="int64"), # max_length,
235
+ paddle.full([batch, 1], 0.7, dtype="float32"), # temperature,
236
+ paddle.full([batch, 1], 0.95, dtype="float32"), # top_p,
237
+ paddle.full([1], self.config.eos_token_id, dtype="int64"), # eos_token_id,
238
+ paddle.full([batch, 1], seq, dtype="int32"), # seq_len_encoder,
239
+ paddle.full([batch, 1], seq, dtype="int32"), # seq_len_decoder,
240
+ paddle.full([batch, 1], 0, dtype="int64"), # step_idx,
241
+ paddle.full([batch, 1], False, dtype="bool"), # stop_flags,
242
+ paddle.full([batch, 1], 29962, dtype="int64"),  # tgt_ids can be initialized arbitrarily
243
+ paddle.full([batch, 1], seq - 1, dtype="int64"), # tgt_pos,
244
+ tgt_generation_mask, # tgt_generation_mask,
245
+ paddle.full([batch, total_max_length], -1, dtype="int64"), # pre_ids, can be initialized arbitrarily
246
+ paddle.full([1], batch, dtype="int64"),  # stop_nums, should equal the batch size
247
+ ]
248
+
249
+ for i in range(self.config.num_hidden_layers):
250
+ tmp = paddle.zeros(
251
+ shape=[
252
+ 2,
253
+ batch,
254
+ self.config.num_attention_heads,
255
+ total_max_length,
256
+ self.config.hidden_size // self.config.num_attention_heads,
257
+ ],
258
+ dtype=self.compute_dtype,
259
+ )
260
+
261
+ inputs.append(tmp)
262
+
263
+ self.second_predictor.run(inputs)
264
+ tokens = load_real_time_tokens()
265
+ generate_ids = tokens.tolist()
266
+
267
+ return generate_ids, None
268
+
269
+ def pre_processing(self, inp, first_message):
270
+ model_name = self.args.model_name_or_path
271
+ if "llama-2" in model_name.lower():
272
+ conv_mode = "llava_llama_2"
273
+ elif "mistral" in model_name.lower():
274
+ conv_mode = "mistral_instruct"
275
+ elif "v1.6-34b" in model_name.lower():
276
+ conv_mode = "chatml_direct"
277
+ elif "v1" in model_name.lower():
278
+ conv_mode = "llava_v1"
279
+ elif "mpt" in model_name.lower():
280
+ conv_mode = "mpt"
281
+ else:
282
+ conv_mode = "llava_v0"
283
+ if self.args.conv_mode is not None and conv_mode != self.args.conv_mode:
284
+ print(
285
+ "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
286
+ conv_mode, self.args.conv_mode, self.args.conv_mode
287
+ )
288
+ )
289
+ else:
290
+ self.args.conv_mode = conv_mode
291
+ conv = conv_templates[self.args.conv_mode].copy()
292
+
293
+ if self.args.image_file is not None and first_message:
294
+ if self.config.mm_use_im_start_end:
295
+ inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + inp
296
+ else:
297
+ inp = DEFAULT_IMAGE_TOKEN + "\n" + inp
298
+ conv.append_message(conv.roles[0], inp)
299
+ first_message = False
300
+ else:
301
+ conv.append_message(conv.roles[0], inp)
302
+ conv.append_message(conv.roles[1], None)
303
+ prompt = conv.get_prompt()
304
+ record = {"image": self.args.image_file, "conversations": prompt}
305
+ image_size = load_image(args.image_file).size
306
+ data_dict = self.processor(record=record, image_aspect_ratio=self.config.image_aspect_ratio)
307
+ data_dict['image_size'] = [image_size]
308
+ return data_dict
309
+
310
+ def post_processing(self, generate_ids):
311
+ msg = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
312
+ return msg
313
+
314
+ def run_benchmark(self):
315
+ first_message = True
316
+ import time
317
+ start = 0.0
318
+ total = 0.0
319
+ for i in range(20):
320
+ if i >= 10:
321
+ start = time.time()
322
+ inp = "user: Generate the caption in English with grounding"
323
+ data_dict = self.pre_processing(inp, first_message)
324
+ image = paddle.cast(data_dict["images"], self.compute_dtype)
325
+
326
+ image_features = self.encode_images(image,data_dict['image_size'])
327
+
328
+ generate_ids, _ = self.generate_with_image_features(
329
+ image_features,
330
+ data_dict["input_ids"],
331
+ )
332
+
333
+ msg = self.post_processing(generate_ids)
334
+ if i >= 10:
335
+ total += time.time()-start
336
+
337
+ print("Time: ", total/10)
338
+
339
+ def predict(self):
340
+ roles = "user", "assistant"
341
+ first_message = True
342
+
343
+ if self.args.benchmark:
344
+ self.run_benchmark()
345
+ else:
346
+ while True:
347
+ try:
348
+ inp = input(f"{roles[0]}: ")
349
+ except EOFError:
350
+ inp = ""
351
+ if not inp:
352
+ print("exit...")
353
+ break
354
+ print(f"{roles[1]}: ", end="")
355
+ data_dict = self.pre_processing(inp, first_message)
356
+ image = paddle.cast(data_dict["images"], self.compute_dtype)
357
+
358
+ image_features = self.encode_images(image,data_dict['image_size'])
359
+
360
+ generate_ids, _ = self.generate_with_image_features(
361
+ image_features,
362
+ data_dict["input_ids"],
363
+ )
364
+
365
+ msg = self.post_processing(generate_ids)
366
+ print("Outputs: ", msg)
367
+
368
+
369
+ if __name__ == "__main__":
370
+ parser = argparse.ArgumentParser()
371
+ parser.add_argument(
372
+ "--first_model_path",
373
+ default="The dir name of image encoder model",
374
+ type=str,
375
+ help="",
376
+ )
377
+ parser.add_argument(
378
+ "--second_model_path",
379
+ default="The dir name of language model",
380
+ type=str,
381
+ help="",
382
+ )
383
+ parser.add_argument(
384
+ "--model_name_or_path",
385
+ type=str,
386
+ default="qwen-vl/qwen-vl-7b",
387
+ help="The path of extraction model path that you want to load.",
388
+ )
389
+ parser.add_argument(
390
+ "--device", default="gpu", choices=["gpu", "cpu", "xpu"], help="Device selected for inference."
391
+ )
392
+ parser.add_argument("--seed", default=0)
393
+ parser.add_argument("--fp16", action="store_true")
394
+ parser.add_argument("--image_file", type=str, required=True)
395
+ parser.add_argument("--conv_mode", type=str, default=None)
396
+ parser.add_argument("--benchmark", action="store_true")
397
+
398
+ args = parser.parse_args()
399
+
400
+ paddle.seed(args.seed)
401
+
402
+ predictor = Predictor(args)
403
+ predictor.predict()
PaddleMIX/deploy/llava/utils.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from __future__ import annotations
15
+
16
+ import glob
17
+ import math
18
+ import os
19
+ import struct
20
+
21
+ import numpy as np
22
+
23
+
24
+ def deserialize_from_file(fp):
25
+ x_type = fp.read(1)
26
+ x_type_out = struct.unpack("c", x_type)[0]
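+ # Type byte written by the token saver: b"0" -> float32 payload, b"1" -> int64, b"2" -> int32.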
27
+ # data
28
+ data_list = []
29
+ if x_type_out == b"0":
30
+ data = fp.read(4)
31
+ data_out = struct.unpack("f", data)[0]
32
+ while data:
33
+ data_out = struct.unpack("f", data)[0]
34
+ data_list.append(data_out)
35
+ data = fp.read(4)
36
+ elif x_type_out == b"1":
37
+ data = fp.read(8)
38
+ while data:
39
+ data_out = struct.unpack("l", data)[0]
40
+ data_list.append(data_out)
41
+ data = fp.read(8)
42
+ elif x_type_out == b"2":
43
+ data = fp.read(4)
44
+ while data:
45
+ data_out = struct.unpack("i", data)[0]
46
+ data_list.append(data_out)
47
+ data = fp.read(4)
48
+ else:
49
+ print("type error")
50
+ data_arr = np.array(data_list)
51
+ return data_arr
52
+
53
+
54
+ def load_real_time_tokens():
55
+ tokens = []
56
+ files = glob.glob(os.path.join("./real_time_save.*"))
57
+ for j in range(1, len(files) + 1):
58
+ filename = "./real_time_save.temp_ids_rank_0_step_{}".format(j)
59
+ if not os.path.exists(filename):
60
+ break
61
+ fp = open(filename, "rb+")
62
+ fp.read(1)
63
+ data_list = deserialize_from_file(fp)
64
+ fp.close()
65
+ tokens.append(np.array(data_list).reshape(-1, 1))
66
+ os.system("rm -f ./real_time_save.temp_ids_rank_*")
67
+ tokens = np.concatenate(tokens, axis=1)
68
+ return tokens
69
+
70
+
71
+ def get_alibi_slopes(num_heads):
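+ # ALiBi slopes: a geometric sequence based on the nearest power of two of num_heads, with extra interleaved slopes appended when num_heads is not a power of two.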
72
+ closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
73
+ base = 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3)))
74
+ powers = np.arange(1, 1 + closest_power_of_2)
75
+ slopes = np.power(base, powers)
76
+
77
+ if closest_power_of_2 != num_heads:
78
+ extra_base = 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3)))
79
+ num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
80
+ extra_powers = np.arange(1, 1 + 2 * num_remaining_heads, 2)
81
+ slopes = np.concatenate([slopes, np.power(extra_base, extra_powers)], axis=0)
82
+
83
+ return slopes.astype("float32")
PaddleMIX/deploy/qwen2_vl/README.md ADDED
@@ -0,0 +1,50 @@
1
+ # Qwen2-VL
2
+
3
+ ## 1. 模型介绍
4
+
5
+ [Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/) 是 Qwen 团队推出的一个专注于视觉与语言(Vision-Language, VL)任务的多模态大模型。它旨在通过结合图像和文本信息,提供强大的跨模态理解能力,可以处理涉及图像描述、视觉问答(VQA)、图文检索等多种任务。Qwen2-VL通过引入创新性的技术如 Naive Dynamic Resolution 和 M-RoPE,以及深入探讨大型多模态模型的潜力,显著地提高了多模态内容的视觉理解能力。
6
+
7
+ ## 2 环境准备
8
+
9
+ - **python >= 3.10**
10
+ - **paddlepaddle-gpu 要求是develop版本**
11
+ ```bash
12
+ # 安装示例
13
+ python -m pip install paddlepaddle-gpu==0.0.0.post118 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html
14
+ ```
15
+
16
+ - **paddlenlp 需要特定版本**
17
+
18
+ 在PaddleMIX/代码目录下执行以下命令安装特定版本的paddlenlp:
19
+ ```bash
20
+ # 安装示例
21
+ git submodule update --init --recursive
22
+ cd PaddleNLP
23
+ git reset --hard e91c2d3d634b12769c30aa419ddf931c20b7ca9f
24
+ pip install -e .
25
+ cd csrc
26
+ python setup_cuda.py install
27
+ ```
28
+
29
+ > 注:
30
+ * 请确保安装了以上依赖,否则无法运行。同时,需要安装 paddlemix/external_ops 下的自定义OP,`python setup.py install`(示例命令见下)。如果安装后仍然找不到算子,需要额外设置PYTHONPATH。
31
+ * (默认开启flash_attn)使用flash_attn 要求A100/A800显卡或者H20显卡
32
+
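+ 自定义OP安装命令示例(路径为示意,请根据实际环境调整):
+ 
+ ```bash
+ cd PaddleMIX/paddlemix/external_ops
+ python setup.py install
+ # 如安装后仍找不到算子,可将相应目录加入 PYTHONPATH,例如:
+ export PYTHONPATH=/path/to/PaddleMIX:$PYTHONPATH
+ ```
+ 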
33
+ ## 3 高性能推理
34
+
35
+ 在Qwen2-VL的高性能推理优化中,**视觉模型部分继续使用PaddleMIX中的模型组网;但是语言模型部分调用PaddleNLP中高性能的Qwen2语言模型**,以得到高性能的Qwen2-VL推理版本。
36
+
37
+ ### 3.1. 文本&单张图像输入高性能推理
38
+ ```bash
39
+ python deploy/qwen2_vl/single_image_infer.py \
40
+ --model_name_or_path Qwen/Qwen2-VL-2B-Instruct \
41
+ --dtype bfloat16 \
42
+ --benchmark True \
43
+ ```
44
+
45
+ - 在 NVIDIA A100-SXM4-80GB 上测试的单图端到端速度性能如下:
46
+
47
+ | model | Paddle Inference| PyTorch | Paddle 动态图 |
48
+ | ---------------------- | --------------- | ------------ | ------------ |
49
+ | Qwen2-VL-2B-Instruct | 1.44 s | 2.35 s | 5.215 s |
50
+ | Qwen2-VL-7B-Instruct | 1.73 s | 4.4 s | 6.339 s |
PaddleMIX/deploy/qwen2_vl/single_image_infer.py ADDED
@@ -0,0 +1,276 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import datetime
16
+ from dataclasses import dataclass, field
17
+
18
+ import numpy as np
19
+ import paddle
20
+ from paddlenlp.generation import GenerationConfig
21
+ from paddlenlp.trainer import PdArgumentParser
22
+ from paddlenlp.transformers import AutoConfig, AutoInferenceModelForCausalLM
23
+ from paddlenlp.trl import llm_utils
24
+
25
+ from paddlemix.models.qwen2_vl import MIXQwen2Tokenizer
26
+ from paddlemix.models.qwen2_vl.modeling_qwen2_vl import (
27
+ Qwen2RotaryEmbedding,
28
+ Qwen2VLForConditionalGeneration,
29
+ )
30
+ from paddlemix.processors.qwen2_vl_processing import (
31
+ Qwen2VLImageProcessor,
32
+ Qwen2VLProcessor,
33
+ process_vision_info,
34
+ )
35
+
36
+ MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
37
+ vl_model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_NAME, dtype="bfloat16")
38
+
39
+ # NOTE: (zhoukangkang、changwenbin) Because we only use the visual model here,
40
+ # in order to reduce video memory, we delete the language model.
41
+ del vl_model.model
42
+ paddle.device.cuda.empty_cache()
43
+
44
+ image_processor = Qwen2VLImageProcessor()
45
+ tokenizer = MIXQwen2Tokenizer.from_pretrained(MODEL_NAME)
46
+ processor = Qwen2VLProcessor(image_processor, tokenizer)
47
+
48
+ # min_pixels = 256*28*28 # 200704
49
+ # max_pixels = 1280*28*28 # 1003520
50
+ # processor = Qwen2VLProcessor(image_processor, tokenizer, min_pixels=min_pixels, max_pixels=max_pixels)
51
+
52
+ messages = [
53
+ {
54
+ "role": "user",
55
+ "content": [
56
+ {
57
+ "type": "image",
58
+ "image": "paddlemix/demo_images/examples_image1.jpg",
59
+ },
60
+ {"type": "text", "text": "Describe this image."},
61
+ ],
62
+ }
63
+ ]
64
+
65
+ # Preparation for inference
66
+ image_inputs, video_inputs = process_vision_info(messages)
67
+
68
+ question = "Describe this image."
69
+ image_pad_token = "<|vision_start|><|image_pad|><|vision_end|>"
70
+ text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{image_pad_token}{question}<|im_end|>\n<|im_start|>assistant\n"
71
+
72
+
73
+ @dataclass
74
+ class PredictorArgument:
75
+ # NOTE: (zhoukangkang、changwenbin)
76
+ # These parameters are all copied from https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/predict/predictor.py
77
+ # For simplicity and ease of use, only the necessary parameters are retained here.
78
+ # If you want to know the exact meaning of these parameters, please refer to the link above.
79
+
80
+ model_name_or_path: str = field(default=None, metadata={"help": "The directory of model."})
81
+ src_length = 1024
82
+ min_length = 2
83
+ max_length = 200
84
+ top_k = 0
85
+ top_p = 0.0
86
+ temperature = 0.95
87
+ repetition_penalty = 1.0
88
+ dtype: str = field(default=None, metadata={"help": "Model dtype"})
89
+ decode_strategy = "sampling"
90
+ mode = "dynamic"
91
+ inference_model = True
92
+ quant_type = ""
93
+ benchmark: bool = field(
94
+ default=False,
95
+ metadata={
96
+ "help": "If benchmark set as `True`, we will force model decode to max_length, which is helpful to compute throughput. "
97
+ },
98
+ )
99
+ use_fake_parameter = False
100
+ block_attn = True
101
+ block_size = 64
102
+ cachekv_int8_type = None
103
+ append_attn = True
104
+ total_max_length = 4096
105
+ speculate_method = None
106
+
107
+
108
+ @dataclass
109
+ class ModelArgument:
110
+ model_type: str = field(
111
+ default=None,
112
+ metadata={"help": "the type of the model, which can be one of ['gpt-3', 'ernie-3.5-se', 'llama-img2txt']"},
113
+ )
114
+
115
+
116
+ def init_llm_model_inputs(vision_model_inputs, inputs_embeds, arg_config: PredictorArgument):
117
+ assert len(inputs_embeds.shape) == 3
118
+ batch_size = inputs_embeds.shape[0]
119
+
120
+ model_inputs = {}
121
+ model_inputs["input_ids"] = paddle.zeros(shape=[batch_size, arg_config.total_max_length], dtype="int64")
122
+ model_inputs["inputs_embeds"] = inputs_embeds
123
+
124
+ # NOTE: total_max_length is required to be a multiple of block_size, so the block table below never needs a partial block.
125
+ assert arg_config.total_max_length % arg_config.block_size == 0
126
+
127
+ model_inputs["top_p"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.top_p, dtype="float32")
128
+ model_inputs["temperature"] = paddle.full(
129
+ shape=[batch_size, 1], fill_value=arg_config.temperature, dtype="float32"
130
+ )
131
+ model_inputs["eos_token_id"] = paddle.to_tensor(
132
+ np.array(llm_utils.get_eos_token_id(tokenizer, generation_config)).reshape(-1, 1).astype("int64")
133
+ )
134
+ model_inputs["penalty_score"] = paddle.full(
135
+ shape=[batch_size, 1], fill_value=arg_config.repetition_penalty, dtype="float32"
136
+ )
137
+ model_inputs["frequency_score"] = paddle.full(shape=[batch_size, 1], fill_value=0.0, dtype="float32")
138
+ model_inputs["presence_score"] = paddle.full(shape=[batch_size, 1], fill_value=0.0, dtype="float32")
139
+ model_inputs["min_length"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.min_length, dtype="int64")
140
+ model_inputs["max_length"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.max_length, dtype="int64")
141
+
142
+ position_ids, _ = vl_model.get_rope_index(
143
+ config.vision_config["spatial_merge_size"],
144
+ config.image_token_id,
145
+ config.video_token_id,
146
+ config.vision_start_token_id,
147
+ vision_model_inputs.get("input_ids"),
148
+ vision_model_inputs.get("image_grid_thw"),
149
+ vision_model_inputs.get("video_grid_thw", None),
150
+ vision_model_inputs.get("attention_mask"),
151
+ )
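+     # Extend the M-ROPE position ids beyond the prompt with a plain increasing range, so every decode step up to total_max_length (4096) has a valid position.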
152
+ position_start = position_ids[0][0][-1].item()
153
+ position_end = 4096 - position_ids.shape[-1] + position_start
154
+ position_value = (
155
+ paddle.arange(position_start, position_end).reshape([1, 1, -1]).expand([position_ids.shape[0], 1, -1])
156
+ )
157
+ position_ids = paddle.concat([position_ids, position_value], axis=-1)
158
+
159
+ head_dim = config.hidden_size // config.num_attention_heads
160
+ qwen2_Embedding = Qwen2RotaryEmbedding(head_dim, 4096, config.rope_theta)
161
+ cos = qwen2_Embedding.cos_cached
162
+ sin = qwen2_Embedding.sin_cached
163
+
164
+ # NOTE: (zhoukangkang、changwenbin) Copied from PaddleMIX/paddlemix/models/qwen2_vl/modeling_qwen2_vl.py,
165
+ # for calculating M-ROPE.
166
+ cos = cos[position_ids]
167
+ sin = sin[position_ids]
168
+ mrope_section = config.rope_scaling["mrope_section"] * 2
169
+ cos = paddle.concat(x=[m[i % 3] for i, m in enumerate(cos.split(mrope_section, axis=-1))], axis=-1)
170
+ sin = paddle.concat(x=[m[i % 3] for i, m in enumerate(sin.split(mrope_section, axis=-1))], axis=-1)
171
+
172
+ rope_emb = paddle.stack([cos, sin], axis=0)
173
+ rope_emb = rope_emb.reshape([rope_emb.shape[0], 1, rope_emb.shape[2], 1, rope_emb.shape[-1]])
174
+ model_inputs["rope_emb"] = rope_emb
175
+
176
+ model_inputs["bad_tokens"] = paddle.to_tensor([-1], dtype="int64")
177
+ model_inputs["is_block_step"] = paddle.full(shape=[batch_size], fill_value=False, dtype="bool")
178
+
179
+ cache_kvs_shape = fast_llm_model.get_cache_kvs_shape(fast_llm_model.config, batch_size)
180
+ cachekv_dtype = config.dtype if arg_config.cachekv_int8_type is None else "uint8"
181
+ model_inputs["cache_kvs"] = [paddle.zeros(shape, dtype=cachekv_dtype) for shape in cache_kvs_shape]
182
+
183
+ block_nums = arg_config.total_max_length // arg_config.block_size
184
+ model_inputs["block_tables"] = paddle.arange(block_nums, dtype="int32").tile([batch_size, 1])
185
+
186
+ seq_lens = inputs_embeds.shape[1]
187
+ model_inputs["seq_lens_this_time"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1))
188
+ model_inputs["seq_lens_encoder"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1))
189
+ model_inputs["seq_lens_decoder"] = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int32")
190
+ model_inputs["step_idx"] = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int64")
191
+ model_inputs["not_need_stop"] = paddle.full(shape=[1], fill_value=True, dtype="bool")
192
+ model_inputs["stop_flags"] = paddle.full(shape=[batch_size, 1], fill_value=False, dtype="bool")
193
+ model_inputs["stop_nums"] = paddle.full(shape=[1], fill_value=batch_size, dtype="int64")
194
+ model_inputs["pre_ids"] = paddle.full(shape=[batch_size, arg_config.max_length], fill_value=-1, dtype="int64")
195
+ model_inputs["next_tokens"] = paddle.full(shape=[batch_size, 1], fill_value=-1, dtype="int64")
196
+
197
+ return model_inputs
198
+
199
+
200
+ parser = PdArgumentParser((PredictorArgument, ModelArgument))
201
+ predictor_args, model_args = parser.parse_args_into_dataclasses()
202
+
203
+ paddle.set_default_dtype(predictor_args.dtype)
204
+ config = AutoConfig.from_pretrained(predictor_args.model_name_or_path)
205
+
206
+ # NOTE: (changwenbin) This is for using the inference optimization of paddlenlp qwen2.
207
+ config.model_type = "qwen2"
208
+ generation_config = GenerationConfig.from_pretrained(predictor_args.model_name_or_path)
209
+ fast_llm_model = AutoInferenceModelForCausalLM.from_pretrained(
210
+ predictor_args.model_name_or_path,
211
+ config=config,
212
+ predictor_args=predictor_args,
213
+ model_args=model_args,
214
+ dtype=predictor_args.dtype,
215
+ tensor_parallel_degree=1,
216
+ tensor_parallel_rank=0,
217
+ )
218
+ fast_llm_model.eval()
219
+
220
+ vl_model.model = fast_llm_model
221
+
222
+
223
+ def run_model():
224
+
225
+ vision_model_inputs = processor(
226
+ text=[text],
227
+ images=image_inputs,
228
+ videos=video_inputs,
229
+ padding=True,
230
+ return_tensors="pd",
231
+ )
232
+ inputs_embeds = vl_model.vision_forward(**vision_model_inputs)
233
+ llm_model_inputs = init_llm_model_inputs(vision_model_inputs, inputs_embeds, arg_config=predictor_args)
234
+ generated_text = ""
235
+ while llm_model_inputs["not_need_stop"]:
236
+ generated_ids = fast_llm_model.generate(**llm_model_inputs) # already trimmed in paddle
237
+ llm_model_inputs["input_ids"] = generated_ids
238
+ llm_model_inputs["inputs_embeds"] = None
239
+ new_text_piece = processor.batch_decode(
240
+ generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
241
+ )[0]
242
+ if new_text_piece == "<|im_end|>":
243
+ break
244
+ generated_text += new_text_piece
245
+ return generated_text
246
+
247
+
248
+ if predictor_args.benchmark:
249
+ print(f"Benchmarking {predictor_args.model_name_or_path} ...")
250
+ warm_up = 3
251
+ repeat_times = 10
252
+ sumtime = 0.0
253
+ times = repeat_times + warm_up
254
+ for i in range(times):
255
+ if i > 2:
256
+ paddle.device.synchronize()
257
+ starttime = datetime.datetime.now()
258
+ generated_text = run_model()
259
+ if i > 2:
260
+ paddle.device.synchronize()
261
+ endtime = datetime.datetime.now()
262
+ print("Final output_text:\n", generated_text)
263
+
264
+ if i > 2:
265
+ duringtime = endtime - starttime
266
+ duringtime = duringtime.seconds * 1000 + duringtime.microseconds / 1000.0
267
+ sumtime += duringtime
268
+ print(f"Single {predictor_args.model_name_or_path} end to end time : ", duringtime, "ms")
269
+ inference_global_mem = paddle.device.cuda.memory_reserved() / (1024**3)
270
+ print(f"Inference used CUDA memory : {inference_global_mem:.3f} GiB")
271
+
272
+ print(f"Single {predictor_args.model_name_or_path} ave end to end time : ", sumtime / repeat_times, "ms")
273
+
274
+ else:
275
+ generated_text = run_model()
276
+ print("Final output_text:\n", generated_text)
PaddleMIX/deploy/qwen_vl/run_static_predict.py ADDED
@@ -0,0 +1,203 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+
18
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
19
+ os.environ["FLAGS_use_cuda_managed_memory"] = "true"
20
+
21
+ import paddle
22
+ from paddlenlp.transformers.configuration_utils import PretrainedConfig
23
+ from utils import load_real_time_tokens
24
+
25
+ from paddlemix import QwenVLProcessor, QWenVLTokenizer
26
+
27
+
28
+ class Predictor(object):
29
+ def __init__(self, args):
30
+ self.args = args
31
+ self.config = PretrainedConfig.from_pretrained(args.model_name_or_path)
32
+ self.tokenizer = QWenVLTokenizer.from_pretrained(args.model_name_or_path)
33
+ self.processor = QwenVLProcessor(tokenizer=self.tokenizer)
34
+ self.first_predictor = self.create_predictor(args.first_model_path)
35
+ print(f"first_model_path: {args.first_model_path}, {self.first_predictor}")
36
+ self.second_predictor = self.create_predictor(args.second_model_path)
37
+ print(f"second_model_path: {args.second_model_path}, {self.second_predictor}")
38
+
39
+ def create_predictor(self, model_path):
40
+
41
+ from paddlenlp.utils.import_utils import import_module
42
+
43
+ import_module("paddlenlp_ops.encode_rotary_qk")
44
+ import_module("paddlenlp_ops.get_padding_offset")
45
+ import_module("paddlenlp_ops.qkv_transpose_split")
46
+ import_module("paddlenlp_ops.rebuild_padding")
47
+ import_module("paddlenlp_ops.transpose_remove_padding")
48
+ import_module("paddlenlp_ops.write_cache_kv")
49
+
50
+ model_file = model_path + ".pdmodel"
51
+ params_file = model_path + ".pdiparams"
52
+ if not os.path.exists(model_file):
53
+ raise ValueError("not find model file path {}".format(model_file))
54
+ if not os.path.exists(params_file):
55
+ raise ValueError("not find params file path {}".format(params_file))
56
+ config = paddle.inference.Config(model_file, params_file)
57
+
58
+ config.switch_ir_optim(True)
59
+
60
+ if self.args.device == "gpu":
61
+ config.enable_use_gpu(100, 0)
62
+
63
+ config.switch_use_feed_fetch_ops(False)
64
+ predictor = paddle.inference.create_predictor(config)
65
+ return predictor
66
+
67
+ @paddle.no_grad()
68
+ def encode_images(self, pixel_values):
69
+ [language_model_inputs] = self.first_predictor.run([pixel_values])
70
+ return language_model_inputs
71
+
72
+ @paddle.no_grad()
73
+ def generate_with_image_features(self, image_features, input_ids):
74
+ batch = image_features.shape[0]
+ seq = input_ids.shape[1]
76
+ max_len = 1024
77
+ dtype = "float16"
78
+ tgt_generation_mask = paddle.full([batch, 1, 1, max_len], 1, dtype=dtype)
79
+
80
+ img_pos = None
81
+ if paddle.any(input_ids == self.config.visual["image_start_id"]):
82
+ bos_pos = paddle.where(input_ids == self.config.visual["image_start_id"])
83
+ eos_pos = paddle.where(input_ids == self.config.visual["image_start_id"] + 1)
84
+ assert (bos_pos[0] == eos_pos[0]).astype("bool").all()
85
+ img_pos = paddle.concat((bos_pos[0], bos_pos[1], eos_pos[1]), axis=1)
86
+
87
+ attention_mask = paddle.full([batch, 1, max_len, max_len], 0, dtype=dtype)
88
+ attention_mask[:, 0, :seq, :seq] = paddle.tril(paddle.ones(shape=(seq, seq), dtype=dtype))
89
+ position_ids = paddle.full([batch, seq], 0, dtype="int64")
90
+ for i in range(batch):
91
+ position_ids[i, :] = paddle.to_tensor([i for i in range(seq)], dtype="int64")
92
+
93
+ inputs = [
94
+ input_ids, # input_ids
95
+ image_features, # image_features
96
+ img_pos, # img_pos
97
+ attention_mask, # attention_mask
98
+ position_ids, # position_ids
99
+ paddle.full([batch, 1], 1.0, dtype="float32"), # penalty_score
100
+ paddle.full([batch, 1], 0.0, dtype="float32"), # frequency_score,
101
+ paddle.full([batch, 1], 0.0, dtype="float32"), # presence_score,
102
+ paddle.full([batch, 1], 1, dtype="int64"), # min_length,
103
+ paddle.full([batch, 1], max_len - seq, dtype="int64"), # max_length,
104
+ paddle.full([batch, 1], 1.0, dtype="float32"), # temperature,
105
+ paddle.full([batch, 1], 0.0, dtype="float32"), # top_p,
106
+ paddle.full([1], 151643, dtype="int64"), # eos_token_id,
107
+ paddle.full([batch, 1], seq, dtype="int32"), # seq_len_encoder,
108
+ paddle.full([batch, 1], seq, dtype="int32"), # seq_len_decoder,
109
+ paddle.full([batch, 1], 0, dtype="int64"), # step_idx,
110
+ paddle.full([batch, 1], False, dtype="bool"), # stop_flags,
111
+ paddle.full([batch, 1], -123, dtype="int64"), # tgt_ids can be be initialized arbitrarily
112
+ paddle.full([batch, 1], seq - 1, dtype="int64"), # tgt_pos,
113
+ tgt_generation_mask, # tgt_generation_mask,
114
+ paddle.full([batch, max_len], -100, dtype="int64"), # pre_ids, can be initialized arbitrarily
115
+ paddle.full([1], batch, dtype="int64"), # stop_nums, be batch
116
+ ]
117
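+ # Pre-allocate the KV cache: one [2, batch, num_heads, max_len, head_dim] buffer
+ # per transformer layer (32 layers, 32 heads, head_dim 128 for Qwen-VL-7B).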
+ for i in range(32):
118
+ tmp = paddle.rand(shape=[2, batch, 32, max_len, 128], dtype=dtype)
119
+ inputs.append(tmp)
120
+
121
+ self.second_predictor.run(inputs)
122
+ tokens = load_real_time_tokens()
123
+ generate_ids = tokens.tolist()
124
+ return generate_ids, None
125
+
126
+ def pre_processing(self, url, prompt):
127
+ # input query
128
+ query = []
129
+ query.append({"image": url})
130
+ query.append({"text": prompt})
131
+ inputs = self.processor(query=query, return_tensors="pd")
132
+ return inputs
133
+
134
+ def post_processing(self, generate_ids):
135
+ msg = self.processor.batch_decode(generate_ids)
136
+ return msg
137
+
138
+ def predict(self, url, prompt):
139
+ inputs = self.pre_processing(url, prompt)
140
+ images = inputs["images"]
141
+ second_input_ids = inputs["input_ids"]
142
+
143
+ image_features = self.encode_images(images)
144
+ generate_ids, _ = self.generate_with_image_features(
145
+ image_features,
146
+ second_input_ids,
147
+ )
148
+
149
+ msg = self.post_processing(generate_ids)
150
+
151
+ return msg
152
+
153
+
154
+ if __name__ == "__main__":
155
+ parser = argparse.ArgumentParser()
156
+ parser.add_argument(
+ "--first_model_path",
+ type=str,
+ required=True,
+ help="Path prefix of the exported image encoder (vision) model, without the .pdmodel/.pdiparams suffix.",
+ )
+ parser.add_argument(
+ "--second_model_path",
+ type=str,
+ required=True,
+ help="Path prefix of the exported language model, without the .pdmodel/.pdiparams suffix.",
+ )
168
+ parser.add_argument(
169
+ "--model_name_or_path",
170
+ type=str,
171
+ default="qwen-vl/qwen-vl-7b",
172
+ help="The path of extraction model path that you want to load.",
173
+ )
174
+ parser.add_argument(
175
+ "--device", default="gpu", choices=["gpu", "cpu", "xpu"], help="Device selected for inference."
176
+ )
177
+ parser.add_argument("--seed", default=1234)
178
+ parser.add_argument("--benchmark", action="store_true")
179
+ args = parser.parse_args()
180
+
181
+ paddle.seed(args.seed)
182
+
183
+ predictor = Predictor(args)
184
+
185
+ url = "https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg"
186
+ prompt = "Generate the caption in English with grounding:"
187
+
188
+ if not args.benchmark:
189
+ msg = predictor.predict(url, prompt)
190
+ print("Outputs: ", msg)
191
+ else:
192
+ import time
+
+ # the first 10 runs are warm-up; the next 10 are timed and averaged
+ total = 0.0
+ for i in range(20):
+ if i >= 10:
+ start = time.time()
+ msg = predictor.predict(url, prompt)
+ if i >= 10:
+ total += time.time() - start
+
+ print("Average end-to-end time (s):", total / 10)
PaddleMIX/deploy/sam/README.md ADDED
@@ -0,0 +1,37 @@
+ # Segment Anything
+
+ ## 1. Model Introduction
+
+ [Segment Anything](https://ai.facebook.com/research/publications/segment-anything/) is an image segmentation model from Meta AI Research, FAIR.
+ Given input prompts such as points or boxes, it produces high-quality masks and can segment every object in an image. It was trained on a dataset of 11 million images and 1.1 billion masks, and shows strong zero-shot performance on a wide range of segmentation tasks.
+ This directory provides the Paddle deployment implementation of the model.
+
+ ## 2. Quick Start
+
+ ### 2.1 Static Graph Export and Inference
+ ```bash
+ # Export the static graph whose prompt input type is bbox
+ python export.py --model_type Sam/SamVitH-1024 --input_type boxs --save_dir sam_export
+
+ # Export the static graph whose prompt input type is points
+ python export.py --model_type Sam/SamVitH-1024 --input_type points --save_dir sam_export
+
+ # Inference with a bbox prompt
+ python predict.py \
+ --input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+ --box_prompt 112 118 513 382 \
+ --input_type boxs \
+ --model_name_or_path Sam/SamVitH-1024 \
+ --cfg Sam/SamVitH-1024_boxs/deploy.yaml
+
+ # Inference with a points prompt
+ python predict.py \
+ --input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+ --points_prompt 548 372 \
+ --input_type points \
+ --model_name_or_path Sam/SamVitH-1024 \
+ --cfg Sam/SamVitH-1024_points/deploy.yaml
+ ```
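+
+ For programmatic use, the snippet below is a minimal sketch (our illustration, not an official deploy script) of running the exported `boxs` model directly with the Paddle Inference API; the local image path is a placeholder, and pre/post-processing follow `deploy/sam/predict.py`.
+
+ ```python
+ import numpy as np
+ from paddle.inference import Config, create_predictor
+ from PIL import Image
+
+ from paddlemix.processors.sam_processing import SamProcessor
+
+ # assumed export location produced by export.py above
+ config = Config("Sam/SamVitH-1024_boxs/model.pdmodel", "Sam/SamVitH-1024_boxs/model.pdiparams")
+ config.enable_use_gpu(100, 0)
+ predictor = create_predictor(config)
+
+ processor = SamProcessor.from_pretrained("Sam/SamVitH-1024")
+ image = Image.open("your_image.jpg").convert("RGB")  # placeholder image path
+ img, prompt = processor(image, input_type="boxs", box=np.array([112, 118, 513, 382]), point_coords=None)
+
+ names = predictor.get_input_names()
+ img_handle = predictor.get_input_handle(names[0])
+ prompt_handle = predictor.get_input_handle(names[1])
+ img_handle.reshape(img.shape)
+ img_handle.copy_from_cpu(img.numpy())
+ prompt = prompt.reshape([-1, 4])  # bbox prompts are flattened to [N, 4], as in predict.py
+ prompt_handle.reshape(prompt.shape)
+ prompt_handle.copy_from_cpu(prompt.numpy())
+ predictor.run()
+
+ masks = predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()
+ masks = processor.postprocess_masks(masks)  # same post-processing as predict.py
+ ```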
PaddleMIX/deploy/sam/export.py ADDED
@@ -0,0 +1,106 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+
18
+ import paddle
19
+ import yaml
20
+
21
+ from paddlemix.models.sam.modeling import SamModel
22
+ from paddlemix.utils.log import logger
23
+
24
+
25
+ def parse_args():
26
+ parser = argparse.ArgumentParser(description="Export Inference Model.")
27
+ parser.add_argument(
28
+ "--model_type",
29
+ choices=["Sam/SamVitH-1024", "Sam/SamVitB", "Sam/SamVitL"],
30
+ required=True,
31
+ help="The model type.",
32
+ type=str,
33
+ )
34
+ parser.add_argument(
35
+ "--input_type",
36
+ choices=["boxs", "points", "points_grid"],
37
+ required=True,
38
+ help="The model type.",
39
+ type=str,
40
+ )
41
+ parser.add_argument(
42
+ "--save_dir",
43
+ help="The directory for saving the exported inference model",
44
+ type=str,
45
+ default="./output/inference_model",
46
+ )
47
+ parser.add_argument(
48
+ "--input_img_shape",
49
+ nargs="+",
50
+ help="Export the model with fixed input shape, e.g., `--input_img_shape 1 3 512 1024`.",
51
+ type=int,
52
+ default=[1, 3, 1024, 1024],
53
+ )
54
+
55
+ return parser.parse_args()
56
+
57
+
58
+ def main(args):
59
+
60
+ os.environ["PADDLESEG_EXPORT_STAGE"] = "True"
61
+
62
+ model = SamModel.from_pretrained(args.model_type, input_type=args.input_type)
63
+
64
+ shape = [None, 3, None, None] if args.input_img_shape is None else args.input_img_shape
65
+ if args.input_type == "points":
66
+ shape2 = [1, 1, 2]
67
+ elif args.input_type == "boxs":
68
+ shape2 = [None, 4]
69
+ elif args.input_type == "points_grid":
70
+ shape2 = [64, 1, 2]
71
+
72
+ input_spec = [
73
+ paddle.static.InputSpec(shape=shape, dtype="float32"),
74
+ paddle.static.InputSpec(shape=shape2, dtype="int32"),
75
+ ]
76
+ model.eval()
77
+ model = paddle.jit.to_static(model, input_spec=input_spec)
78
+ save_path = f"{args.model_type}_{args.input_type}"
79
+ paddle.jit.save(model, os.path.join(save_path, "model"))
80
+
81
+ # TODO add test config
82
+ deploy_info = {
83
+ "Deploy": {
84
+ "model": "model.pdmodel",
85
+ "params": "model.pdiparams",
86
+ "input_img_shape": shape,
87
+ "input_prompt_shape": shape2,
88
+ "input_prompt_type": args.input_type,
89
+ "model_type": args.model_type,
90
+ "output_dtype": "float32",
91
+ }
92
+ }
93
+ msg = "\n---------------Deploy Information---------------\n"
94
+ msg += str(yaml.dump(deploy_info))
95
+ logger.info(msg)
96
+
97
+ yml_file = os.path.join(save_path, "deploy.yaml")
98
+ with open(yml_file, "w") as file:
99
+ yaml.dump(deploy_info, file)
100
+
101
+ logger.info(f"The inference model is saved in {save_path}")
102
+
103
+
104
+ if __name__ == "__main__":
105
+ args = parse_args()
106
+ main(args)
PaddleMIX/deploy/sam/predict.py ADDED
@@ -0,0 +1,374 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import codecs
16
+ import os
17
+ from dataclasses import dataclass, field
18
+ from typing import List
19
+
20
+ import matplotlib.pyplot as plt
21
+ import numpy as np
22
+ import requests
23
+ import yaml
24
+ from paddle.inference import Config as PredictConfig
25
+ from paddle.inference import create_predictor
26
+ from paddlenlp.trainer import PdArgumentParser
27
+ from PIL import Image
28
+
29
+ from paddlemix.processors.sam_processing import SamProcessor
30
+ from paddlemix.utils.log import logger
31
+
32
+
33
+ def show_mask(mask, ax, random_color=False):
34
+ if random_color:
35
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
36
+ else:
37
+ color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
38
+ h, w = mask.shape[-2:]
39
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
40
+ ax.imshow(mask_image)
41
+
42
+
43
+ class DeployConfig:
44
+ def __init__(self, path):
45
+ with codecs.open(path, "r", "utf-8") as file:
46
+ self.dic = yaml.load(file, Loader=yaml.FullLoader)
47
+
48
+ self._dir = os.path.dirname(path)
49
+
50
+ @property
51
+ def model(self):
52
+ return os.path.join(self._dir, self.dic["Deploy"]["model"])
53
+
54
+ @property
55
+ def params(self):
56
+ return os.path.join(self._dir, self.dic["Deploy"]["params"])
57
+
58
+
59
+ def use_auto_tune(args):
60
+ return (
61
+ hasattr(PredictConfig, "collect_shape_range_info")
62
+ and hasattr(PredictConfig, "enable_tuned_tensorrt_dynamic_shape")
63
+ and args.device == "gpu"
64
+ and args.use_trt
65
+ and args.enable_auto_tune
66
+ )
67
+
68
+
69
+ def auto_tune(args, imgs, img_nums):
70
+ """
71
+ Use images to auto tune the dynamic shape for trt sub graph.
72
+ The tuned shape saved in args.auto_tuned_shape_file.
73
+
74
+ Args:
75
+ args(dict): input args.
76
+ imgs(str, list[str], numpy): the path for images or the origin images.
77
+ img_nums(int): the nums of images used for auto tune.
78
+ Returns:
79
+ None
80
+ """
81
+ logger.info("Auto tune the dynamic shape for GPU TRT.")
82
+
83
+ assert use_auto_tune(args), (
84
+ "Do not support auto_tune, which requires " "device==gpu && use_trt==True && paddle >= 2.2"
85
+ )
86
+
87
+ if not isinstance(imgs, (list, tuple)):
88
+ imgs = [imgs]
89
+ num = min(len(imgs), img_nums)
90
+
91
+ cfg = DeployConfig(args.cfg)
92
+ pred_cfg = PredictConfig(cfg.model, cfg.params)
93
+ pass_builder = pred_cfg.pass_builder()
94
+ pass_builder.delete_pass("identity_op_clean_pass")
95
+ pred_cfg.enable_use_gpu(100, 0)
96
+ if not args.print_detail:
97
+ pred_cfg.disable_glog_info()
98
+ pred_cfg.collect_shape_range_info(args.auto_tuned_shape_file)
99
+
100
+ # todo
101
+ predictor = create_predictor(pred_cfg)
102
+ input_names = predictor.get_input_names()
103
+ input_handle = predictor.get_input_handle(input_names[0])
104
+
105
+ for i in range(0, num):
106
+ if isinstance(imgs[i], str):
107
+ data = {"img": imgs[i]}
108
+ data = np.array([cfg.transforms(data)["img"]])
109
+ else:
110
+ data = imgs[i]
111
+ input_handle.reshape(data.shape)
112
+ input_handle.copy_from_cpu(data)
113
+ try:
114
+ predictor.run()
115
+ except Exception as e:
116
+ logger.info(str(e))
117
+ logger.info(
118
+ "Auto tune failed. Usually, the error is out of GPU memory " "for the model or image is too large. \n"
119
+ )
120
+ del predictor
121
+ if os.path.exists(args.auto_tuned_shape_file):
122
+ os.remove(args.auto_tuned_shape_file)
123
+ return
124
+
125
+ logger.info("Auto tune success.\n")
126
+
127
+
128
+ class Predictor:
129
+ def __init__(self, args):
130
+ """
131
+ Prepare for prediction.
132
+ The usage and docs of paddle inference, please refer to
133
+ https://paddleinference.paddlepaddle.org.cn/product_introduction/summary.html
134
+ """
135
+ self.args = args
136
+ self.cfg = DeployConfig(args.cfg)
137
+ self.processor = SamProcessor.from_pretrained(args.model_name_or_path)
138
+
139
+ self._init_base_config()
140
+
141
+ if args.device == "cpu":
142
+ self._init_cpu_config()
143
+ elif args.device == "npu":
144
+ self.pred_cfg.enable_custom_device("npu")
145
+ elif args.device == "xpu":
146
+ self.pred_cfg.enable_xpu()
147
+ else:
148
+ self._init_gpu_config()
149
+
150
+ try:
151
+ self.predictor = create_predictor(self.pred_cfg)
152
+ except Exception as e:
153
+ logger.info(str(e))
154
+ logger.info(
155
+ "If the above error is '(InvalidArgument) some trt inputs dynamic shape info not set, "
156
+ "..., Expected all_dynamic_shape_set == true, ...', "
157
+ "please set --enable_auto_tune=True to use auto_tune. \n"
158
+ )
159
+ exit()
160
+
161
+ def _init_base_config(self):
162
+ self.pred_cfg = PredictConfig(self.cfg.model, self.cfg.params)
163
+ pass_builder = self.pred_cfg.pass_builder()
164
+ pass_builder.delete_pass("identity_op_clean_pass")
165
+ self.pred_cfg.enable_memory_optim()
166
+ self.pred_cfg.switch_ir_optim(True)
167
+
168
+ def _init_cpu_config(self):
169
+ """
170
+ Init the config for x86 cpu.
171
+ """
172
+ logger.info("Use CPU")
173
+ self.pred_cfg.disable_gpu()
174
+ if self.args.enable_mkldnn:
175
+ logger.info("Use MKLDNN")
176
+ # cache 10 different shapes for mkldnn
177
+ self.pred_cfg.set_mkldnn_cache_capacity(10)
178
+ self.pred_cfg.enable_mkldnn()
179
+ self.pred_cfg.set_cpu_math_library_num_threads(self.args.cpu_threads)
180
+
181
+ def _init_gpu_config(self):
182
+ """
183
+ Init the config for nvidia gpu.
184
+ """
185
+ logger.info("Use GPU")
186
+ self.pred_cfg.enable_use_gpu(100, 0)
187
+
188
+ def run(self, image, prompt_out):
189
+ image, prompt_out = self.preprocess(image, prompt_out)
190
+ input_names = self.predictor.get_input_names()
191
+ input_handle1 = self.predictor.get_input_handle(input_names[0])
192
+ input_handle2 = self.predictor.get_input_handle(input_names[1])
193
+ output_names = self.predictor.get_output_names()
194
+ output_handle = self.predictor.get_output_handle(output_names[0])
195
+
196
+ input_handle1.reshape(image.shape)
197
+ input_handle1.copy_from_cpu(image.numpy())
198
+ if self.args.input_type == "boxs":
199
+ prompt_out = prompt_out.reshape([-1, 4])
200
+ input_handle2.reshape(prompt_out.shape)
201
+ input_handle2.copy_from_cpu(prompt_out.numpy())
202
+
203
+ self.predictor.run()
204
+
205
+ results = output_handle.copy_to_cpu()
206
+
207
+ results = self.postprocess(results)
208
+
209
+ return results
210
+
211
+ def preprocess(self, image, prompts):
212
+
213
+ image_seg, prompt = self.processor(
214
+ image,
215
+ input_type=self.args.input_type,
216
+ box=prompts["boxs"],
217
+ point_coords=prompts["points"],
218
+ )
219
+
220
+ return [image_seg, prompt]
221
+
222
+ def postprocess(self, results):
223
+ return self.processor.postprocess_masks(results)
224
+
225
+
226
+ @dataclass
227
+ class DataArguments:
228
+ """
229
+ Arguments pertaining to what data we are going to input our model for training and eval.
230
+ Using `PdArgumentParser` we can turn this class
231
+ into argparse arguments to be able to specify them on
232
+ the command line.
233
+ """
234
+
235
+ input_image: str = field(metadata={"help": "The name of input image."})
236
+ box_prompt: List[int] = field(default=None, metadata={"help": "box prompt, formatted as [x1 y1 x2 y2 ...]."})
+ points_prompt: List[int] = field(default=None, metadata={"help": "point prompt, formatted as [[x, y], [x, y], ...]."})
238
+
239
+
240
+ @dataclass
241
+ class ModelArguments:
242
+ """
243
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
244
+ """
245
+
246
+ model_name_or_path: str = field(
247
+ default="Sam/SamVitH-1024",
248
+ metadata={"help": "Path to pretrained model or model identifier"},
249
+ )
250
+ input_type: str = field(
251
+ default="boxs",
252
+ metadata={"help": "The model prompt type, choices ['boxs', 'points', 'points_grid']."},
253
+ )
254
+ cfg: str = field(
255
+ default=None,
256
+ metadata={"help": "The config file."},
257
+ )
258
+ use_trt: bool = field(
259
+ default=False,
260
+ metadata={"help": "Whether to use Nvidia TensorRT to accelerate prediction."},
261
+ )
262
+ precision: str = field(
263
+ default="fp32",
264
+ metadata={"help": "The tensorrt precision."},
265
+ )
266
+ min_subgraph_size: int = field(
267
+ default=3,
268
+ metadata={"help": "The min subgraph size in tensorrt prediction.'"},
269
+ )
270
+ enable_auto_tune: bool = field(
271
+ default=False,
272
+ metadata={
273
+ "help": "Whether to enable tuned dynamic shape. We uses some images to collect \
274
+ the dynamic shape for trt sub graph, which avoids setting dynamic shape manually."
275
+ },
276
+ )
277
+ device: str = field(
278
+ default="GPU",
279
+ metadata={"help": "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."},
280
+ )
281
+ cpu_threads: int = field(
282
+ default=10,
283
+ metadata={"help": "Number of threads to predict when using cpu."},
284
+ )
285
+ enable_mkldnn: bool = field(
286
+ default=False,
287
+ metadata={"help": "Enable to use mkldnn to speed up when using cpu."},
288
+ )
289
+
290
+ output_dir: str = field(
291
+ default="seg_output",
292
+ metadata={"help": "output directory."},
293
+ )
294
+ visual: bool = field(
295
+ default=True,
296
+ metadata={"help": "save visual image."},
297
+ )
298
+ benchmark: bool = field(
299
+ default=False,
300
+ metadata={"help": "benchmark"}
301
+ )
302
+
303
+
304
+ def main(model_args, data_args):
305
+
306
+ url = data_args.input_image
307
+ # read image
308
+ if os.path.isfile(url):
309
+
310
+ image_pil = Image.open(data_args.input_image).convert("RGB")
311
+ else:
312
+ image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB")
313
+
314
+ if data_args.box_prompt is not None:
315
+ data_args.box_prompt = np.array(data_args.box_prompt)
316
+ if data_args.points_prompt is not None:
317
+ data_args.points_prompt = np.array([data_args.points_prompt])
318
+
319
+ if use_auto_tune(model_args):
320
+ tune_img_nums = 10
321
+ auto_tune(model_args, [image_pil], tune_img_nums)
322
+
323
+ predictor = Predictor(model_args)
324
+
325
+ if model_args.benchmark:
326
+ import time
+
+ # the first 10 runs are warm-up; the next 10 are timed and averaged
+ total = 0.0
+ for i in range(20):
+ if i >= 10:
+ start = time.time()
+ seg_masks = predictor.run(image_pil, {"points": data_args.points_prompt, "boxs": data_args.box_prompt})
+ if i >= 10:
+ total += time.time() - start
+
+ print("Average time (s):", total / 10)
337
+
338
+ seg_masks = predictor.run(image_pil, {"points": data_args.points_prompt, "boxs": data_args.box_prompt})
339
+
340
+ if model_args.visual:
341
+ # make dir
342
+ os.makedirs(model_args.output_dir, exist_ok=True)
343
+ # draw output image
344
+ plt.figure(figsize=(10, 10))
345
+ plt.imshow(image_pil)
346
+ for mask in seg_masks:
347
+ show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
348
+
349
+ plt.axis("off")
350
+ plt.savefig(
351
+ os.path.join(model_args.output_dir, "mask_pred.jpg"),
352
+ bbox_inches="tight",
353
+ dpi=300,
354
+ pad_inches=0.0,
355
+ )
356
+
357
+ if use_auto_tune(model_args) and os.path.exists(model_args.auto_tuned_shape_file):
358
+ os.remove(model_args.auto_tuned_shape_file)
359
+
360
+
361
+ if __name__ == "__main__":
362
+
363
+ parser = PdArgumentParser((ModelArguments, DataArguments))
364
+ model_args, data_args = parser.parse_args_into_dataclasses()
365
+
366
+ model_args.device = model_args.device.upper()
367
+ assert model_args.device in [
368
+ "CPU",
369
+ "GPU",
370
+ "XPU",
371
+ "NPU",
372
+ ], "device should be CPU, GPU, XPU or NPU"
373
+
374
+ main(model_args, data_args)
PaddleMIX/docs/hardware_support/ascend_usage.md ADDED
@@ -0,0 +1,222 @@
+ # Using PaddleMIX on Ascend NPUs
+
+ To meet users' needs for a diverse range of AI chips, the PaddleMIX team has deeply adapted the Ascend 910 chip, building on the PaddlePaddle framework's strengths in hardware compatibility and flexibility, so that training and inference are available on this domestically produced accelerator. After installing the multi-hardware build of PaddlePaddle as described in the installation section, you only need to add a single device parameter to a model's configuration file to run PaddleMIX on that hardware. The current Ascend adaptation of PaddleMIX covers the multimodal understanding models InternVL2 and LLaVA and the multimodal generation models SD3 and SDXL. We will continue to adapt more PaddleMIX models to the compute platforms our users rely on; stay tuned.
+
+ ## 1. Model List
+ <table align="center">
+ <tbody>
+ <tr align="center" valign="center">
+ <td>
+ <b>Multimodal Understanding</b>
+ </td>
+ <td>
+ <b>Multimodal Generation</b>
+ </td>
+ </tr>
+ <tr valign="top">
+ <td>
+ <ul>
+ </ul>
+ <li><b>Image-Text Pretraining</b></li>
+ <ul>
+ <li><a href="../../paddlemix/examples/llava">LLaVA-1.6</a></li>
+ <li><a href="../../paddlemix/examples/internvl2">InternVL2</a></li>
+ </ul>
+ </td>
+ <td>
+ <ul>
+ </ul>
+ <li><b>Text-to-Image</b></li>
+ <ul>
+ <li><a href="../../ppdiffusers/examples/stable_diffusion">Stable Diffusion</a></li>
+ <li><a href="../../ppdiffusers/examples/dreambooth/README_sd3.md">Stable Diffusion 3 (SD3)</a></li>
+ </ul>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+
+ ## 2. Installation
+
+ ### 2.1 Create a Standardized Environment
+
+ PaddleMIX currently supports the Ascend 910B chip with Ascend driver version 23.0.3. To avoid differences between environments, we recommend preparing the environment with the standard image provided by PaddlePaddle (available for both x86 and Arm servers).
+
+ Start the container with a command like the one below; ASCEND_RT_VISIBLE_DEVICES specifies which NPU cards are visible.
+
+ ```shell
+ docker run -it --name paddle-npu-dev -v $(pwd):/work \
+ --privileged --network=host --shm-size=128G -w=/work \
+ -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+ -v /usr/local/dcmi:/usr/local/dcmi \
+ -e ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+ registry.baidubce.com/device/paddle-npu:cann80T13-ubuntu20-$(uname -m)-gcc84-py39 /bin/bash
+ ```
+
+ ### 2.2 Install PaddlePaddle
+
+ Install PaddlePaddle inside the container.
+
+ ```shell
+ # Note: install the PaddlePaddle CPU build first; currently only Python 3.9 is supported
+ python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
+ python -m pip install --pre paddle-custom-npu -i https://www.paddlepaddle.org.cn/packages/nightly/npu/
+ ```
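+
+ As a quick sanity check (our suggestion, not part of the original guide), you can confirm that the NPU build is visible to PaddlePaddle before continuing; the device string `"npu:0"` assumes the default custom-device naming:
+
+ ```python
+ import paddle
+
+ # should print ['npu'] when paddle-custom-npu is installed correctly
+ print(paddle.device.get_all_custom_device_type())
+
+ # run a tiny op on the NPU to make sure kernels dispatch
+ paddle.device.set_device("npu:0")
+ print(paddle.matmul(paddle.ones([2, 3]), paddle.ones([3, 2])))
+ ```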
+
+ ### 2.3 Install PaddleMIX
+
+ Clone the PaddleMIX repository.
+
+ ```shell
+ # use the latest released release/2.1 branch
+ git clone https://github.com/PaddlePaddle/PaddleMIX -b release/2.1
+ cd PaddleMIX
+ ```
+
+ ### 2.4 Install Dependencies
+
+ ```shell
+ sh build_env.sh
+ python -m pip install -U librosa
+ ```
+
+ ## 3. Multimodal Understanding
+
+ Multimodal large language models (Multimodal LLMs) are a hot research topic and saw explosive progress in 2024. They convert multimodal inputs into tokens aligned with text through a dedicated multimodal encoder and feed them into a large language model to perform multimodal tasks. PaddleMIX 2.1 adds two new families of multimodal LLMs, InternVL2 and Qwen2-VL, supporting both instruction fine-tuning and inference deployment. Their capabilities cover image Q&A, document and chart understanding, key information extraction, scene-text understanding, OCR, science and math Q&A, video understanding, multi-image joint understanding, and more.
+
+ The InternVL2 models support training and inference on the Ascend 910B chip. Before training or inference on Ascend 910B, install the corresponding PaddlePaddle build as described in the installation section above. InternVL2 training and inference are used as follows:
+
+ ### 3.1 Fine-tuning
+
+ #### 3.1.1 Data Preparation
+
+ Prepare the data following the [documentation](../../paddlemix/examples/internvl2).
+
+ #### 3.1.2 Environment Setup
+
+ Set the NPU-related environment variables.
+
+ ```shell
+ export FLAGS_use_stride_kernel=0
+ export FLAGS_npu_storage_format=0 # disable private NPU storage formats
+ export FLAGS_npu_jit_compile=0 # disable just-in-time compilation
+ export FLAGS_npu_scale_aclnn=True # aclnn acceleration
+ export FLAGS_npu_split_aclnn=True # aclnn acceleration
+ export CUSTOM_DEVICE_BLACK_LIST=set_value,set_value_with_tensor # put set_value on the blacklist
+
+ # add ppdiffusers to PYTHONPATH
+ export PYTHONPATH=`pwd`/ppdiffusers:$PYTHONPATH
+ ```
+
+ #### 3.1.3 Launch Fine-tuning
+
+ Launch fine-tuning; see the [PaddleMIX toolbox introduction](../..//paddlemix/tools/README.md) for detailed parameter descriptions.
+
+ ```shell
+ # take the 2B weights as an example
+ sh paddlemix/examples/internvl2/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh
+ ```
+
+ ### 3.2 Inference
+
+ #### 3.2.1 Environment Setup
+
+ Set the environment variables as in the steps above.
+
+ #### 3.2.2 Run Inference
+
+ ```shell
+ python paddlemix/examples/internvl2/chat_demo.py \
+ --model_name_or_path "OpenGVLab/InternVL2-2B" \
+ --image_path 'paddlemix/demo_images/examples_image1.jpg' \
+ --text "Please describe this image in detail."
+ ```
+
+ ## 4. Multimodal Generation
+
+ PPDiffusers provides personalized fine-tuning examples for SD3: a customized SD3 model can be built from only a handful of subject images, with both DreamBooth LoRA fine-tuning and full-parameter DreamBooth fine-tuning supported. For inference, a high-performance SD3 inference implementation is provided.
+
+ The Stable Diffusion family of generation models supports training and inference on the Ascend 910B chip. Before training or inference on Ascend 910B, install the corresponding PaddlePaddle build as described in the installation section above. SDXL training and inference are used as follows:
+
+ ### 4.1 Training
+
+ #### 4.1.1 Environment Setup
+
+ Set the following environment variables when training SDXL on Ascend 910B.
+
+ ```shell
+ export FLAGS_npu_storage_format=0
+ export FLAGS_use_stride_kernel=0
+ export FLAGS_npu_scale_aclnn=True
+ export FLAGS_allocator_strategy=auto_growth
+
+ export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+ export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
+ export DATASET_NAME="lambdalabs/naruto-blip-captions"
+ export HF_ENDPOINT=https://hf-mirror.com
+ export FLAGS_conv_workspace_size_limit=4096
+
+ # add ppdiffusers to PYTHONPATH
+ export PYTHONPATH=`pwd`/ppdiffusers:$PYTHONPATH
+ ```
+
+ #### 4.1.2 Launch SDXL Fine-tuning
+
+ ```shell
+ python -u ppdiffusers/examples/text_to_image/train_text_to_image_sdxl.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --pretrained_vae_model_name_or_path=$VAE_NAME \
+ --dataset_name=$DATASET_NAME \
+ --resolution=512 --center_crop --random_flip \
+ --proportion_empty_prompts=0.2 \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=4 --gradient_checkpointing \
+ --max_train_steps=10000 \
+ --learning_rate=1e-06 --lr_scheduler="constant" --lr_warmup_steps=0 \
+ --mixed_precision="fp16" \
+ --report_to="wandb" \
+ --validation_prompt="a cute Sundar Pichai creature" --validation_epochs 5 \
+ --checkpointing_steps=5000 \
+ --output_dir="sdxl-pokemon-model"
+ ```
+
+ #### 4.1.3 Launch SDXL LoRA Training
+
+ ```shell
+ python -u ppdiffusers/examples/text_to_image/train_text_to_image_lora_sdxl.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --pretrained_vae_model_name_or_path=$VAE_NAME \
+ --dataset_name=$DATASET_NAME --caption_column="text" \
+ --resolution=1024 --random_flip \
+ --train_batch_size=1 \
+ --num_train_epochs=2 --checkpointing_steps=500 \
+ --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
+ --mixed_precision="fp16" \
+ --seed=42 \
+ --output_dir="sd-pokemon-model-lora-sdxl" \
+ --validation_prompt="cute dragon creature" \
+ --report_to="wandb"
+ ```
+
+ ### 4.2 Inference
+
+ A reference inference script that loads the fine-tuned UNet together with the fp16 VAE:
+
+ ```python
+ import paddle
+ from ppdiffusers import (
+     AutoencoderKL,
+     StableDiffusionXLPipeline,
+     UNet2DConditionModel,
+ )
+
+ unet_path = "your-checkpoint/unet"
+ vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", paddle_dtype=paddle.float16)
+ unet = UNet2DConditionModel.from_pretrained(unet_path, paddle_dtype=paddle.float16)
+ pipe = StableDiffusionXLPipeline.from_pretrained(
+     "stabilityai/stable-diffusion-xl-base-1.0", vae=vae, unet=unet, paddle_dtype=paddle.float16
+ )
+
+ prompt = "A pokemon with green eyes and red legs."
+ image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+ image.save("pokemon.png")
+ ```
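+
+ To try the LoRA weights produced in section 4.1.3, the snippet below is a minimal sketch (not an official example); it assumes ppdiffusers mirrors the diffusers `load_lora_weights` API and that the output directory matches the training command above.
+
+ ```python
+ import paddle
+ from ppdiffusers import StableDiffusionXLPipeline
+
+ pipe = StableDiffusionXLPipeline.from_pretrained(
+     "stabilityai/stable-diffusion-xl-base-1.0", paddle_dtype=paddle.float16
+ )
+ # assumption: load_lora_weights is available, as in diffusers
+ pipe.load_lora_weights("sd-pokemon-model-lora-sdxl")
+
+ image = pipe("cute dragon creature", num_inference_steps=30, guidance_scale=7.5).images[0]
+ image.save("pokemon_lora.png")
+ ```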
PaddleMIX/paddlemix/datasets/__init__.py ADDED
@@ -0,0 +1,37 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Standard imports
16
+
17
+ # Local imports
18
+ from .caption_dataset import *
19
+ from .chatml_dataset import *
20
+ from .coco_caption import *
21
+ from .coco_clip import *
22
+ from .collator import *
23
+ from .dataset import *
24
+ from .mixtoken_dataset import *
25
+ from .vg_caption import *
26
+
27
+ import pkg_resources
+
+ try:
+ version = pkg_resources.get_distribution("paddlenlp").version
+ except pkg_resources.DistributionNotFound:
+ version = None
+ print("paddlenlp is not installed.")
+
+ # the InternVL2 dataset requires paddlenlp 3.x
+ if version is not None and version.startswith("3"):
+ from .internvl_dataset import *
+ elif version is not None:
+ print(f"paddlenlp version {version} is not 3.x, skipping import of the internvl2 datasets.")
PaddleMIX/paddlemix/datasets/caption_dataset.py ADDED
@@ -0,0 +1,109 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import collections
16
+ import json
17
+ import os
18
+
19
+ from paddle.utils.download import get_path_from_url
20
+
21
+ from paddlemix.utils.env import DATA_HOME
22
+ from paddlemix.utils.log import logger
23
+
24
+ from .dataset import DatasetBuilder
25
+
26
+ __all__ = ["CaptionDataset"]
27
+
28
+
29
+ class CaptionDataset(DatasetBuilder):
30
+ """
31
+ Caption dataset.
32
+ """
33
+
34
+ URL = "https://bj.bcebos.com/v1/paddlenlp/datasets/paddlemix/coco.tar"
35
+ META_INFO = collections.namedtuple("META_INFO", ("images", "annotations", "images_md5", "annotations_md5"))
36
+ MD5 = ""
37
+ SPLITS = {
38
+ "train": META_INFO(
39
+ os.path.join("coco", "images"),
40
+ os.path.join("coco", "annotations/coco_karpathy_train.json"),
41
+ "",
42
+ "",
43
+ ),
44
+ "val": META_INFO(
45
+ os.path.join("coco", "images"),
46
+ os.path.join("coco", "annotations/coco_karpathy_val.json"),
47
+ "",
48
+ "",
49
+ ),
50
+ "test": META_INFO(
51
+ os.path.join("coco", "images"),
52
+ os.path.join("coco", "annotations/coco_karpathy_test.json"),
53
+ "",
54
+ "",
55
+ ),
56
+ }
57
+
58
+ def _get_data(self, mode, **kwargs):
59
+ logger.info("default dataset root is {}".format(DATA_HOME))
60
+ images, annotations, image_hash, anno_hash = self.SPLITS[mode]
61
+ image_fullname = os.path.join(DATA_HOME, images)
62
+ anno_fullname = os.path.join(DATA_HOME, annotations)
63
+ if not os.path.exists(image_fullname) or not os.path.exists(anno_fullname):
64
+ get_path_from_url(self.URL, DATA_HOME)
65
+
66
+ return image_fullname, anno_fullname, mode
67
+
68
+ def _gen_image_id(self, anno):
69
+ img_ids = {}
70
+ n = 0
71
+ for ann in anno:
72
+ img_id = ann["image_id"]
73
+ if img_id not in img_ids.keys():
74
+ img_ids[img_id] = n
75
+ n += 1
76
+ return img_ids
77
+
78
+ def _gen_image_id_eval(self, anno):
79
+ img_ids = {}
80
+ n = 0
81
+ for ann in anno:
82
+ img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]
83
+ if img_id not in img_ids.keys():
84
+ img_ids[img_id] = n
85
+ n += 1
86
+ return img_ids
87
+
88
+ def _read(self, filename, *args):
89
+ image_root, anno_path, mode = filename
90
+ annotations = json.load(open(anno_path, "r"))
91
+ if mode == "val" or mode == "test":
92
+ image_ids = self._gen_image_id_eval(annotations)
93
+ else:
94
+ image_ids = self._gen_image_id(annotations)
95
+ for ann in annotations:
96
+ image_path = os.path.join(image_root, ann["image"])
97
+ if mode == "train":
98
+ yield_data = {
99
+ "image": image_path,
100
+ "image_id": image_ids[ann["image_id"]],
101
+ }
102
+ # only train mode has text input
103
+ yield_data["text_input"] = ann["caption"]
104
+ else:
105
+ yield_data = {
106
+ "image": image_path,
107
+ "image_id": ann["image"].split("/")[-1].strip(".jpg").split("_")[-1],
108
+ }
109
+ yield yield_data
PaddleMIX/paddlemix/datasets/cc_sbu_dataset.py ADDED
@@ -0,0 +1,93 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import collections
17
+ import json
18
+ import os
19
+
20
+ from paddle.dataset.common import md5file
21
+ from paddle.utils.download import get_path_from_url
22
+
23
+ from paddlemix.utils.env import DATA_HOME
24
+ from paddlemix.utils.log import logger
25
+
26
+ # from dataset import DatasetBuilder
27
+ from .dataset import DatasetBuilder
28
+
29
+
30
+
31
+ __all__ = ["CCSBUAlignDataset"]
32
+
33
+
34
+ class CCSBUAlignDataset(DatasetBuilder):
35
+ """
36
+ CCSBUAlignDataset dataset.
37
+ """
38
+
39
+ URL = "https://paddlenlp.bj.bcebos.com/datasets/cc_sbu_align.zip"
40
+ META_INFO = collections.namedtuple(
41
+ "META_INFO", ("images", "annotations", "num_images", "annotations_md5")
42
+ )
43
+ MD5 = "d5fa38be915c8a2aee7ebf3a9c56a95c"
44
+ SPLITS = {
45
+ "train": META_INFO(
46
+ os.path.join("cc_sbu_align", "image"),
47
+ os.path.join("cc_sbu_align", "filter_cap.json"),
48
+ 3439,
49
+ "fa3508b6ac29e0ddc7246683d0c3d9a2",
50
+ ),
51
+ }
52
+
53
+ def count_files(self, path):
54
+ if not os.path.isdir(path):
55
+ raise ValueError("A directory expected for path, but received {}".format(path))
56
+ pathes = os.listdir(path)
57
+ return len(pathes)
58
+
59
+ def _get_data(self, mode, **kwargs):
60
+ logger.info("default dataset root is {}".format(DATA_HOME))
61
+ images, annotations, num_images, anno_hash = self.SPLITS[mode]
62
+ image_fullname = os.path.join(DATA_HOME, images)
63
+ anno_fullname = os.path.join(DATA_HOME, annotations)
64
+
65
+ if (
+ (not os.path.exists(image_fullname))
+ or (not os.path.exists(anno_fullname))
+ or (md5file(anno_fullname) != anno_hash)
+ or num_images != self.count_files(image_fullname)
+ ):
66
+ get_path_from_url(self.URL, DATA_HOME, self.MD5)
67
+
68
+ return image_fullname, anno_fullname, mode
69
+
70
+ def _gen_image_id(self, anno):
71
+ img_ids = {}
72
+ n = 0
73
+ for ann in anno:
74
+ # an ann example: {'image_id': '2', 'caption': 'The image shows a man fishing on a lawn next to a river with a bridge in the background. Trees can be seen on the other side of the river, and the sky is cloudy.'}
75
+ img_id = ann["image_id"]
76
+ if img_id not in img_ids.keys():
77
+ img_ids[img_id] = n
78
+ n += 1
79
+ return img_ids
80
+
81
+ def _read(self, filename, *args):
82
+ image_root, anno_path, mode = filename
83
+ with open(anno_path, "r", encoding="utf8") as f:
84
+ annotations = json.load(f)["annotations"]
85
+ image_ids = self._gen_image_id(annotations)
86
+
87
+ for ann in annotations:
88
+ image_path = os.path.join(image_root, ann["image_id"]+".jpg")
89
+ yield_data = {"image": image_path, "image_id": image_ids[ann["image_id"]]}
90
+ if mode == "train":
91
+ # only train mode has text input
92
+ yield_data["text_input"] = ann["caption"]
93
+ yield yield_data
PaddleMIX/paddlemix/datasets/chatml_dataset.py ADDED
@@ -0,0 +1,50 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import json
17
+
18
+ from paddlenlp.transformers.tokenizer_utils import ChatTemplateMixin
19
+
20
+ from .dataset import DatasetBuilder
21
+
22
+ __all__ = ["ChatMLDataset"]
23
+
24
+
25
+ class ChatMLDataset(DatasetBuilder, ChatTemplateMixin):
26
+ """
27
+ ChatMLDataset dataset.
28
+ """
29
+
30
+ SPLITS = {"train": "train.json", "val": "eval.json", "test": "test.json"}
31
+
32
+ def _read(self, filename, *args):
33
+ if self.config["chat_template"] is not None:
34
+ self.init_chat_template(self.config["chat_template"])
35
+ raw_data = json.load(open(filename, "r"))
36
+ annotations = raw_data
37
+
38
+ for ann in annotations:
39
+ yield_data = {}
40
+ conversations = ann["conversations"]
41
+ if self.config["chat_template"] is not None:
42
+ conversations.append([""])
43
+ yield_data["conversations"] = self.apply_chat_template(conversations, tokenize=False)
44
+ else:
45
+ yield_data["conversations"] = conversations
46
+
47
+ if "image" in ann.keys():
48
+ yield_data["image"] = ann["image"]
49
+
50
+ yield yield_data
PaddleMIX/paddlemix/datasets/coco_caption.py ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from paddlemix.datasets.caption_dataset import CaptionDataset
16
+
17
+ COCOCaption = CaptionDataset
PaddleMIX/paddlemix/datasets/coco_vqa.py ADDED
@@ -0,0 +1,138 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import collections
16
+ import json
17
+ import os
18
+
19
+ from paddle.utils.download import get_path_from_url
20
+
21
+ from paddlemix.utils.env import DATA_HOME
22
+ from paddlemix.utils.log import logger
23
+
24
+ from .dataset import DatasetBuilder
25
+
26
+ __all__ = ["VQADataset"]
27
+
28
+
29
+ class VQADataset(DatasetBuilder):
30
+ """
31
+ Caption dataset.
32
+ """
33
+
34
+ URL = "https://bj.bcebos.com/v1/paddlenlp/datasets/paddlemix/coco.tar"
35
+ META_INFO = collections.namedtuple("META_INFO", ("images", "annotations", "images_md5", "annotations_md5"))
36
+ MD5 = ""
37
+ SPLITS = {
38
+ "train": META_INFO(
39
+ os.path.join("coco", "images"),
40
+ [os.path.join("coco", "annotations/vqa_train.json"), os.path.join("coco", "annotations/vqa_val.json")],
41
+ "",
42
+ "",
43
+ ),
44
+ "val": META_INFO(
45
+ os.path.join("coco", "images"),
46
+ [
47
+ os.path.join("coco", "annotations/vqa_val_eval.json"),
48
+ os.path.join("coco", "annotations/answer_list.json"),
49
+ os.path.join("coco", "annotations/v2_OpenEnded_mscoco_val2014_questions.json"),
50
+ os.path.join("coco", "annotations/v2_mscoco_val2014_annotations.json"),
51
+ ],
52
+ "",
53
+ "",
54
+ ),
55
+ "test": META_INFO(
56
+ os.path.join("coco", "images"),
57
+ [
58
+ os.path.join("coco", "annotation/vqa_test.json"),
59
+ os.path.join("coco", "annotation/vqa_test.json"),
60
+ ],
61
+ "",
62
+ "",
63
+ ),
64
+ }
65
+
66
+ def _get_data(self, mode, **kwargs):
67
+ logger.info("default dataset root is {}".format(DATA_HOME))
68
+ images, annotations, image_hash, anno_hash = self.SPLITS[mode]
69
+ image_fullname = os.path.join(DATA_HOME, images)
70
+ if isinstance(annotations, (list, tuple)):
71
+ anno_fullname = []
72
+ for ann in annotations:
73
+ anno_fullname.append(os.path.join(DATA_HOME, ann))
74
+ if not os.path.exists(image_fullname) or not os.path.exists(os.path.join(DATA_HOME, ann)):
75
+ get_path_from_url(self.URL, DATA_HOME)
76
+ else:
77
+ anno_fullname = os.path.join(DATA_HOME, annotations)
78
+ if not os.path.exists(image_fullname) or not os.path.exists(anno_fullname):
79
+ get_path_from_url(self.URL, DATA_HOME)
80
+ return image_fullname, anno_fullname, mode
81
+
82
+ def _read(self, filename, *args):
83
+ if isinstance(filename, (list, tuple)):
84
+ image_root, anno_path, mode = filename
85
+ else:
86
+ anno_path = [filename]
87
+ image_root = ""
88
+ mode = "train"
89
+ annotations = []
90
+ if mode == "val" or mode == "test":
91
+ annotations = json.load(open(anno_path[0]))
92
+ image_ids = self._gen_image_id_eval(annotations)
93
+ else:
94
+ for ann_p in anno_path:
95
+ annotations.extend(json.load(open(ann_p, "r")))
96
+ image_ids = self._gen_image_id(annotations)
97
+ for ann in annotations:
98
+ image_path = os.path.join(image_root, ann["image"])
99
+ if mode == "train":
100
+ yield_data = {
101
+ "image": image_path,
102
+ }
103
+ yield_data["text_input"] = ann["question"]
104
+ yield_data["answers"] = ann["answer"]
105
+ yield_data["image_ids"] = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]
106
+
107
+ else:
108
+ yield_data = {
109
+ "image": image_path,
110
+ "text_input": ann["question"],
111
+ "question_id": ann["question_id"],
112
+ "image_id": ann["image"].split("/")[-1].strip(".jpg").split("_")[-1],
113
+ }
114
+ yield_data["image_ids"] = ann["image_ids"]
115
+ yield yield_data
116
+
117
+ def _gen_image_id(self, anno):
118
+ img_ids = {}
119
+ n = 0
120
+ for ann in anno:
121
+ if "image_id" not in ann.keys():
122
+ img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]
123
+ else:
124
+ img_id = ann["image_id"]
125
+ if img_id not in img_ids.keys():
126
+ img_ids[img_id] = n
127
+ n += 1
128
+ return img_ids
129
+
130
+ def _gen_image_id_eval(self, anno):
131
+ img_ids = {}
132
+ n = 0
133
+ for ann in anno:
134
+ img_id = ann["image"].split("/")[-1].strip(".jpg").split("_")[-1]
135
+ if img_id not in img_ids.keys():
136
+ img_ids[img_id] = n
137
+ n += 1
138
+ return img_ids
PaddleMIX/paddlemix/datasets/collator.py ADDED
@@ -0,0 +1,362 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import paddle
17
+
18
+
19
+ class CLIPCollator:
20
+ """
21
+ Data collator that will dynamically pad the inputs to the longest sequence in the batch.
22
+ Args:
23
+ processor (`paddlemix.processors.ProcessorMixin`):
24
+ The processor used for pre-process the data.
25
+ """
26
+
27
+ def __init__(self, processor):
28
+ self.processor = processor
29
+
30
+ def __call__(self, data_list):
31
+ if isinstance(data_list[0], dict):
32
+ images = [sample["image"] for sample in data_list]
33
+ text = [sample["text_input"] for sample in data_list]
34
+ batch = self.processor(
35
+ images=images,
36
+ text=text,
37
+ max_length=77,
38
+ return_tensors="pd",
39
+ return_attention_mask=False,
40
+ mode="train",
41
+ padding_zero=True,
42
+ )
43
+ return batch
44
+ else:
45
+ images = [sample[0] for sample in data_list]
46
+ labels = [sample[1] for sample in data_list]
47
+ batch = self.processor(
48
+ images=images,
49
+ text=None,
50
+ max_length=77,
51
+ return_tensors="pd",
52
+ return_attention_mask=False,
53
+ mode="eval",
54
+ do_resize=True,
55
+ do_crop=True,
56
+ padding_zero=True,
57
+ )
58
+ batch["labels"] = paddle.to_tensor(np.array(labels))
59
+ return batch
60
+
61
+
62
+ class EVA02Collator:
63
+ """
64
+ Data collator that will dynamically pad the inputs to the longest sequence in the batch.
65
+ Args:
66
+ processor (`paddlemix.processors.ProcessorMixin`):
67
+ The processor used for pre-process the data.
68
+ """
69
+
70
+ def __init__(self, processor, mode="train"):
71
+ self.processor = processor
72
+ self.mode = mode
73
+
74
+ def __call__(self, data_list):
75
+ images = [sample[0] for sample in data_list]
76
+ # get labels from teacher's clip_features
77
+ batch = self.processor(
78
+ images=images,
79
+ return_tensors="pd",
80
+ mode=self.mode,
81
+ )
82
+ return batch
83
+
84
+
85
+ class MiniGPT4Collator:
86
+ """
87
+ Data collator that will dynamically pad the inputs to the longest sequence in the batch.
88
+ Args:
89
+ processor (`paddlemix.processors.ProcessorMixin`):
90
+ The processor used for pre-process the data.
91
+ """
92
+
93
+ def __init__(self, processor, mode="test"):
94
+ self.processor = processor
95
+ self.mode = mode
96
+
97
+ def __call__(self, data_list):
98
+ images = [sample["image"] for sample in data_list]
99
+ target_texts = [sample["text_input"] for sample in data_list]
100
+ # random text from text_list read by processor and combine it with default prompt
101
+ batch_data = self.processor(images=images, mode="train")
102
+ target_outputs = self.processor.process_target_texts(target_texts)
103
+ batch_data.update(target_outputs)
104
+ return batch_data
105
+
106
+
107
+ class QwenVLCollator:
108
+ """
109
+ Data collator that will dynamically pad the inputs to the longest sequence in the batch.
110
+ Args:
111
+ processor (`paddlemix.processors.ProcessorMixin`):
112
+ The processor used to pre-process the data.
113
+ """
114
+
115
+ def __init__(self, processor, mode="test"):
116
+ self.processor = processor
117
+ self.mode = mode
118
+
119
+ def __call__(self, data_list):
120
+ input_ids = []
121
+ labels = []
122
+ images = []
123
+ IGNORE_TOKEN_ID = -100
124
+ for record in data_list:
125
+
126
+ if isinstance(record, dict) and "input_ids" in record.keys():
127
+ raw_data = record
128
+ else:
129
+ raw_data = self.processor(query=record, mode=self.mode)
130
+
131
+ raw_data["input_ids"] += [self.processor.tokenizer.pad_token_id] * (
132
+ self.processor.max_len - len(raw_data["input_ids"])
133
+ )
134
+ raw_data["labels"] += [IGNORE_TOKEN_ID] * (self.processor.max_len - len(raw_data["labels"]))
135
+ input_ids.append(raw_data["input_ids"])
136
+ labels.append(raw_data["labels"])
137
+
138
+ if "images" in raw_data:
139
+
140
+ if isinstance(raw_data["images"], list):
141
+ if not isinstance(raw_data["images"][0], list):
142
+ raw_data["images"] = [raw_data["images"]]
143
+ raw_data["images"] = [self.processor.image_processor(path) for path in raw_data["images"]]
144
+ raw_data["images"] = paddle.stack(x=raw_data["images"], axis=0)
145
+
146
+ images.append(raw_data["images"])
147
+
148
+ input_ids = paddle.to_tensor(data=input_ids, dtype="int32")
149
+ labels = paddle.to_tensor(data=labels, dtype="int32")
150
+ attention_mask = input_ids.not_equal(y=paddle.to_tensor(self.processor.tokenizer.pad_token_id, dtype="int32"))
151
+
152
+ if len(images) > 0:
153
+ images = paddle.concat(images, axis=0)
154
+ image_shape = [-1, 3] + images.shape[-2:]
155
+ images = images.reshape(image_shape)
156
+
157
+ batch_data = dict(
158
+ input_ids=input_ids,
159
+ labels=labels,
160
+ images=images if 0 < len(images) else None,
161
+ attention_mask=attention_mask,
162
+ )
163
+
164
+ return batch_data
165
+
166
+
167
+ class VisualglmCollator:
168
+ """
169
+ Data collator that will dynamically pad the inputs to the longest sequence in the batch.
170
+ Args:
171
+ processor (`paddlemix.processors.ProcessorMixin`):
172
+ The processor used to pre-process the data.
173
+ """
174
+
175
+ def __init__(self, processor, mode="test", max_seq_length=2048):
176
+ self.processor = processor
177
+ self.mode = mode
178
+ self.max_seq_length = max_seq_length
179
+
180
+ def __call__(self, data_list):
181
+
182
+ input_ids = []
183
+ labels = []
184
+ images = []
185
+
186
+ for record in data_list:
187
+ if "input_ids" not in record.keys():
188
+ raw_data = self.processor(record=record, mode=self.mode)
189
+ else:
190
+ raw_data = record
191
+
192
+ pad_len = self.max_seq_length - len(raw_data["input_ids"])
193
+ raw_data["input_ids"] = raw_data["input_ids"] + [self.processor.tokenizer.pad_token_id] * pad_len
194
+ raw_data["labels"] = raw_data["labels"] + [self.processor.tokenizer.pad_token_id] * pad_len
195
+ raw_data["labels"] = [
196
+ (l if l != self.processor.tokenizer.pad_token_id else -100) for l in raw_data["labels"]
197
+ ]
198
+
199
+ if "images" in raw_data:
200
+ if isinstance(raw_data["images"], list):
201
+ raw_data["images"] = paddle.stack(x=raw_data["images"], axis=0)
202
+ images.append(raw_data["images"])
203
+
204
+ input_ids.append(raw_data["input_ids"])
205
+ labels.append(raw_data["labels"])
206
+
207
+ input_ids = paddle.to_tensor(data=input_ids, dtype="int64")
208
+ labels = paddle.to_tensor(data=labels, dtype="int64")
209
+
210
+ if 0 < len(images):
211
+ images = paddle.concat(images, axis=0)
212
+ image_shape = [-1, 3] + images.shape[-2:]
213
+ images = images.reshape(image_shape)
214
+
215
+ batch_data = dict(input_ids=input_ids, labels=labels, images=images if 0 < len(images) else None)
216
+ return batch_data
217
+
218
+
219
+ class LLaVACollator:
220
+ """
221
+ Data collator that will dynamically pad the inputs to the longest sequence in the batch.
222
+ Args:
223
+ processor (`paddlemix.processors.ProcessorMixin`):
224
+ The processor used to pre-process the data.
225
+ """
226
+
227
+ def __init__(self, processor, mode="test", mixtokens=False):
228
+ self.processor = processor
229
+ self.mode = mode
230
+ self.mixtokens = mixtokens
231
+
232
+ def __call__(self, data_list):
233
+ IGNORE_INDEX = -100
234
+ input_ids = []
235
+ labels = []
236
+ images = []
237
+ for record in data_list:
238
+
239
+ if isinstance(record, dict) and "input_ids" in record.keys():
240
+ raw_data = record
241
+ else:
242
+ raw_data = self.processor(record=record, mode=self.mode)
243
+
244
+ raw_data["input_ids"] += [self.processor.tokenizer.pad_token_id] * (
245
+ self.processor.max_len - len(raw_data["input_ids"])
246
+ )
247
+ raw_data["labels"] += [IGNORE_INDEX] * (self.processor.max_len - len(raw_data["labels"]))
248
+
249
+ input_ids.append(raw_data["input_ids"])
250
+ labels.append(raw_data["labels"])
251
+
252
+ if "images" in raw_data:
253
+ if isinstance(raw_data["images"], list):
254
+ raw_data["images"] = paddle.stack(x=raw_data["images"], axis=0)
255
+
256
+ images.append(raw_data["images"])
257
+
258
+ input_ids = paddle.to_tensor(data=input_ids, dtype="int32")
259
+ labels = paddle.to_tensor(data=labels, dtype="int32")
260
+ attention_mask = input_ids.not_equal(y=paddle.to_tensor(self.processor.tokenizer.pad_token_id, dtype="int32"))
261
+
262
+ if len(images) > 0:
263
+ images = paddle.concat(images, axis=0)
264
+ image_shape = [-1, 3] + images.shape[-2:]
265
+ images = images.reshape(image_shape)
266
+
267
+ batch_data = dict(
268
+ input_ids=input_ids,
269
+ labels=labels,
270
+ images=images if len(images) > 0 else None,
271
+ attention_mask=attention_mask,
272
+ )
273
+
274
+ return batch_data
275
+
276
+
277
+ class InternLMXComposer2Collator:
278
+ """Collate examples for InternLMXComposer2Collator"""
279
+
280
+ def __init__(self, processor, mode="train"):
281
+ self.processor = processor
282
+ self.mode = mode
283
+
284
+ def __call__(self, instances):
285
+
286
+ instances = [self.processor(query=instance, mode=self.mode) for instance in instances]
287
+
288
+ input_tokens, input_text = tuple(
289
+ [instance[key] for instance in instances] for key in ("input_tokens", "input_text")
290
+ )
291
+ batch = dict(
292
+ input_tokens=input_tokens,
293
+ input_text=input_text,
294
+ )
295
+ if "images" in instances[0].keys():
296
+ input_images = tuple([instance["images"] for instance in instances])
297
+ batch["images"] = input_images
298
+
299
+ return dict(samples=batch)
300
+
301
+
302
+ class InternVL2Collator:
303
+ """Collate examples for InternVL2Collator"""
304
+
305
+ def __init__(self, processor, mode="test"):
306
+ self.processor = processor
307
+ self.mode = mode
308
+
309
+ def __call__(self, features):
310
+ pad_id = self.processor.tokenizer.pad_token_id
311
+ IGNORE_INDEX = -100
312
+ first = features[0]
313
+ batch = {}
314
+
315
+ batch_lens = [feat["input_ids"].shape for feat in features]
316
+ max_item_length = max(batch_lens)[0]
317
+ for idx in range(len(features)):
318
+ feat = self.processor(features[idx])
319
+ temp_input_ids = paddle.to_tensor([pad_id] * max_item_length, dtype=paddle.int64)
320
+ temp_input_ids[: feat["input_ids"].shape[0]] = feat["input_ids"]
321
+ feat["input_ids"] = temp_input_ids
322
+ temp_labels = paddle.to_tensor([IGNORE_INDEX] * max_item_length, dtype=paddle.int64)
323
+ temp_labels[: feat["labels"].shape[0]] = feat["labels"]
324
+ feat["labels"] = temp_labels
325
+ feat["attention_mask"] = feat["input_ids"].ne(pad_id)
326
+
327
+ # Special handling for labels.
328
+ # Ensure that tensor is created with the correct type
329
+ # (it should be automatically the case, but let's make sure of it.)
330
+ if "label" in first and first["label"] is not None:
331
+ label = first["label"].item() if isinstance(first["label"], paddle.Tensor) else first["label"]
332
+ dtype = paddle.int64 if isinstance(label, int) else paddle.float32
333
+ batch["labels"] = paddle.to_tensor([f["label"] for f in features], dtype=dtype)
334
+ elif "label_ids" in first and first["label_ids"] is not None:
335
+ if isinstance(first["label_ids"], paddle.Tensor):
336
+ batch["labels"] = paddle.stack([f["label_ids"] for f in features])
337
+ else:
338
+ dtype = paddle.int64 if isinstance(first["label_ids"][0], int) else paddle.float32
339
+ batch["labels"] = paddle.to_tensor([f["label_ids"] for f in features], dtype=dtype)
340
+
341
+ # Handling of all other possible keys.
342
+ # Again, we will use the first element to figure out which key/values are not None for this model.
343
+ for k, v in first.items():
344
+ if (
345
+ k not in ("label", "label_ids", "pixel_values", "image_flags")
346
+ and v is not None
347
+ and not isinstance(v, str)
348
+ ):
349
+ if isinstance(v, paddle.Tensor):
350
+ batch[k] = paddle.stack([f[k] for f in features])
351
+ elif isinstance(v, np.ndarray):
352
+ batch[k] = paddle.to_tensor(np.stack([f[k] for f in features]))
353
+ else:
354
+ batch[k] = paddle.to_tensor([f[k] for f in features])
355
+ if k in ("pixel_values", "image_flags"):
356
+ if isinstance(v, paddle.Tensor):
357
+ batch[k] = paddle.concat([f[k] for f in features])
358
+ elif isinstance(v, np.ndarray):
359
+ batch[k] = paddle.concat(np.stack([f[k] for f in features]))
360
+ else:
361
+ batch[k] = paddle.concat([f[k] for f in features])
362
+ return batch
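
A minimal sketch of how one of these collators might be wired into a paddle.io.DataLoader; the processor class, its import path, the checkpoint name, and the dataset name below are illustrative assumptions rather than APIs confirmed by this commit.

    from paddle.io import DataLoader

    from paddlemix.datasets.dataset import load_dataset      # defined later in this commit
    from paddlemix.datasets.collator import CLIPCollator     # defined in the diff above
    from paddlemix.processors import CLIPProcessor           # hypothetical processor class/path

    # The collator wraps the processor and is called once per mini-batch;
    # it returns padded "pd" tensors ready for the model.
    processor = CLIPProcessor.from_pretrained("paddlemix/CLIP-ViT-B-32")  # placeholder checkpoint name
    collator = CLIPCollator(processor)

    # Any dataset yielding {"image": ..., "text_input": ...} dicts matches the
    # dict branch of CLIPCollator.__call__; "coco_caption" is only an example name.
    train_ds = load_dataset("coco_caption", splits="train")

    loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collator)
    for batch in loader:
        break  # `batch` holds the processed image/text tensors for one step
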
PaddleMIX/paddlemix/datasets/dataset.py ADDED
@@ -0,0 +1,1169 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import atexit
16
+ import inspect
17
+ import os
18
+ import time
19
+ import warnings
20
+ from collections import namedtuple
21
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
22
+
23
+ import cv2
24
+ import datasets
25
+ import numpy as np
26
+ from multiprocess import Pool, RLock
27
+ from PIL import Image
28
+
29
+ import paddlemix
30
+
31
+ try:
32
+ import paddle.distributed as dist
33
+ except Exception:
34
+ warnings.warn("paddle.distributed is not included in your paddle installation!")
35
+
36
+ import importlib
37
+ from functools import partial
38
+
39
+ from paddle.io import Dataset, IterableDataset
40
+ from paddle.utils.download import _get_unique_endpoints
41
+
42
+ from paddlemix.utils.env import DATA_HOME
43
+
44
+ __all__ = ["MapDataset", "DatasetBuilder", "IterDataset", "load_dataset", "MixDataset"]
45
+
46
+ DATASETS_MODULE_PATH = "paddlemix.datasets."
47
+
48
+ # Patch for intranet
49
+ from datasets import load_dataset as origin_load_dataset # noqa: E402
50
+
51
+
52
+ def load_from_ppvlp(path, *args, **kwargs):
53
+ ppvlp_path = paddlemix.datasets.__path__[0]
54
+ new_path = os.path.split(path)[-1]
55
+ new_path = os.path.join(ppvlp_path, "hf_datasets", new_path + ".py")
56
+ if os.path.exists(new_path):
57
+ return origin_load_dataset(new_path, *args, **kwargs)
58
+ else:
59
+ return origin_load_dataset(path, *args, **kwargs)
60
+
61
+
62
+ datasets.load_dataset = load_from_ppvlp
63
+
64
+
65
+ class DatasetTuple:
66
+ def __init__(self, splits):
67
+ self.identifier_map, identifiers = self._gen_identifier_map(splits)
68
+ self.tuple_cls = namedtuple("datasets", identifiers)
69
+ self.tuple = self.tuple_cls(*[None for _ in splits])
70
+
71
+ def __getitem__(self, key):
72
+ if isinstance(key, (int, slice)):
73
+ return self.tuple[key]
74
+ if isinstance(key, str):
75
+ return getattr(self.tuple, self.identifier_map[key])
76
+
77
+ def __setitem__(self, key, value):
78
+ self.tuple = self.tuple._replace(**{self.identifier_map[key]: value})
79
+
80
+ def _gen_identifier_map(self, splits):
81
+ identifier_map = {}
82
+ identifiers = []
83
+ for i in range(len(splits)):
84
+ identifiers.append("splits_" + str(i))
85
+ identifier_map[splits[i]] = "splits_" + str(i)
86
+ return identifier_map, identifiers
87
+
88
+ def __len__(self):
89
+ return len(self.tuple)
90
+
91
+
92
+ def import_main_class(module_path):
93
+ """
94
+ Import a module at module_path and return its DatasetBuilder class.
95
+
96
+ """
97
+ module_path = DATASETS_MODULE_PATH + module_path
98
+ module = importlib.import_module(module_path)
99
+ main_cls_type = DatasetBuilder
100
+
101
+ # Find the main class in our imported module
102
+ module_main_cls = None
103
+ for name, obj in module.__dict__.items():
104
+ if isinstance(obj, type) and issubclass(obj, main_cls_type):
105
+ if name == "DatasetBuilder":
106
+ continue
107
+ module_main_cls = obj
108
+ break
109
+
110
+ return module_main_cls
111
+
112
+
113
+ def load_from_hf(path, name=None, splits=None, **kwargs):
114
+ from datasets import DatasetDict
115
+ from datasets import load_dataset as load_hf_dataset
116
+ from datasets.features import ClassLabel
117
+
118
+ try:
119
+ hf_datasets = load_hf_dataset(path, name=name, split=splits, **kwargs)
120
+ except FileNotFoundError:
121
+ raise FileNotFoundError("Couldn't find the dataset script for '" + path + "' on PaddleNLP or HuggingFace")
122
+ else:
123
+ label_list = []
124
+ if isinstance(hf_datasets, DatasetDict):
125
+ datasets = DatasetTuple(list(hf_datasets.keys()))
126
+ for split, ds in hf_datasets.items():
127
+ for feature in ds.features.values():
128
+ if isinstance(feature, ClassLabel):
129
+ label_list = feature.names
130
+ datasets[split] = MapDataset(ds, label_list=label_list)
131
+ elif isinstance(hf_datasets, list):
132
+ datasets = DatasetTuple(splits)
133
+ for i, split in enumerate(splits):
134
+ for feature in hf_datasets[i].features.values():
135
+ if isinstance(feature, ClassLabel):
136
+ label_list = feature.names
137
+ datasets[split] = MapDataset(hf_datasets[i], label_list=label_list)
138
+ else:
139
+ for feature in hf_datasets.features.values():
140
+ if isinstance(feature, ClassLabel):
141
+ label_list = feature.names
142
+ datasets = MapDataset(hf_datasets, label_list=label_list)
143
+ return datasets
144
+
145
+
146
+ def load_dataset(path_or_read_func, name=None, data_files=None, splits=None, lazy=None, **kwargs):
147
+ """
148
+ This method will load a dataset, either from the PaddleNLP library or from a
149
+ self-defined data loading script, by calling functions in `DatasetBuilder`.
150
+
151
+ For all the names of datasets in PaddleNLP library, see here: `dataset_list
152
+ <https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_list.html>`__.
153
+
154
+ Either `splits` or `data_files` must be specified.
155
+
156
+ Args:
157
+ path_or_read_func (str|callable): Name of the dataset processing script
158
+ in PaddleNLP library or a custom data reading function.
159
+ name (str, optional): Additional name to select a more specific dataset.
160
+ Defaults to None.
161
+ data_files (str|list|tuple|dict, optional): Defining the path of dataset
162
+ files. If None, `splits` must be specified. Defaults to None.
163
+ splits (str|list|tuple, optional): Which split of the data to load. If None,
164
+ `data_files` must be specified. Defaults to None.
165
+ lazy (bool, optional): Whether to return a `MapDataset` or an `IterDataset`.
166
+ True for `IterDataset`. False for `MapDataset`. If None, return the
167
+ default type of this dataset. Defaults to None.
168
+ kwargs (dict): Other keyword arguments to be passed to the `DatasetBuilder`.
169
+
170
+ Returns:
171
+ A `MapDataset` or `IterDataset` or a tuple of those.
172
+
173
+ For how to use this function, please see `dataset_load
174
+ <https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_load.html>`__
175
+ and `dataset_self_defined
176
+ <https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_self_defined.html>`__
177
+
178
+ """
179
+ if inspect.isfunction(path_or_read_func):
180
+ assert lazy is not None, "lazy can not be None in custom mode."
181
+ kwargs["name"] = name
182
+ kwargs["data_files"] = data_files
183
+ kwargs["splits"] = splits
184
+ custom_kwargs = {}
185
+ for name in inspect.signature(path_or_read_func).parameters.keys():
186
+ if name in kwargs.keys():
187
+ custom_kwargs[name] = kwargs[name]
188
+
189
+ reader_instance = SimpleBuilder(lazy=lazy, read_func=path_or_read_func)
190
+ return reader_instance.read(**custom_kwargs)
191
+ else:
192
+ try:
193
+ reader_cls = import_main_class(path_or_read_func)
194
+ except ModuleNotFoundError:
195
+ datasets = load_from_hf(path_or_read_func, name=name, splits=splits, **kwargs)
196
+ else:
197
+ reader_instance = reader_cls(lazy=lazy, name=name, **kwargs)
198
+ # Check if selected name and split is valid in this DatasetBuilder
199
+ if hasattr(reader_instance, "BUILDER_CONFIGS"):
200
+ if name in reader_cls.BUILDER_CONFIGS.keys():
201
+ split_names = reader_cls.BUILDER_CONFIGS[name]["splits"].keys()
202
+ else:
203
+ raise ValueError(
204
+ 'Invalid name "{}". Should be one of {}.'.format(name, list(reader_cls.BUILDER_CONFIGS.keys()))
205
+ )
206
+ elif hasattr(reader_instance, "SPLITS"):
207
+ split_names = reader_instance.SPLITS.keys()
208
+ else:
209
+ raise AttributeError("Either 'SPLITS' or 'BUILDER_CONFIGS' must be implemented for DatasetBuilder.")
210
+
211
+ selected_splits = []
212
+ if isinstance(splits, list) or isinstance(splits, tuple):
213
+ selected_splits.extend(splits)
214
+ else:
215
+ selected_splits += [splits]
216
+
217
+ for split_name in selected_splits:
218
+ if split_name not in split_names and split_name is not None:
219
+ raise ValueError('Invalid split "{}". Should be one of {}.'.format(split_name, list(split_names)))
220
+
221
+ datasets = reader_instance.read_datasets(data_files=data_files, splits=splits)
222
+ return datasets
223
+
224
+
225
+ class MapDataset(Dataset):
226
+ """
227
+ Wraps a map-style dataset-like object as an instance of `MapDataset`, and equips it
228
+ with `map` and other utility methods. All non-magic methods of the raw object
229
+ are also accessible.
230
+
231
+ Args:
232
+ data (list|Dataset): An object with `__getitem__` and `__len__` methods. It could
233
+ be a list or a subclass of `paddle.io.Dataset`.
234
+ kwargs (dict, optional): Other information to be passed to the dataset.
235
+
236
+ For examples of this class, please see `dataset_self_defined
237
+ <https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_self_defined.html>`__.
238
+
239
+ """
240
+
241
+ def __init__(self, data, **kwargs):
242
+ self.data = data
243
+ self._transform_pipline = []
244
+ self.new_data = self.data
245
+ self.info = kwargs
246
+ self.label_list = self.info.pop("label_list", None)
247
+ self.vocab_info = self.info.pop("vocab_info", None)
248
+
249
+ def _transform(self, data):
250
+ for fn in self._transform_pipline:
251
+ data = fn(data)
252
+ return data
253
+
254
+ def __getitem__(self, idx):
255
+ """
256
+ Basic function of `MapDataset` to get sample from dataset with a given
257
+ index.
258
+ """
259
+ return self._transform(self.new_data[idx]) if self._transform_pipline else self.new_data[idx]
260
+
261
+ def __len__(self):
262
+ """
263
+ Returns the number of samples in dataset.
264
+ """
265
+ return len(self.new_data)
266
+
267
+ def filter(self, fn, num_workers=0):
268
+ """
269
+ Filters samples by the filter function and uses the filtered data to
270
+ update this dataset.
271
+
272
+ Args:
273
+ fn (callable): A filter function that takes a sample as input and
274
+ returns a boolean. Samples that return False would be discarded.
275
+ num_workers(int, optional): Number of processes for multiprocessing. If
276
+ set to 0, it doesn't use multiprocessing. Defaults to `0`.
277
+ """
278
+ assert num_workers >= 0, "num_workers should be a non-negative value"
279
+ if num_workers > 1:
280
+ shards = [
281
+ self._shard(num_shards=num_workers, index=index, contiguous=True) for index in range(num_workers)
282
+ ]
283
+ kwds_per_shard = [dict(self=shards[rank], fn=fn) for rank in range(num_workers)]
284
+ pool = Pool(num_workers, initargs=(RLock(),))
285
+
286
+ results = [pool.apply_async(self.__class__._filter, kwds=kwds) for kwds in kwds_per_shard]
287
+ transformed_shards = [r.get() for r in results]
288
+
289
+ pool.close()
290
+ pool.join()
291
+ self.new_data = []
292
+ for i in range(num_workers):
293
+ self.new_data += transformed_shards[i].new_data
294
+ return self
295
+ else:
296
+ return self._filter(fn)
297
+
298
+ def _filter(self, fn):
299
+ self.new_data = [self.new_data[idx] for idx in range(len(self.new_data)) if fn(self.new_data[idx])]
300
+ return self
301
+
302
+ def shard(self, num_shards=None, index=None, contiguous=False):
303
+ self.new_data = self._shard(num_shards=num_shards, index=index, contiguous=contiguous).data
304
+ return self
305
+
306
+ def _shard(self, num_shards=None, index=None, contiguous=False):
307
+ """
308
+ Split the dataset into `num_shards` pieces. Note that the size of each
309
+ shard might be different because the original dataset may not be evenly
310
+ divisible.
311
+
312
+ Args:
313
+ num_shards (int, optional): An integer representing the number of
314
+ data shards. If None, `num_shards` would be number of trainers.
315
+ Defaults to `None`.
316
+ index (int, optional): An integer representing the index of the
317
+ current shard. If None, `index` would be the current trainer rank
318
+ id. Defaults to `None`.
319
+ contiguous: (bool, optional): If true, contiguous chunks of data
320
+ will be select for sharding. And total number of examples will
321
+ be the same. Otherwise each shard will contain all examples of
322
+ dataset whose index mod `num_shards` = `index`. Defaults to `False`.
323
+ """
324
+ if num_shards is None:
325
+ num_shards = dist.get_world_size()
326
+ if index is None:
327
+ index = dist.get_rank()
328
+
329
+ if contiguous:
330
+ div = len(self) // num_shards
331
+ mod = len(self) % num_shards
332
+ start = div * index + min(index, mod)
333
+ end = start + div + (1 if index < mod else 0)
334
+ new_data = [self.new_data[idx] for idx in range(start, end)]
335
+ else:
336
+ new_data = [self.new_data[idx] for idx in range(len(self.new_data)) if idx % num_shards == index]
337
+
338
+ return MapDataset(new_data)
339
+
340
+ def map(self, fn, lazy=True, batched=False, num_workers=0):
341
+ """
342
+ Performs specific function on the dataset to transform and update every sample.
343
+
344
+ Args:
345
+ fn (callable): Transformations to be performed. It receives a single
346
+ sample as argument if batched is False. Else it receives all examples.
347
+ lazy (bool, optional): If True, transformations would be delayed and
348
+ performed on demand. Otherwise, transforms all samples at once. Note that
349
+ if `fn` is stochastic, `lazy` should be True or you will get the same
350
+ result on all epochs. Defaults to True.
351
+ batched(bool, optional): If True, transformations would take all examples as
352
+ input and return a collection of transformed examples. Note that if set
353
+ True, `lazy` option would be ignored. Defaults to False.
354
+ num_workers(int, optional): Number of processes for multiprocessing. If
355
+ set to 0, it doesn't use multiprocessing. Note that if set to positive
356
+ value, `lazy` option would be ignored. Defaults to 0.
357
+ """
358
+
359
+ assert num_workers >= 0, "num_workers should be a non-negative value"
360
+ if num_workers > 1:
361
+ shards = [
362
+ self._shard(num_shards=num_workers, index=index, contiguous=True) for index in range(num_workers)
363
+ ]
364
+ kwds_per_shard = [
365
+ dict(self=shards[rank], fn=fn, lazy=False, batched=batched) for rank in range(num_workers)
366
+ ]
367
+ pool = Pool(num_workers, initargs=(RLock(),))
368
+ results = [pool.apply_async(self.__class__._map, kwds=kwds) for kwds in kwds_per_shard]
369
+ transformed_shards = [r.get() for r in results]
370
+ pool.close()
371
+ pool.join()
372
+ self.new_data = []
373
+ for i in range(num_workers):
374
+ self.new_data += transformed_shards[i].new_data
375
+ return self
376
+ else:
377
+ return self._map(fn, lazy=lazy, batched=batched)
378
+
379
+ def _map(self, fn, lazy=True, batched=False):
380
+ if batched:
381
+ self.new_data = fn(self.new_data)
382
+ elif lazy:
383
+ self._transform_pipline.append(fn)
384
+ else:
385
+ self.new_data = [fn(self.new_data[idx]) for idx in range(len(self.new_data))]
386
+ return self
387
+
388
+
389
+ class IterDataset(IterableDataset):
390
+ """
391
+ Wraps a dataset-like object as an instance of `IterDataset`, and equips it with
392
+ `map` and other utility methods. All non-magic methods of the raw object
393
+ are also accessible.
394
+
395
+ Args:
396
+ data (Iterable): An object with an `__iter__` function. It can be an Iterable or a
397
+ subclass of `paddle.io.IterableDataset`.
398
+ kwargs (dict, optional): Other information to be passed to the dataset.
399
+
400
+ For examples of this class, please see `dataset_self_defined
401
+ <https://paddlenlp.readthedocs.io/zh/latest/data_prepare/dataset_self_defined.html>`__.
402
+ """
403
+
404
+ def __init__(self, data, **kwargs):
405
+ self.data = data
406
+ self._transform_pipline = []
407
+ self._filter_pipline = []
408
+
409
+ self.label_list = kwargs.pop("label_list", None)
410
+ self.vocab_info = kwargs.pop("vocab_info", None)
411
+
412
+ def _transform(self, data):
413
+ for fn in self._transform_pipline:
414
+ data = fn(data)
415
+ return data
416
+
417
+ def _shard_filter(self, num_samples):
418
+ return True
419
+
420
+ def _filter(self, data):
421
+ for fn in self._filter_pipline:
422
+ if not fn(data):
423
+ return False
424
+ return True
425
+
426
+ def __iter__(self):
427
+ """
428
+ Yields samples sequentially.
429
+ """
430
+ num_samples = 0
431
+ if inspect.isfunction(self.data):
432
+ for example in self.data():
433
+ if (not self._filter_pipline or self._filter(example)) and self._shard_filter(
434
+ num_samples=num_samples
435
+ ):
436
+ yield self._transform(example) if self._transform_pipline else example
437
+ num_samples += 1
438
+ else:
439
+ if inspect.isgenerator(self.data):
440
+ warnings.warn("Receiving a generator as data source, data can only be iterated once")
441
+ for example in self.data:
442
+ if (not self._filter_pipline or self._filter(example)) and self._shard_filter(
443
+ num_samples=num_samples
444
+ ):
445
+ yield self._transform(example) if self._transform_pipline else example
446
+ num_samples += 1
447
+
448
+ def filter(self, fn):
449
+ """
450
+ Filters samples by the filter function and uses the filtered data to
451
+ update this dataset.
452
+
453
+ Args:
454
+ fn (callable): A filter function that takes a sample as input and
455
+ returns a boolean. Samples that return False are discarded.
456
+ """
457
+
458
+ self._filter_pipline.append(fn)
459
+
460
+ return self
461
+
462
+ def shard(self, num_shards=None, index=None):
463
+ """
464
+ Split the dataset into `num_shards` pieces.
465
+
466
+ Args:
467
+ num_shards (int, optional): An integer representing the number of
468
+ data shards. If None, `num_shards` would be number of trainers.
469
+ Defaults to None.
470
+ index (int, optional): An integer representing the index of the
471
+ current shard. If None, `index` would be the current trainer rank
472
+ id. Defaults to None.
473
+ """
474
+ if num_shards is None:
475
+ num_shards = dist.get_world_size()
476
+ if index is None:
477
+ index = dist.get_rank()
478
+
479
+ def sharder(num_shards, index, num_samples):
480
+ if num_samples % num_shards == index:
481
+ return True
482
+ else:
483
+ return False
484
+
485
+ fn = partial(sharder, num_shards=num_shards, index=index)
486
+ self._shard_filter = fn
487
+ return self
488
+
489
+ def map(self, fn):
490
+ """
491
+ Performs specific function on the dataset to transform and update every sample.
492
+
493
+ Args:
494
+ fn (callable): Transformations to be performed. It receives a single
495
+ sample as argument.
496
+ """
497
+
498
+ self._transform_pipline.append(fn)
499
+
500
+ return self
501
+
502
+
503
+ class DatasetBuilder:
504
+ """
505
+ A base class for all DatasetBuilder. It provides a `read()` function to turn
506
+ a data file into a MapDataset or IterDataset.
507
+
508
+ `_get_data()` function and `_read()` function should be implemented to download
509
+ data file and read data file into an `Iterable` of the examples.
510
+
511
+ For how to define a custom `DatasetBuilder`, please see `contribute_dataset
512
+ <https://paddlenlp.readthedocs.io/zh/latest/community/contribute_dataset.html>`__.
513
+ """
514
+
515
+ lazy = False
516
+
517
+ def __init__(self, lazy=None, name=None, **config):
518
+ if lazy is not None:
519
+ self.lazy = lazy
520
+ self.name = name
521
+ self.config = config
522
+
523
+ def read_datasets(self, splits=None, data_files=None):
524
+ def remove_if_exit(filepath):
525
+ if isinstance(filepath, (list, tuple)):
526
+ for file in filepath:
527
+ try:
528
+ os.remove(file)
529
+ except OSError:
530
+ pass
531
+ else:
532
+ try:
533
+ os.remove(filepath)
534
+ except OSError:
535
+ pass
536
+
537
+ if data_files is None:
538
+ if splits is None:
539
+ splits = (
540
+ list(self.BUILDER_CONFIGS[self.name]["splits"].keys())
541
+ if hasattr(self, "BUILDER_CONFIGS")
542
+ else list(self.SPLITS.keys())
543
+ )
544
+
545
+ assert (
546
+ isinstance(splits, str)
547
+ or (isinstance(splits, list) and isinstance(splits[0], str))
548
+ or (isinstance(splits, tuple) and isinstance(splits[0], str))
549
+ ), "`splits` should be a string or list of string or a tuple of string."
550
+
551
+ if isinstance(splits, str):
552
+ splits = [splits]
553
+ datasets = DatasetTuple(splits)
554
+ parallel_env = dist.ParallelEnv()
555
+ unique_endpoints = _get_unique_endpoints(parallel_env.trainer_endpoints[:])
556
+ # move register hook to first and register together
557
+ lock_files = []
558
+ for split in splits:
559
+ lock_file = os.path.join(DATA_HOME, self.__class__.__name__)
560
+ if self.name is not None:
561
+ lock_file = lock_file + "." + self.name
562
+ lock_file += "." + split + ".done" + "." + str(os.getppid())
563
+ lock_files.append(lock_file)
564
+ # Must register in all procs so that the lock file can be removed
565
+ # when any proc breaks. Otherwise, the single registered proc may
566
+ # not receive the proper signal sent by the parent proc to exit.
567
+ atexit.register(lambda: remove_if_exit(lock_files))
568
+ for split in splits:
569
+ filename = self._get_data(split)
570
+ lock_file = os.path.join(DATA_HOME, self.__class__.__name__)
571
+ if self.name is not None:
572
+ lock_file = lock_file + "." + self.name
573
+ lock_file += "." + split + ".done" + "." + str(os.getppid())
574
+ # `lock_file` indicates the finished status of `_get_data`.
575
+ # `_get_data` only works in the `unique_endpoints` specified
576
+ # proc since `get_path_from_url` only works for it. The other
577
+ # procs wait for `_get_data` to finish.
578
+ if parallel_env.current_endpoint in unique_endpoints:
579
+ f = open(lock_file, "w")
580
+ f.close()
581
+ else:
582
+ while not os.path.exists(lock_file):
583
+ time.sleep(1)
584
+ datasets[split] = self.read(filename=filename, split=split)
585
+ else:
586
+ assert (
587
+ isinstance(data_files, str) or isinstance(data_files, tuple) or isinstance(data_files, list)
588
+ ), "`data_files` should be a string or tuple or list of strings."
589
+ if isinstance(data_files, str):
590
+ data_files = [data_files]
591
+ default_split = "train"
592
+ if splits:
593
+ if isinstance(splits, str):
594
+ splits = [splits]
595
+ datasets = DatasetTuple(splits)
596
+ assert len(splits) == len(
597
+ data_files
598
+ ), "Number of `splits` and number of `data_files` should be the same if you want to specify the split of local data file."
599
+ for i in range(len(data_files)):
600
+ datasets[splits[i]] = self.read(filename=data_files[i], split=splits[i])
601
+ else:
602
+ datasets = DatasetTuple(["split" + str(i) for i in range(len(data_files))])
603
+ for i in range(len(data_files)):
604
+ datasets["split" + str(i)] = self.read(filename=data_files[i], split=default_split)
605
+
606
+ return datasets if len(datasets) > 1 else datasets[0]
607
+
608
+ def read(self, filename, split="train"):
609
+ """
610
+ Returns a dataset containing all the examples that can be read from the file path.
611
+
612
+ If `self.lazy` is False, this eagerly reads all instances from `self._read()`
613
+ and returns a `MapDataset`.
614
+
615
+ If `self.lazy` is True, this returns an `IterDataset`, which internally
616
+ relies on the generator created from `self._read()` to lazily produce examples.
617
+ In this case your implementation of `_read()` must also be lazy
618
+ (that is, not load all examples into memory at once).
619
+
620
+ Args:
621
+ filename (str): Path of data file to read, usually provided by `_get_data`
622
+ function.
623
+ split (str, optional): The split name of selected dataset. This only makes
624
+ a difference when data files of different splits have different structures.
625
+
626
+ Returns:
627
+ A `MapDataset|IterDataset`.
628
+ """
629
+
630
+ label_list = self.get_labels()
631
+ vocab_info = self.get_vocab()
632
+
633
+ def _create_dict(labels):
634
+ # For multiple labels in the form of list.
635
+ if isinstance(labels[0], list) or isinstance(labels[0], tuple):
636
+ label_dict = []
637
+ for sub_labels in labels:
638
+ sub_dict = {}
639
+ for i, label in enumerate(sub_labels):
640
+ sub_dict[label] = i
641
+ label_dict.append(sub_dict)
642
+ else:
643
+ label_dict = {}
644
+ for i, label in enumerate(labels):
645
+ label_dict[label] = i
646
+ return label_dict
647
+
648
+ def _convert_label_to_id(labels, label_dict):
649
+ if isinstance(labels, list) or isinstance(labels, tuple):
650
+ for label_idx in range(len(labels)):
651
+ labels[label_idx] = label_dict[labels[label_idx]]
652
+ else:
653
+ labels = label_dict[labels]
654
+ return labels
655
+
656
+ if self.lazy:
657
+
658
+ def generate_examples():
659
+ generator = (
660
+ self._read(filename, split) if self._read.__code__.co_argcount > 2 else self._read(filename)
661
+ )
662
+ for example in generator:
663
+ # We need to check if the example contains label column and confirm its name.
664
+ # For now we only allow `label` or `labels` to be the name of label column.
665
+ if "labels" in example.keys():
666
+ label_col = "labels"
667
+ elif "label" in example.keys():
668
+ label_col = "label"
669
+ else:
670
+ label_col = None
671
+
672
+ # Convert class label to label ids.
673
+ if label_list is not None and example.get(label_col, None):
674
+ label_dict = _create_dict(label_list)
675
+ # For multiple labels in the form of list.
676
+ if isinstance(label_dict, list):
677
+ for idx, sub_dict in enumerate(label_dict):
678
+ example[label_col][idx] = _convert_label_to_id(example[label_col][idx], sub_dict)
679
+ else:
680
+ example[label_col] = _convert_label_to_id(example[label_col], label_dict)
681
+
682
+ yield example
683
+ else:
684
+ yield example
685
+
686
+ return IterDataset(generate_examples(), label_list=label_list, vocab_info=vocab_info)
687
+ else:
688
+ examples = self._read(filename, split) if self._read.__code__.co_argcount > 2 else self._read(filename)
689
+
690
+ # Then some validation.
691
+ if not isinstance(examples, list):
692
+ examples = list(examples)
693
+
694
+ if not examples:
695
+ raise ValueError(
696
+ "No instances were read from the given filepath {}. " "Is the path correct?".format(filename)
697
+ )
698
+
699
+ # We need to check if the example contains label column and confirm its name.
700
+ # For now we only allow `label` or `labels` to be the name of label column.
701
+ if isinstance(examples[0], dict):
702
+ if "labels" in examples[0].keys():
703
+ label_col = "labels"
704
+ elif "label" in examples[0].keys():
705
+ label_col = "label"
706
+ else:
707
+ label_col = None
708
+
709
+ # Convert class label to label ids.
710
+ if label_list is not None and examples[0].get(label_col, None):
711
+ label_dict = _create_dict(label_list)
712
+ for idx in range(len(examples)):
713
+ # For multiple labels in the form of list.
714
+ if isinstance(label_dict, list):
715
+ for i, sub_dict in enumerate(label_dict):
716
+ examples[idx][label_col][i] = _convert_label_to_id(examples[idx][label_col][i], sub_dict)
717
+ else:
718
+ examples[idx][label_col] = _convert_label_to_id(examples[idx][label_col], label_dict)
719
+
720
+ return MapDataset(examples, label_list=label_list, vocab_info=vocab_info)
721
+
722
+ def _read(self, filename: str, *args):
723
+ """
724
+ Reads examples from the given file_path and returns them as an
725
+ `Iterable` (which could be a list or a generator).
726
+
727
+ This method must be implemented in self-defined `DatasetBuilder`.
728
+ """
729
+ raise NotImplementedError
730
+
731
+ def _get_data(self, mode: str):
732
+ """
733
+ Downloads examples from the given URL and customized split
734
+ information and returns a filepath.
735
+
736
+ This method must be implemented in self-defined `DatasetBuilder`.
737
+ """
738
+ raise NotImplementedError
739
+
740
+ def get_labels(self):
741
+ """
742
+ Returns list of class labels of the dataset if specified.
743
+ """
744
+ return None
745
+
746
+ def get_vocab(self):
747
+ """
748
+ Returns vocab file path of the dataset if specified.
749
+ """
750
+ return None
751
+
752
+
753
+ class SimpleBuilder(DatasetBuilder):
754
+ def __init__(self, lazy, read_func):
755
+ self._read = read_func
756
+ self.lazy = lazy
757
+
758
+ def read(self, **kwargs):
759
+ if self.lazy:
760
+
761
+ def generate_examples():
762
+ generator = self._read(**kwargs)
763
+ for example in generator:
764
+ yield example
765
+
766
+ return IterDataset(generate_examples)
767
+ else:
768
+ examples = self._read(**kwargs)
769
+ if hasattr(examples, "__len__") and hasattr(examples, "__getitem__"):
770
+ return MapDataset(examples)
771
+ else:
772
+ return MapDataset(list(examples))
773
+
774
+
775
+ def has_file_allowed_extension(filename: str, extensions: Union[str, Tuple[str, ...]]) -> bool:
776
+ """Checks if a file is an allowed extension.
777
+
778
+ Args:
779
+ filename (string): path to a file
780
+ extensions (tuple of strings): extensions to consider (lowercase)
781
+
782
+ Returns:
783
+ bool: True if the filename ends with one of the given extensions
784
+ """
785
+ return filename.lower().endswith(extensions if isinstance(extensions, str) else tuple(extensions))
786
+
787
+
788
+ def find_classes(directory: str) -> Tuple[List[str], Dict[str, int]]:
789
+ """Finds the class folders in a dataset.
790
+
791
+ See :class:`DatasetFolder` for details.
792
+ """
793
+ classes = sorted(entry.name for entry in os.scandir(directory) if entry.is_dir())
794
+ if not classes:
795
+ raise FileNotFoundError(f"Couldn't find any class folder in {directory}.")
796
+
797
+ class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
798
+ return classes, class_to_idx
799
+
800
+
801
+ def make_dataset(
802
+ directory: str,
803
+ class_to_idx: Optional[Dict[str, int]] = None,
804
+ extensions: Optional[Union[str, Tuple[str, ...]]] = None,
805
+ is_valid_file: Optional[Callable[[str], bool]] = None,
806
+ ) -> List[Tuple[str, int]]:
807
+ """Generates a list of samples of a form (path_to_sample, class).
808
+
809
+ See :class:`DatasetFolder` for details.
810
+
811
+ Note: The class_to_idx parameter is optional and will use the logic of the ``find_classes`` function
812
+ by default.
813
+ """
814
+ directory = os.path.expanduser(directory)
815
+
816
+ if class_to_idx is None:
817
+ _, class_to_idx = find_classes(directory)
818
+ elif not class_to_idx:
819
+ raise ValueError("'class_to_index' must have at least one entry to collect any samples.")
820
+
821
+ both_none = extensions is None and is_valid_file is None
822
+ both_something = extensions is not None and is_valid_file is not None
823
+ if both_none or both_something:
824
+ raise ValueError("Both extensions and is_valid_file cannot be None or not None at the same time")
825
+
826
+ if extensions is not None:
827
+
828
+ def is_valid_file(x: str) -> bool:
829
+ return has_file_allowed_extension(x, extensions) # type: ignore[arg-type]
830
+
831
+ is_valid_file = cast(Callable[[str], bool], is_valid_file)
832
+
833
+ instances = []
834
+ available_classes = set()
835
+ for target_class in sorted(class_to_idx.keys()):
836
+ class_index = class_to_idx[target_class]
837
+ target_dir = os.path.join(directory, target_class)
838
+ if not os.path.isdir(target_dir):
839
+ continue
840
+ for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)):
841
+ for fname in sorted(fnames):
842
+ path = os.path.join(root, fname)
843
+ if is_valid_file(path):
844
+ item = path, class_index
845
+ instances.append(item)
846
+
847
+ if target_class not in available_classes:
848
+ available_classes.add(target_class)
849
+
850
+ empty_classes = set(class_to_idx.keys()) - available_classes
851
+ if empty_classes:
852
+ msg = f"Found no valid file for the classes {', '.join(sorted(empty_classes))}. "
853
+ if extensions is not None:
854
+ msg += f"Supported extensions are: {extensions if isinstance(extensions, str) else ', '.join(extensions)}"
855
+ raise FileNotFoundError(msg)
856
+
857
+ return instances
858
+
859
+
860
+ class DatasetFolder(Dataset):
861
+ """A generic data loader.
862
+
863
+ This default directory structure can be customized by overriding the
864
+ :meth:`find_classes` method.
865
+
866
+ Args:
867
+ root (string): Root directory path.
868
+ loader (callable): A function to load a sample given its path.
869
+ extensions (tuple[string]): A list of allowed extensions.
870
+ both extensions and is_valid_file should not be passed.
871
+ transform (callable, optional): A function/transform that takes in
872
+ a sample and returns a transformed version.
873
+ E.g, ``transforms.RandomCrop`` for images.
874
+ target_transform (callable, optional): A function/transform that takes
875
+ in the target and transforms it.
876
+ is_valid_file (callable, optional): A function that takes path of a file
877
+ and checks if the file is a valid file (used to check for corrupt files);
878
+ both extensions and is_valid_file should not be passed.
879
+
880
+ Attributes:
881
+ classes (list): List of the class names sorted alphabetically.
882
+ class_to_idx (dict): Dict with items (class_name, class_index).
883
+ samples (list): List of (sample path, class_index) tuples
884
+ targets (list): The class_index value for each image in the dataset
885
+ """
886
+
887
+ def __init__(
888
+ self,
889
+ root: str,
890
+ loader: Callable[[str], Any],
891
+ extensions: Optional[Tuple[str, ...]] = None,
892
+ transform: Optional[Callable] = None,
893
+ target_transform: Optional[Callable] = None,
894
+ is_valid_file: Optional[Callable[[str], bool]] = None,
895
+ ) -> None:
896
+ self.root = root
897
+ self.transform = transform
898
+ self.target_transform = target_transform
899
+
900
+ classes, class_to_idx = self.find_classes(self.root)
901
+ samples = self.make_dataset(self.root, class_to_idx, extensions, is_valid_file)
902
+ print(f"Found {len(classes)} classes and {len(samples)} images.")
903
+
904
+ self.loader = loader
905
+ self.extensions = extensions
906
+
907
+ self.classes = classes
908
+ self.class_to_idx = class_to_idx
909
+ self.samples = samples
910
+ self.targets = [s[1] for s in samples]
911
+
912
+ @staticmethod
913
+ def make_dataset(
914
+ directory: str,
915
+ class_to_idx: Dict[str, int],
916
+ extensions: Optional[Tuple[str, ...]] = None,
917
+ is_valid_file: Optional[Callable[[str], bool]] = None,
918
+ ) -> List[Tuple[str, int]]:
919
+ """Generates a list of samples of a form (path_to_sample, class).
920
+
921
+ This can be overridden to e.g. read files from a compressed zip file instead of from the disk.
922
+
923
+ Args:
924
+ directory (str): root dataset directory, corresponding to ``self.root``.
925
+ class_to_idx (Dict[str, int]): Dictionary mapping class name to class index.
926
+ extensions (optional): A list of allowed extensions.
927
+ Either extensions or is_valid_file should be passed. Defaults to None.
928
+ is_valid_file (optional): A function that takes path of a file
929
+ and checks if the file is a valid file
930
+ (used to check for corrupt files); both extensions and
931
+ is_valid_file should not be passed. Defaults to None.
932
+
933
+ Raises:
934
+ ValueError: In case ``class_to_idx`` is empty.
935
+ ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None.
936
+ FileNotFoundError: In case no valid file was found for any class.
937
+
938
+ Returns:
939
+ List[Tuple[str, int]]: samples of a form (path_to_sample, class)
940
+ """
941
+ if class_to_idx is None:
942
+ # prevent potential bug since make_dataset() would use the class_to_idx logic of the
943
+ # find_classes() function, instead of using that of the find_classes() method, which
944
+ # is potentially overridden and thus could have a different logic.
945
+ raise ValueError("The class_to_idx parameter cannot be None.")
946
+ return make_dataset(directory, class_to_idx, extensions=extensions, is_valid_file=is_valid_file)
947
+
948
+ def find_classes(self, directory: str) -> Tuple[List[str], Dict[str, int]]:
949
+ """Find the class folders in a dataset structured as follows::
950
+
951
+ directory/
952
+ ├── class_x
953
+ │ ├── xxx.ext
954
+ │ ├── xxy.ext
955
+ │ └── ...
956
+ │ └── xxz.ext
957
+ └── class_y
958
+ ├── 123.ext
959
+ ├── nsdf3.ext
960
+ └── ...
961
+ └── asd932_.ext
962
+
963
+ This method can be overridden to only consider
964
+ a subset of classes, or to adapt to a different dataset directory structure.
965
+
966
+ Args:
967
+ directory(str): Root directory path, corresponding to ``self.root``
968
+
969
+ Raises:
970
+ FileNotFoundError: If ``dir`` has no class folders.
971
+
972
+ Returns:
973
+ (Tuple[List[str], Dict[str, int]]): List of all classes and dictionary mapping each class to an index.
974
+ """
975
+ return find_classes(directory)
976
+
977
+ def __getitem__(self, index: int) -> Tuple[Any, Any]:
978
+ """
979
+ Args:
980
+ index (int): Index
981
+
982
+ Returns:
983
+ tuple: (sample, target) where target is class_index of the target class.
984
+ """
985
+ path, target = self.samples[index]
986
+ sample = self.loader(path)
987
+ if self.transform is not None:
988
+ sample = self.transform(sample)
989
+ if self.target_transform is not None:
990
+ target = self.target_transform(target)
991
+ return sample, np.int32(target)
992
+
993
+ def __len__(self) -> int:
994
+ return len(self.samples)
995
+
996
+ @property
997
+ def class_num(self):
998
+ return len(set(self.classes))
999
+
1000
+
1001
+ IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
1002
+
1003
+ _image_backend = "pil"
1004
+
1005
+
1006
+ def pil_loader(path: str) -> Image.Image:
1007
+ # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
1008
+ with open(path, "rb") as f:
1009
+ img = Image.open(f)
1010
+ return img.convert("RGB")
1011
+
1012
+
1013
+ def set_image_backend(backend):
1014
+ """
1015
+ Specifies the package used to load images.
1016
+
1017
+ Args:
1018
+ backend (string): Name of the image backend, one of {'pil', 'cv2'}.
1019
+ The 'cv2' backend loads images with OpenCV and returns RGB numpy arrays,
1020
+ while 'pil' returns PIL Images converted to RGB.
1021
+ """
1022
+ global _image_backend
1023
+ if backend not in ["pil", "cv2"]:
1024
+ raise ValueError(f"Invalid backend '{backend}'. Options are 'pil' and 'cv2'")
1025
+ _image_backend = backend
1026
+
1027
+
1028
+ def get_image_backend():
1029
+ """
1030
+ Gets the name of the package used to load images
1031
+ """
1032
+ return _image_backend
1033
+
1034
+
1035
+ def cv2_loader(path: str):
1036
+ return cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
1037
+
1038
+
1039
+ def default_loader(path: str) -> Any:
1040
+ if get_image_backend() == "cv2":
1041
+ return cv2_loader(path)
1042
+ else:
1043
+ return pil_loader(path)
1044
+
1045
+
1046
+ class ImageFolder(DatasetFolder):
1047
+ """A generic data loader where the images are arranged in this way by default: ::
1048
+
1049
+ root/dog/xxx.png
1050
+ root/dog/xxy.png
1051
+ root/dog/[...]/xxz.png
1052
+
1053
+ root/cat/123.png
1054
+ root/cat/nsdf3.png
1055
+ root/cat/[...]/asd932_.png
1056
+
1057
+ This class inherits from :class:`DatasetFolder` so
1058
+ the same methods can be overridden to customize the dataset.
1059
+
1060
+ Args:
1061
+ root (string): Root directory path.
1062
+ transform (callable, optional): A function/transform that takes in an PIL image
1063
+ and returns a transformed version. E.g, ``transforms.RandomCrop``
1064
+ target_transform (callable, optional): A function/transform that takes in the
1065
+ target and transforms it.
1066
+ loader (callable, optional): A function to load an image given its path.
1067
+ is_valid_file (callable, optional): A function that takes path of an Image file
1068
+ and checks if the file is a valid file (used to check for corrupt files)
1069
+
1070
+ Attributes:
1071
+ classes (list): List of the class names sorted alphabetically.
1072
+ class_to_idx (dict): Dict with items (class_name, class_index).
1073
+ imgs (list): List of (image path, class_index) tuples
1074
+ """
1075
+
1076
+ def __init__(
1077
+ self,
1078
+ root: str,
1079
+ transform: Optional[Callable] = None,
1080
+ target_transform: Optional[Callable] = None,
1081
+ loader: Callable[[str], Any] = default_loader,
1082
+ is_valid_file: Optional[Callable[[str], bool]] = None,
1083
+ ):
1084
+ super().__init__(
1085
+ root,
1086
+ loader,
1087
+ IMG_EXTENSIONS if is_valid_file is None else None,
1088
+ transform=transform,
1089
+ target_transform=target_transform,
1090
+ is_valid_file=is_valid_file,
1091
+ )
1092
+ self.imgs = self.samples
1093
+
1094
+
1095
+ import bisect
1096
+
1097
+
1098
+ class ConcatDataset(Dataset):
1099
+ r"""Dataset as a concatenation of multiple datasets.
1100
+
1101
+ This class is useful to assemble different existing datasets.
1102
+
1103
+ Args:
1104
+ datasets (sequence): List of datasets to be concatenated
1105
+ """
1106
+ datasets: List[Dataset]
1107
+ cumulative_sizes: List[int]
1108
+
1109
+ @staticmethod
1110
+ def cumsum(sequence):
1111
+ r, s = [], 0
1112
+ for e in sequence:
1113
+ l = len(e)
1114
+ r.append(l + s)
1115
+ s += l
1116
+ return r
1117
+
1118
+ def __init__(self, datasets) -> None:
1119
+ super().__init__()
1120
+ self.datasets = list(datasets)
1121
+ assert len(self.datasets) > 0, "datasets should not be an empty iterable" # type: ignore[arg-type]
1122
+ for d in self.datasets:
1123
+ assert not isinstance(d, IterableDataset), "ConcatDataset does not support IterableDataset"
1124
+ self.cumulative_sizes = self.cumsum(self.datasets)
1125
+
1126
+ def __len__(self):
1127
+ return self.cumulative_sizes[-1]
1128
+
1129
+ def __getitem__(self, idx):
1130
+ if idx < 0:
1131
+ if -idx > len(self):
1132
+ raise ValueError("absolute value of index should not exceed dataset length")
1133
+ idx = len(self) + idx
1134
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
1135
+ if dataset_idx == 0:
1136
+ sample_idx = idx
1137
+ else:
1138
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
1139
+ return self.datasets[dataset_idx][sample_idx]
1140
+
1141
+ @property
1142
+ def cummulative_sizes(self):
1143
+ warnings.warn(
1144
+ "cummulative_sizes attribute is renamed to " "cumulative_sizes", DeprecationWarning, stacklevel=2
1145
+ )
1146
+ return self.cumulative_sizes
1147
+
1148
+
1149
+ class MixDataset(Dataset):
1150
+ datasets_names: List[Dict]
1151
+
1152
+ def __init__(self, datasets_names) -> None:
1153
+ super().__init__()
1154
+ self.datasets_names = list(datasets_names)
1155
+ self.datasets = []
1156
+ for d in self.datasets_names:
1157
+ name = d["name"]
1158
+ data_files = d["data_files"] if "data_files" in d else None
1159
+ splits = d["splits"] if "splits" in d else None
1160
+ chat_template = d["chat_template"] if "chat_template" in d else None
1161
+ self.datasets.append(load_dataset(name, data_files=data_files, splits=splits, chat_template=chat_template))
1162
+
1163
+ self.datasets = ConcatDataset(self.datasets)
1164
+
1165
+ def __len__(self):
1166
+ return len(self.datasets)
1167
+
1168
+ def __getitem__(self, idx):
1169
+ return self.datasets[idx]
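
A minimal sketch of the load_dataset / MapDataset API defined above; the dataset name, split name, and the field accessed inside the transform are assumptions for illustration.

    from paddlemix.datasets.dataset import load_dataset

    # For a builder such as "coco_caption" (also added in this commit) this
    # returns a MapDataset; the "train" split name is an assumption.
    train_ds = load_dataset("coco_caption", splits="train")

    def add_prefix(example):
        # Illustrative transform; real field names depend on the builder.
        if "text_input" in example:
            example["text_input"] = "a photo of " + example["text_input"]
        return example

    # lazy=True appends the transform to the pipeline and applies it on demand
    # in __getitem__; lazy=False would materialize the transformed samples now.
    train_ds.map(add_prefix, lazy=True)

    # Keep one contiguous quarter of the data, e.g. for one of four workers.
    train_ds.shard(num_shards=4, index=0, contiguous=True)
    print(len(train_ds), train_ds[0])
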
PaddleMIX/paddlemix/datasets/got_dataset.py ADDED
@@ -0,0 +1,439 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import copy
16
+ import json
17
+ import logging
18
+ import random
19
+ from typing import Dict
20
+ import paddle
21
+ from paddle import Tensor
22
+ import paddlenlp
23
+ from PIL import Image, ImageFile
24
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
25
+ from ..models.GOT.utils.conversation import (
26
+ SeparatorStyle,
27
+ conv_mpt,
28
+ )
29
+ from dataclasses import dataclass
30
+ from functools import partial
31
+ from typing import List, Union
32
+ from megfile import smart_glob
33
+ from natsort import natsorted
34
+
35
+
36
+ IGNORE_INDEX = -100
37
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
38
+ WORKER_HEART_BEAT_INTERVAL = 15
39
+
40
+ LOGDIR = "log"
41
+
42
+ IGNORE_INDEX = -100
43
+ # DEFAULT_PAD_TOKEN = "[PAD]"
44
+
45
+ DEFAULT_PAD_TOKEN = "<|endoftext|>"
46
+ DEFAULT_EOS_TOKEN = "</s>"
47
+ DEFAULT_BOS_TOKEN = "</s>"
48
+ DEFAULT_UNK_TOKEN = "<unk>"
49
+ DEFAULT_IMAGE_TOKEN = "<image>"
50
+ DEFAULT_BOX_TOKEN = "<box>"
51
+
52
+ DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>"
53
+
54
+ DEFAULT_IM_START_TOKEN = "<img>"
55
+ DEFAULT_IM_END_TOKEN = "</img>"
56
+
57
+
58
+ class BaseDataset(paddle.io.Dataset):
59
+ def __init__(self, datasets: str, tokenizer: paddlenlp.transformers.PretrainedTokenizer, multimodal_cfg: dict):
60
+ super(BaseDataset, self).__init__()
61
+ self.tokenizer = tokenizer
62
+ self.multimodal_cfg = multimodal_cfg
63
+
64
+ logging.warning(f"Using {multimodal_cfg['image_token_len']} tokens for representing image")
65
+
66
+ def image_processor(self, image):
67
+ # processor = self.multimodal_cfg['image_processor'] # the first processor, usually is the clip pretrained model (vit)
68
+ processor_high = self.multimodal_cfg[
69
+ "image_processor_high"
70
+ ] # the second processor, usually is the designed image encoder (sam/swin/cnn)
71
+ image_high = image.copy()
72
+ image_high = processor_high(image_high)
73
+ return image_high
74
+
75
+ def __len__(self):
76
+ return len(self.list_data_dict)
77
+
78
+ def __getitem__(self, i) -> Dict[str, paddle.Tensor]:
79
+ pass
80
+
81
+
82
+ class ConversationDataset(BaseDataset):
83
+ """Conversation format dataset stage2 fine-tuning."""
84
+
85
+ def __init__(self, meta_path, tokenizer, multimodal_cfg):
86
+ super(ConversationDataset, self).__init__(meta_path, tokenizer, multimodal_cfg)
87
+ # v0 version format conversation
88
+ # default_conversation = conv_templates["mpt"]
89
+ logging.warning("Formatting inputs into conversation type: mpt-fixed")
90
+ logging.warning("Loading data...")
91
+
92
+ list_data_dict = []
93
+ list_image_path = []
94
+
95
+ # add your data [data1, data2, data3, .....]
96
+ # got_data_dict = {
97
+ # "pdf-ocr": ["data1"],
98
+ # #'scene-ocr': ["data3", "data4"]
99
+ # # ......
100
+ # }
101
+ # for name_all in datasets.split("+"):
102
+ # for name in got_data_dict[name_all]:
103
+ ds_collections = json.loads(open(meta_path).read())
104
+ #ds_collections = json.load(open(meta_path, 'r'))
105
+ for ds_idx, ds_name in enumerate(ds_collections.keys()):
106
+ # dataset = CONVERSATION_DATA[ds_name]
107
+ dataset = ds_collections[ds_name]
108
+
109
+ data_path = dataset["annotations"]
110
+ #image_root = dataset["images"]
111
+ if data_path.endswith(".json"):
112
+ data = json.load(open(data_path, "r"))
113
+ elif data_path.endswith(".jsonl"):
114
+ with open(data_path, "r") as f:
115
+ data = f.readlines()
116
+ for ii in range(len(data)):
117
+ data[ii] = json.loads(data[ii])
118
+ else:
119
+ raise ValueError(f"Unknown file extension: {data_path}")
120
+
121
+ list_data_dict.extend(data)
122
+
123
+ image_path = dataset["images"] # image_root
124
+
125
+ list_image_path.extend([image_path] * len(data))
126
+
127
+ logging.warning(f"Data from {data_path} provide {len(data)} conversations.")
128
+
129
+ assert len(list_data_dict) == len(list_image_path)
130
+ logging.warning(f"{len(list_data_dict)} conversations in total.")
131
+ a_new_list = list(zip(list_data_dict, list_image_path))
132
+ random.shuffle(a_new_list)
133
+ list_data_dict_new, list_image_path_new = zip(*a_new_list)
134
+ self.list_data_dict = list_data_dict_new
135
+ self.list_image_path = list_image_path_new
136
+
137
+ self.im_patch_token = 151859
138
+ self.im_start_token = 151857
139
+ self.im_end_token = 151858
140
+
141
+ def multimodal_processor(self, sources, flag_num_patches):
142
+ for source in sources:
143
+ if self.multimodal_cfg["sep_image_conv_front"]:
144
+ assert DEFAULT_IMAGE_TOKEN in source[0]["value"]
145
+ source[0]["value"] = source[0]["value"].replace(DEFAULT_IMAGE_TOKEN, "").strip()
146
+ source[0]["value"] = DEFAULT_IMAGE_TOKEN + conv_mpt.sep + conv_mpt.roles[0] + ": " + source[0]["value"]
147
+
148
+ for sentence in source:
149
+ replace_token = DEFAULT_IMAGE_PATCH_TOKEN * self.multimodal_cfg["image_token_len"] * flag_num_patches
150
+ replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
151
+ # sentence["value"] = str(sentence["value"]).replace('\qquad', '\quad')
152
+ sentence["value"] = str(sentence["value"]).replace(DEFAULT_IMAGE_TOKEN, replace_token)
153
+ return sources
154
+
155
+ def _tokenize_fn(self, strings):
156
+ """Tokenize a list of strings."""
157
+ tokenized_list = [
158
+ self.tokenizer(
159
+ text,
160
+ return_tensors="pd",
161
+ padding="longest",
162
+ max_length=self.tokenizer.model_max_length,
163
+ truncation=True,
164
+ )
165
+ for text in strings
166
+ ]
167
+ input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
168
+ input_ids_lens = labels_lens = [
169
+ tokenized.input_ids.not_equal(paddle.to_tensor(self.tokenizer.pad_token_id)).sum().item()
170
+ for tokenized in tokenized_list
171
+ ]
172
+ return dict(
173
+ input_ids=input_ids,
174
+ labels=labels,
175
+ input_ids_lens=input_ids_lens,
176
+ labels_lens=labels_lens,
177
+ )
178
+
179
+ def _mask_targets(self, target, tokenized_lens, speakers):
180
+ # cur_idx = 0
181
+ cur_idx = tokenized_lens[0]
182
+ tokenized_lens = tokenized_lens[1:]
183
+ target[:cur_idx] = IGNORE_INDEX
184
+ for tokenized_len, speaker in zip(tokenized_lens, speakers):
185
+ if speaker.lower() == "human":
186
+ target[cur_idx + 2 : cur_idx + tokenized_len] = IGNORE_INDEX
187
+ cur_idx += tokenized_len
188
+
189
+ def token_processor(self, sources, image_name):
190
+ conv = conv_mpt.copy()
191
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
192
+
193
+ # Apply prompt templates
194
+ conversations = []
195
+ for i, source in enumerate(sources):
196
+ if roles[source[0]["from"]] != conv.roles[0]:
197
+ # Skip the first one if it is not from human
198
+ source = source[1:]
199
+
200
+ conv.messages = []
201
+ for j, sentence in enumerate(source):
202
+ role = roles[sentence["from"]]
203
+ assert role == conv.roles[j % 2], f"{i}"
204
+ conv.append_message(role, sentence["value"])
205
+ conversations.append(conv.get_prompt())
206
+
207
+ # Tokenize conversations
208
+ input_ids = self.tokenizer(
209
+ conversations,
210
+ return_tensors="pd",
211
+ padding="longest",
212
+ max_length=self.tokenizer.model_max_length,
213
+ truncation=True,
214
+ ).input_ids
215
+
216
+ # input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
217
+ targets = input_ids.clone()
218
+ assert conv.sep_style == SeparatorStyle.MPT
219
+
220
+ # Mask targets
221
+ sep = conv.sep + conv.roles[1]
222
+ for conversation, target in zip(conversations, targets):
223
+ total_len = int(target.not_equal(paddle.to_tensor(self.tokenizer.pad_token_id)).sum())
224
+
225
+ rounds = conversation.split(conv.sep)
226
+ re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
227
+ for conv_idx in range(3, len(rounds), 2):
228
+ re_rounds.append(conv.sep.join(rounds[conv_idx : conv_idx + 2])) # user + gpt
229
+ cur_len = 0
230
+ target[:cur_len] = IGNORE_INDEX
231
+ for i, rou in enumerate(re_rounds):
232
+ if rou == "":
233
+ break
234
+
235
+ parts = rou.split(sep)
236
+ if len(parts) != 2:
237
+ break
238
+ parts[0] += sep
239
+ round_len = len(self.tokenizer(rou).input_ids) + len(self.tokenizer(conv.sep).input_ids)
240
+ # round_len = len(tokenizer_image_token(rou, self.tokenizer)) + len(tokenizer_image_token(conv.sep, self.tokenizer))
241
+ # instruction_len = len(tokenizer_image_token(parts[0], tokenizer))
242
+ instruction_len = len(self.tokenizer(parts[0]).input_ids)
243
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
244
+
245
+ cur_len += round_len
246
+ target[cur_len:] = IGNORE_INDEX
247
+
248
+ if cur_len < self.tokenizer.model_max_length:
249
+ if cur_len != total_len:
250
+ target[:] = IGNORE_INDEX
251
+ print(f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." f" (ignored)")
252
+ print(image_name)
253
+
254
+ return dict(
255
+ input_ids=input_ids,
256
+ labels=targets,
257
+ )
258
+
259
+ def __getitem__(self, i) -> Dict[str, paddle.Tensor]:
260
+ # data = self.list_data_dict[i]
261
+ data = copy.deepcopy(self.list_data_dict[i])
262
+
263
+ if isinstance(data, dict):
264
+ image_list = []
265
+ image_high_list = []
266
+ flag_num_patches = 1
267
+ if "image" in data:
268
+ image_path = self.list_image_path[i]
269
+ image_file = data["image"]
270
+
271
+ # multi-crop or multi page, only support .png files
272
+ if (
273
+ 0
274
+ ): # ('.jpg' not in image_file and '.png' not in image_file and '.jpeg' not in image_file) and ('.jpg' not in image_path and '.png' not in image_path and '.jpeg' not in image_path):
275
+ if image_file[0] == "/":
276
+ patch_dir = image_path[:-1] + image_file
277
+ patches = smart_glob(patch_dir + "*.png")
278
+ else:
279
+ patch_dir = image_path + image_file
280
+ patches = smart_glob(patch_dir + "*.png")
281
+
282
+ # print(patches)
283
+ if not patches:
284
+ print(f"cannot glob the dir {patch_dir}.")
285
+ return self.__getitem__(0)
286
+
287
+ # sort multi images by name
288
+ patches = natsorted(patches)
289
+ flag_num_patches = len(patches)
290
+
291
+ for patch in patches:
292
+ try:
293
+ image = Image.open(patch).convert("RGB")
294
+ except:
295
+ print(f"cannot identify image file {patch}.")
296
+ return self.__getitem__(0)
297
+
298
+ try:
299
+ img = self.image_processor(image)
300
+ image_list.append(img)
301
+ image_high_list.append(img)
302
+ except:
303
+ print(
304
+ f"image {image_path + image_file + patch} are broken or grayscale! we thus select 0-th sample instead!"
305
+ )
306
+ return self.__getitem__(0)
307
+
308
+ else:
309
+ flag_num_patches = 1
310
+ try:
311
+ image = Image.open(image_path + image_file).convert("RGB")
312
+ except:
313
+ print(f"cannot identify image file {image_file}.")
314
+ return self.__getitem__(0)
315
+
316
+ try:
317
+ image = self.image_processor(image)
318
+ except:
319
+ print(f"image {image_file} are broken or grayscale! we thus select 0-th sample instead!")
320
+ return self.__getitem__(0)
321
+
322
+ conversations = self.multimodal_processor([data["conversations"]], flag_num_patches)
323
+ # print(conversations)
324
+ # exit()
325
+ else:
326
+ conversations = [data]
327
+
328
+ # align with fastchat & llava here, put the conversation into a list for tokenization
329
+ image_name = image_path + image_file
330
+ data_dict = self.token_processor(conversations, image_name)
331
+ data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])
332
+
333
+ if isinstance(data, dict) and "image" in data:
334
+ if image_list and image_high_list:
335
+ data_dict["image"] = image_list
336
+ data_dict["image_high"] = image_high_list
337
+ else:
338
+ data_dict["image"] = [image]
339
+ data_dict["image_high"] = [image]
340
+ else:
341
+ # crop_size = self.multimodal_cfg['image_processor'].crop_size
342
+ # data_dict['image'] = [torch.zeros(3, crop_size['height'], crop_size['width'])]
343
+ # Vary for two image, GOT does not use the data_dict['image]
344
+ data_dict["image"] = [paddle.zeros([3, 1024, 1024])]
345
+ data_dict["image_high"] = [paddle.zeros([3, 1024, 1024])]
346
+ return data_dict
347
+
348
+
349
+ # helpers
350
+ def pad_sequence_paddle(sequences, padding_value=0):
351
+ """
352
+ Implement a function similar to PyTorch's pad_sequence in PaddlePaddle.
353
+
354
+ Args:
355
+ - sequences (list of Tensor): The list of sequences to be padded.
356
+ - padding_value (float, optional): The value used for padding, default is 0.
357
+
358
+ Returns:
359
+ - Tensor: The result of padding all sequences to the same length.
360
+ """
361
+ # Calculate the maximum length
362
+ max_len = max([seq.shape[0] for seq in sequences])
363
+
364
+ # Pad sequences
365
+ padded_sequences = []
366
+ for seq in sequences:
367
+ # Calculate the length to pad
368
+ padding_len = max_len - seq.shape[0]
369
+
370
+ # Create a padding tensor
371
+ if padding_len > 0:
372
+ padding_tensor = paddle.full([padding_len] + list(seq.shape[1:]), padding_value, dtype=seq.dtype)
373
+ # Concatenate the original sequence and the padding tensor
374
+ padded_seq = paddle.concat([seq, padding_tensor], axis=0)
375
+ else:
376
+ padded_seq = seq
377
+
378
+ padded_sequences.append(padded_seq)
379
+
380
+ # Stack the padded sequences to form a batch
381
+ padded_batch = paddle.stack(padded_sequences, axis=0)
382
+ return padded_batch
383
+
384
+
385
+ def orig_pad_sequence(
386
+ sequences: Union[Tensor, List[Tensor]],
387
+ batch_first: bool = False,
388
+ padding_value: float = 0.0,
389
+ ) -> Tensor:
390
+ if batch_first:
391
+ return pad_sequence_paddle(sequences, padding_value)
392
+ else:
393
+ assert False, "Not implemented"
394
+
395
+
396
+ @dataclass
397
+ class DataCollatorForSupervisedDataset(object):
398
+ tokenizer: paddlenlp.transformers.PretrainedTokenizer
399
+
400
+ def __call__(self, instances):
401
+ input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
402
+ images = [paddle.stack(instance["image"]) for instance in instances]
403
+ images_high = [paddle.stack(instance["image_high"]) for instance in instances]
404
+ images = list(zip(images, images_high))
405
+
406
+ pad_sequence = partial(orig_pad_sequence, batch_first=True)
407
+
408
+ input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
409
+
410
+ labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
411
+
412
+ batch = dict(
413
+ input_ids=input_ids,
414
+ labels=labels,
415
+ attention_mask=input_ids.not_equal(paddle.to_tensor(self.tokenizer.pad_token_id)),
416
+ images=images,
417
+ )
418
+ return batch
419
+
420
+
421
+ def make_supervised_data_module(interleave, with_box, tokenizer, data_args):
422
+ assert data_args.conversation_version == "mpt"
423
+
424
+ train_dataset = ConversationDataset(
425
+ tokenizer=tokenizer,
426
+ # datasets=data_args.datasets,
427
+ meta_path=data_args.meta_path,
428
+ multimodal_cfg=dict(
429
+ sep_image_conv_front=data_args.sep_image_conv_front,
430
+ image_token_len=data_args.image_token_len,
431
+ image_aspect_ratio=data_args.image_aspect_ratio,
432
+ use_im_start_end=data_args.use_im_start_end,
433
+ image_processor=data_args.image_processor,
434
+ image_processor_high=data_args.image_processor_high,
435
+ box_limit=data_args.box_limit,
436
+ ),
437
+ )
438
+ data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
439
+ return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
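For context, a hedged sketch of how the data module above might be wired into a training script; `tokenizer` and the fields on `data_args` are assumptions supplied by the caller, not defined in this file:

    # Sketch only: data_args must carry conversation_version == "mpt", meta_path,
    # image_token_len, image_processor_high and the other multimodal fields read above.
    data_module = make_supervised_data_module(
        interleave=False,
        with_box=False,
        tokenizer=tokenizer,
        data_args=data_args,
    )
    train_dataset = data_module["train_dataset"]
    data_collator = data_module["data_collator"]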
PaddleMIX/paddlemix/datasets/internvl_dataset.py ADDED
@@ -0,0 +1,688 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import io
16
+ import sys
17
+ IGNORE_TOKEN_ID = -100 # LabelSmoother.ignore_index
18
+ import random
19
+ from typing import Dict
20
+ from collections.abc import Sequence
21
+ import paddle
22
+ import paddle.vision.transforms as T
23
+ from paddlemix.models.internvl2.conversation import get_conv_template
24
+ from PIL import Image
25
+ from paddle.io import ConcatDataset, WeightedRandomSampler
26
+ from paddlemix.models.internvl2.constants import (CLIP_MEAN, CLIP_STD, IMAGENET_MEAN, IMAGENET_STD,
27
+ IMG_CONTEXT_TOKEN, IMG_END_TOKEN, IMG_START_TOKEN,
28
+ SIGLIP_MEAN, SIGLIP_STD)
29
+
30
+ class WeightedConcatDataset(ConcatDataset):
31
+ def __init__(self, datasets, weights):
32
+ super().__init__(datasets)
33
+ self.weights = paddle.to_tensor(weights, dtype='float32')
34
+ self.total_size = sum(len(d) for d in datasets)
35
+ self.sampler = WeightedRandomSampler(weights=self.weights, num_samples=self.total_size, replacement=True)
36
+
37
+ def __iter__(self):
38
+ return iter(self.sampler)
39
+
40
+ def __len__(self):
41
+ return self.total_size
42
+
43
+
44
+ def pil_loader(img_str):
45
+ buff = io.BytesIO(img_str)
46
+ img = Image.open(buff)
47
+ return img.convert('RGB')
48
+
49
+
50
+ def expand2square(pil_img, background_color):
51
+ width, height = pil_img.size
52
+ if width == height:
53
+ return pil_img
54
+ elif width > height:
55
+ result = Image.new(pil_img.mode, (width, width), background_color)
56
+ result.paste(pil_img, (0, (width - height) // 2))
57
+ return result
58
+ else:
59
+ result = Image.new(pil_img.mode, (height, height), background_color)
60
+ result.paste(pil_img, ((height - width) // 2, 0))
61
+ return result
62
+
63
+
64
+ def simulate_jpeg_degradation(quality):
65
+ def jpeg_degrade(img):
66
+ with io.BytesIO() as output:
67
+ img.convert('RGB').save(output, format='JPEG', quality=quality)
68
+ output.seek(0) # Move the reading cursor to the start of the stream
69
+ img_jpeg = Image.open(output).copy() # Use .copy() to make sure the image is loaded in memory
70
+ return img_jpeg
71
+ return jpeg_degrade
72
+
73
+
74
+ # Define the JPEG compression quality range, pre-create all JPEG compression functions
75
+ qualities = list(range(75, 101))
76
+ jpeg_degrade_functions = {quality: simulate_jpeg_degradation(quality) for quality in qualities}
77
+
78
+
79
+ class Lambda:
80
+ """Apply a user-defined lambda as a transform. This transform does not support torchscript.
81
+
82
+ Args:
83
+ lambd (function): Lambda/function to be used for transform.
84
+ """
85
+
86
+ def __init__(self, lambd):
87
+ #_log_api_usage_once(self)
88
+ if not callable(lambd):
89
+ raise TypeError(f"Argument lambd should be callable, got {repr(type(lambd).__name__)}")
90
+ self.lambd = lambd
91
+
92
+ def __call__(self, img):
93
+ return self.lambd(img)
94
+
95
+ def __repr__(self) -> str:
96
+ return f"{self.__class__.__name__}()"
97
+
98
+
99
+ class RandomTransforms:
100
+ """Base class for a list of transformations with randomness
101
+
102
+ Args:
103
+ transforms (sequence): list of transformations
104
+ """
105
+
106
+ def __init__(self, transforms):
107
+ #_log_api_usage_once(self)
108
+ if not isinstance(transforms, Sequence):
109
+ raise TypeError("Argument transforms should be a sequence")
110
+ self.transforms = transforms
111
+
112
+ def __call__(self, *args, **kwargs):
113
+ raise NotImplementedError()
114
+
115
+ def __repr__(self) -> str:
116
+ format_string = self.__class__.__name__ + "("
117
+ for t in self.transforms:
118
+ format_string += "\n"
119
+ format_string += f" {t}"
120
+ format_string += "\n)"
121
+ return format_string
122
+
123
+
124
+ class RandomChoice(RandomTransforms):
125
+ """Apply single transformation randomly picked from a list. This transform does not support torchscript."""
126
+
127
+ def __init__(self, transforms, p=None):
128
+ super().__init__(transforms)
129
+ if p is not None and not isinstance(p, Sequence):
130
+ raise TypeError("Argument p should be a sequence")
131
+ self.p = p
132
+
133
+ def __call__(self, *args):
134
+ t = random.choices(self.transforms, weights=self.p)[0]
135
+ return t(*args)
136
+
137
+ def __repr__(self) -> str:
138
+ return f"{super().__repr__()}(p={self.p})"
139
+
140
+
141
+ def build_transform(is_train, input_size, pad2square=False, normalize_type='imagenet'):
142
+ if normalize_type == 'imagenet':
143
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
144
+ elif normalize_type == 'clip':
145
+ MEAN, STD = CLIP_MEAN, CLIP_STD
146
+ elif normalize_type == 'siglip':
147
+ MEAN, STD = SIGLIP_MEAN, SIGLIP_STD
148
+ else:
149
+ raise NotImplementedError
150
+ if is_train: # use data augmentation
151
+ transform = T.Compose([
152
+ Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
153
+ RandomChoice([Lambda(jpeg_degrade_functions[quality]) for quality in qualities]),
154
+ T.Resize((input_size, input_size), interpolation='bicubic'),
155
+ T.ToTensor(),
156
+ T.Normalize(mean=MEAN, std=STD)
157
+ ])
158
+ else:
159
+ if pad2square is False: # now we use this transform function by default
160
+ # run this
161
+ transform = T.Compose([
162
+ Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
163
+ T.Resize((input_size, input_size), interpolation='bicubic'),
164
+ T.ToTensor(),
165
+ T.Normalize(mean=MEAN, std=STD)
166
+ ])
167
+ else:
168
+ transform = T.Compose([
169
+ Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
170
+ Lambda(lambda img: expand2square(img, tuple(int(x * 255) for x in MEAN))),
171
+ T.Resize((input_size, input_size), interpolation='bicubic'),
172
+ T.ToTensor(),
173
+ T.Normalize(mean=MEAN, std=STD)
174
+ ])
175
+
176
+ return transform
177
+
178
+
179
+ def preprocess(
180
+ template_name,
181
+ sources,
182
+ tokenizer,
183
+ num_image_token_list: list,
184
+ text_only: bool = False,
185
+ group_by_length: bool = False,
186
+ use_packed_ds: bool = False,
187
+ ds_name: str = None,
188
+ num_image: int = 1,
189
+ ):
190
+ conv = get_conv_template(template_name)
191
+ roles = {'human': conv.roles[0], 'gpt': conv.roles[1]}
192
+
193
+ # Apply prompt templates
194
+ conversations = []
195
+ for i, source in enumerate(sources):
196
+ if roles[source[0]['from']] != conv.roles[0]:
197
+ # Skip the first one if it is not from human
198
+ source = source[1:]
199
+
200
+ conv.messages = []
201
+ for j, sentence in enumerate(source):
202
+ role = roles[sentence['from']]
203
+ assert role == conv.roles[j % 2], f'{i}'
204
+ conv.append_message(role, sentence['value'])
205
+ conversations.append(conv.get_prompt())
206
+
207
+ if not text_only:
208
+ new_conversations = []
209
+ for conversation in conversations:
210
+ for i in range(num_image):
211
+ image_tokens = f'{IMG_START_TOKEN}{IMG_CONTEXT_TOKEN * num_image_token_list[i]}{IMG_END_TOKEN}'
212
+ conversation = conversation.replace('<image>', image_tokens, 1)
213
+ new_conversations.append(conversation)
214
+ conversations = new_conversations
215
+
216
+ # Tokenize conversations
217
+ input_ids = tokenizer(
218
+ conversations,
219
+ return_tensors='pd',
220
+ padding=False if group_by_length or use_packed_ds else 'max_length',
221
+ max_length=tokenizer.model_max_length,
222
+ truncation=True,
223
+ ).input_ids
224
+ targets = input_ids.clone()
225
+
226
+ # assert conv.sep_style == SeparatorStyle.ADD_COLON_TWO
227
+
228
+ # Mask targets. Only compute loss on the assistant outputs.
229
+ sep = conv.sep + conv.roles[1] + ': '
230
+ for conversation, target in zip(conversations, targets):
231
+ total_len = int(target.not_equal(paddle.to_tensor(tokenizer.pad_token_id)).sum())
232
+
233
+ turns = conversation.split(conv.sep2)
234
+ cur_len = 1
235
+ target[:cur_len] = IGNORE_TOKEN_ID
236
+ for i, turn in enumerate(turns):
237
+ if turn == '':
238
+ break
239
+ turn_len = len(tokenizer(turn).input_ids)
240
+
241
+ parts = turn.split(sep)
242
+ if len(parts) != 2:
243
+ break
244
+ parts[0] += sep
245
+ # "-2" is hardcoded for the Llama tokenizer to make the offset correct.
246
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
247
+
248
+ if i != 0 and not tokenizer.legacy:
249
+ # The legacy and non-legacy modes handle special tokens differently
250
+ instruction_len -= 1
251
+
252
+ # Ignore the user instructions
253
+ target[cur_len: cur_len + instruction_len] = IGNORE_TOKEN_ID
254
+ cur_len += turn_len
255
+
256
+ if i != 0 and not tokenizer.legacy:
257
+ # The legacy and non-legacy modes handle special tokens differently
258
+ cur_len -= 1
259
+
260
+ target[cur_len:] = IGNORE_TOKEN_ID
261
+
262
+ if False: # Inspect and check the correctness of masking
263
+ z = target.clone()
264
+ z = torch.where(z == IGNORE_TOKEN_ID, tokenizer.unk_token_id, z)
265
+ logger.info(tokenizer.decode(z))
266
+ exit()
267
+
268
+ if cur_len < tokenizer.model_max_length:
269
+ if cur_len != total_len:
270
+ target[:] = IGNORE_TOKEN_ID
271
+ print(
272
+ f'WARNING: tokenization mismatch: {cur_len} vs. {total_len}.'
273
+ f' #turn = {len(turns) - 1}. (ignored). This dataset is {ds_name}.'
274
+ )
275
+ sys.stdout.flush()
276
+
277
+ return dict(
278
+ input_ids=input_ids,
279
+ labels=targets,
280
+ attention_mask=input_ids.not_equal(paddle.to_tensor(tokenizer.pad_token_id)),
281
+ )
282
+
283
+
284
+ def preprocess_mpt(
285
+ template_name,
286
+ sources,
287
+ tokenizer,
288
+ num_image_token_list: list,
289
+ text_only: bool = False,
290
+ group_by_length: bool = False,
291
+ use_packed_ds: bool = False,
292
+ ds_name: str = None,
293
+ num_image: int = 1
294
+ ) -> Dict:
295
+ conv = get_conv_template(template_name)
296
+ roles = {'human': conv.roles[0], 'gpt': conv.roles[1]}
297
+
298
+ # Apply prompt templates
299
+ conversations = []
300
+ for i, source in enumerate(sources):
301
+ if roles[source[0]['from']] != conv.roles[0]:
302
+ # Skip the first one if it is not from human
303
+ source = source[1:]
304
+
305
+ conv.messages = []
306
+ for j, sentence in enumerate(source):
307
+ role = roles[sentence['from']]
308
+ assert role == conv.roles[j % 2], f'{i}'
309
+ conv.append_message(role, sentence['value'])
310
+ conversations.append(conv.get_prompt())
311
+
312
+ if not text_only:
313
+ new_conversations = []
314
+ for conversation in conversations:
315
+ for i in range(num_image):
316
+ image_tokens = f'{IMG_START_TOKEN}{IMG_CONTEXT_TOKEN * num_image_token_list[i]}{IMG_END_TOKEN}'
317
+ conversation = conversation.replace('<image>', image_tokens, 1)
318
+ new_conversations.append(conversation)
319
+ conversations = new_conversations
320
+
321
+ # Tokenize conversations
322
+ input_ids = tokenizer(
323
+ conversations,
324
+ return_tensors='pd',
325
+ padding=False if group_by_length or use_packed_ds else 'max_length',
326
+ max_length=tokenizer.model_max_length,
327
+ truncation=True,
328
+ ).input_ids
329
+ targets = input_ids.clone()
330
+
331
+ # Mask targets. Only compute loss on the assistant outputs.
332
+ sep = conv.sep + conv.roles[1] # <|im_end|><|im_start|>assistant\n
333
+ for conversation, target in zip(conversations, targets):
334
+ total_len = int(target.not_equal(paddle.to_tensor(tokenizer.pad_token_id)).sum())
335
+
336
+ turns = conversation.split(conv.sep)
337
+ re_turns = [conv.sep.join(turns[:3])] # system + user + gpt
338
+ for conv_idx in range(3, len(turns), 2):
339
+ re_turns.append(conv.sep.join(turns[conv_idx:conv_idx + 2])) # user + gpt
340
+ cur_len = 0
341
+ target[:cur_len] = IGNORE_TOKEN_ID
342
+ for i, turn in enumerate(re_turns):
343
+ if turn == '':
344
+ break
345
+ turn_len = len(tokenizer(turn).input_ids) + 1
346
+
347
+ parts = turn.split(sep)
348
+ if len(parts) != 2:
349
+ break
350
+ parts[0] += sep
351
+ instruction_len = len(tokenizer(parts[0]).input_ids)
352
+
353
+ # Ignore the user instructions
354
+ target[cur_len: cur_len + instruction_len] = IGNORE_TOKEN_ID
355
+ # print(f'[question {i}]', tokenizer.decode(input_ids[:, cur_len: cur_len + instruction_len][0]))
356
+ # print(f'[answer {i}]', tokenizer.decode(input_ids[:, cur_len + instruction_len: cur_len + turn_len][0]))
357
+ # print(f'[label {i}]', target[cur_len + instruction_len: cur_len + turn_len])
358
+ cur_len += turn_len
359
+
360
+ target[cur_len:] = IGNORE_TOKEN_ID
361
+
362
+ if cur_len < tokenizer.model_max_length:
363
+ if cur_len != total_len:
364
+ target[:] = IGNORE_TOKEN_ID
365
+ print(
366
+ f'WARNING: tokenization mismatch: {cur_len} vs. {total_len}.'
367
+ f' #turn = {len(turns) - 1}. (ignored). This dataset is {ds_name}.'
368
+ )
369
+ sys.stdout.flush()
370
+
371
+ return dict(
372
+ input_ids=input_ids,
373
+ labels=targets,
374
+ attention_mask=input_ids.not_equal(paddle.to_tensor(tokenizer.pad_token_id)),
375
+ )
376
+
377
+
378
+ def preprocess_phi3(
379
+ template_name,
380
+ sources,
381
+ tokenizer,
382
+ num_image_token_list: list,
383
+ text_only: bool = False,
384
+ group_by_length: bool = False,
385
+ use_packed_ds: bool = False,
386
+ ds_name: str = None,
387
+ num_image: int = 1
388
+ ) -> Dict:
389
+ conv = get_conv_template(template_name)
390
+ roles = {'human': conv.roles[0], 'gpt': conv.roles[1]}
391
+
392
+ # Apply prompt templates
393
+ conversations = []
394
+ for i, source in enumerate(sources):
395
+ if roles[source[0]['from']] != conv.roles[0]:
396
+ # Skip the first one if it is not from human
397
+ source = source[1:]
398
+
399
+ conv.messages = []
400
+ for j, sentence in enumerate(source):
401
+ role = roles[sentence['from']]
402
+ assert role == conv.roles[j % 2], f'{i}'
403
+ conv.append_message(role, sentence['value'])
404
+ conversations.append(conv.get_prompt())
405
+
406
+ if not text_only:
407
+ new_conversations = []
408
+ for conversation in conversations:
409
+ for i in range(num_image):
410
+ image_tokens = f'{IMG_START_TOKEN}{IMG_CONTEXT_TOKEN * num_image_token_list[i]}{IMG_END_TOKEN}'
411
+ conversation = conversation.replace('<image>', image_tokens, 1)
412
+ new_conversations.append(conversation)
413
+ conversations = new_conversations
414
+
415
+ # Tokenize conversations
416
+ tokenizer.padding_side = 'right'
417
+ input_ids = tokenizer(
418
+ conversations,
419
+ return_tensors='pd',
420
+ padding=False if group_by_length or use_packed_ds else 'max_length',
421
+ max_length=tokenizer.model_max_length,
422
+ truncation=True,
423
+ ).input_ids
424
+ targets = input_ids.clone()
425
+
426
+ # Mask targets. Only compute loss on the assistant outputs.
427
+ sep = conv.sep + conv.roles[1] # <|end|>\n<|assistant|>
428
+ for conversation, target in zip(conversations, targets):
429
+ total_len = int(target.not_equal(paddle.to_tensor(tokenizer.pad_token_id)).sum())
430
+
431
+ turns = conversation.split(conv.sep)
432
+ re_turns = [conv.sep.join(turns[:3])] # system + user + gpt
433
+ for conv_idx in range(3, len(turns), 2):
434
+ re_turns.append(conv.sep.join(turns[conv_idx:conv_idx + 2])) # user + gpt
435
+ cur_len = 1
436
+ target[:cur_len] = IGNORE_TOKEN_ID
437
+ endoftext_id = tokenizer.convert_tokens_to_ids('<|endoftext|>')
438
+ target[target == endoftext_id] = IGNORE_TOKEN_ID
439
+
440
+ for i, turn in enumerate(re_turns):
441
+ if turn == '':
442
+ break
443
+ if i == 0:
444
+ turn_len = len(tokenizer(turn).input_ids)
445
+ else:
446
+ turn_len = len(tokenizer(turn).input_ids) - 1
447
+ parts = turn.split(sep)
448
+ if len(parts) != 2:
449
+ break
450
+ parts[0] += sep
451
+
452
+ if i == 0:
453
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 1
454
+ else:
455
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
456
+
457
+ # Ignore the user instructions
458
+ target[cur_len: cur_len + instruction_len] = IGNORE_TOKEN_ID
459
+ # print(f'[question {i}]', tokenizer.decode(input_ids[:, cur_len: cur_len + instruction_len][0]))
460
+ # print(f'[answer {i}]', tokenizer.decode(input_ids[:, cur_len + instruction_len: cur_len + turn_len][0]))
461
+ # print(f'[label {i}]', target[cur_len + instruction_len: cur_len + turn_len])
462
+ cur_len += turn_len
463
+
464
+ target[cur_len:] = IGNORE_TOKEN_ID
465
+
466
+ if False: # Inspect and check the correctness of masking
467
+ z = target.clone()
468
+ z = torch.where(z == IGNORE_TOKEN_ID, tokenizer.unk_token_id, z)
469
+ print(repr(tokenizer.decode(z)))
470
+
471
+ if cur_len < tokenizer.model_max_length:
472
+ if cur_len != total_len:
473
+ target[:] = IGNORE_TOKEN_ID
474
+ print(
475
+ f'WARNING: tokenization mismatch: {cur_len} vs. {total_len}.'
476
+ f' #turn = {len(turns) - 1}. (ignored). This dataset is {ds_name}.'
477
+ )
478
+ sys.stdout.flush()
479
+
480
+ return dict(
481
+ input_ids=input_ids,
482
+ labels=targets,
483
+ attention_mask=input_ids.not_equal(paddle.to_tensor(tokenizer.pad_token_id)),
484
+ )
485
+
486
+
487
+ def preprocess_internlm(
488
+ template_name,
489
+ sources,
490
+ tokenizer,
491
+ num_image_token_list: list,
492
+ text_only: bool = False,
493
+ group_by_length: bool = False,
494
+ use_packed_ds: bool = False,
495
+ ds_name: str = None,
496
+ num_image: int = 1
497
+ ) -> Dict:
498
+ conv = get_conv_template(template_name)
499
+ roles = {'human': conv.roles[0], 'gpt': conv.roles[1]}
500
+
501
+ # Apply prompt templates
502
+ conversations = []
503
+ for i, source in enumerate(sources):
504
+ if roles[source[0]['from']] != conv.roles[0]:
505
+ # Skip the first one if it is not from human
506
+ source = source[1:]
507
+
508
+ conv.messages = []
509
+ for j, sentence in enumerate(source):
510
+ role = roles[sentence['from']]
511
+ assert role == conv.roles[j % 2], f'{i}'
512
+ sentence['value'] = sentence['value'].strip()
513
+ conv.append_message(role, sentence['value'])
514
+ conversations.append(conv.get_prompt())
515
+
516
+ if not text_only:
517
+ new_conversations = []
518
+ for conversation in conversations:
519
+ for i in range(num_image):
520
+ image_tokens = f'{IMG_START_TOKEN}{IMG_CONTEXT_TOKEN * num_image_token_list[i]}{IMG_END_TOKEN}'
521
+ conversation = conversation.replace('<image>', image_tokens, 1)
522
+ new_conversations.append(conversation)
523
+ conversations = new_conversations
524
+
525
+ # Tokenize conversations
526
+ input_ids = tokenizer(
527
+ conversations,
528
+ return_tensors='pd',
529
+ padding=False if group_by_length or use_packed_ds else 'max_length',
530
+ max_length=tokenizer.model_max_length,
531
+ truncation=True,
532
+ ).input_ids
533
+ targets = input_ids.clone()
534
+
535
+ new_targets = []
536
+ # print('tokenizer.pad_token_id:\n', tokenizer.pad_token_id) # 151643
537
+ # print('targets', targets, targets.shape, targets.sum().item())
538
+ # [[151644, 8948 , 198 , ..., 103978, 1773 , 151645]] [1, 1918] 281157253
539
+ for conversation, target in zip(conversations, targets):
540
+ total_len = int(target.not_equal(paddle.to_tensor(tokenizer.pad_token_id)).sum()) # in InternLM, pad_token_id = eos_token_id
541
+ cur_len = 1
542
+ target[:cur_len] = IGNORE_TOKEN_ID # <s>
543
+ parts = conversation.split(conv.roles[1]) # [UNUSED_TOKEN_146]assistant\n
544
+ info = parts[0] + conv.roles[1]
545
+ temp_len = len(tokenizer(info).input_ids) - 1 # strip the tokenizer's leading <s>
546
+ target[cur_len: cur_len + temp_len] = IGNORE_TOKEN_ID
547
+ cur_len = cur_len + temp_len
548
+
549
+ for index in range(1, len(parts) - 1):
550
+ info = parts[index]
551
+ part1, part2 = info.split(conv.roles[0])
552
+ temp_len = len(tokenizer(part1).input_ids) - 1
553
+ cur_len = cur_len + temp_len
554
+ part = conv.roles[0] + part2 + conv.roles[1]
555
+ temp_len = len(tokenizer(part).input_ids) - 1
556
+ target[cur_len: cur_len + temp_len] = IGNORE_TOKEN_ID
557
+ cur_len = cur_len + temp_len
558
+ last_info = parts[-1]
559
+ temp_len = len(tokenizer(last_info).input_ids) - 1
560
+ cur_len = cur_len + temp_len
561
+
562
+ target[cur_len:] = IGNORE_TOKEN_ID
563
+ if False: # Inspect and check the correctness of masking
564
+ z = target.clone()
565
+ z = torch.where(z == IGNORE_TOKEN_ID, tokenizer.unk_token_id, z)
566
+ print(repr(tokenizer.decode(z)))
567
+
568
+ if cur_len < tokenizer.model_max_length:
569
+ if cur_len != total_len:
570
+ target[:] = IGNORE_TOKEN_ID
571
+ print(f'WARNING: tokenization mismatch: {cur_len} vs. {total_len}. This dataset is {ds_name}.')
572
+ sys.stdout.flush()
573
+
574
+ new_targets.append(target)
575
+
576
+ new_targets = paddle.stack(new_targets, axis=0)
577
+
578
+ return dict(
579
+ input_ids=input_ids,
580
+ labels=new_targets,
581
+ attention_mask=input_ids.not_equal(paddle.to_tensor(tokenizer.pad_token_id)),
582
+ )
583
+
584
+
585
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
586
+ best_ratio_diff = float('inf')
587
+ best_ratio = (1, 1)
588
+ area = width * height
589
+ for ratio in target_ratios:
590
+ target_aspect_ratio = ratio[0] / ratio[1]
591
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
592
+ if ratio_diff < best_ratio_diff:
593
+ best_ratio_diff = ratio_diff
594
+ best_ratio = ratio
595
+ elif ratio_diff == best_ratio_diff:
596
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
597
+ best_ratio = ratio
598
+ # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
599
+ return best_ratio
600
+
601
+
602
+ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False, return_target_aspect_ratio=False):
603
+ orig_width, orig_height = image.size
604
+ aspect_ratio = orig_width / orig_height
605
+
606
+ # calculate the existing image aspect ratio
607
+ target_ratios = set(
608
+ (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
609
+ i * j <= max_num and i * j >= min_num)
610
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
611
+
612
+ # find the closest aspect ratio to the target
613
+ target_aspect_ratio = find_closest_aspect_ratio(
614
+ aspect_ratio, target_ratios, orig_width, orig_height, image_size)
615
+
616
+ # calculate the target width and height
617
+ target_width = image_size * target_aspect_ratio[0]
618
+ target_height = image_size * target_aspect_ratio[1]
619
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
620
+
621
+ # resize the image
622
+ resized_img = image.resize((target_width, target_height))
623
+ processed_images = []
624
+ for i in range(blocks):
625
+ box = (
626
+ (i % (target_width // image_size)) * image_size,
627
+ (i // (target_width // image_size)) * image_size,
628
+ ((i % (target_width // image_size)) + 1) * image_size,
629
+ ((i // (target_width // image_size)) + 1) * image_size
630
+ )
631
+ # split the image
632
+ split_img = resized_img.crop(box)
633
+ processed_images.append(split_img)
634
+ assert len(processed_images) == blocks
635
+ if use_thumbnail and len(processed_images) != 1:
636
+ thumbnail_img = image.resize((image_size, image_size))
637
+ processed_images.append(thumbnail_img)
638
+ if return_target_aspect_ratio:
639
+ return processed_images, target_aspect_ratio
640
+ else:
641
+ return processed_images
642
+
643
+
644
+ def dynamic_preprocess2(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False, prior_aspect_ratio=None):
645
+ orig_width, orig_height = image.size
646
+ aspect_ratio = orig_width / orig_height
647
+
648
+ # calculate the existing image aspect ratio
649
+ target_ratios = set(
650
+ (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
651
+ i * j <= max_num and i * j >= min_num)
652
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
653
+
654
+ new_target_ratios = []
655
+ if prior_aspect_ratio is not None:
656
+ for i in target_ratios:
657
+ if prior_aspect_ratio[0]%i[0] != 0 and prior_aspect_ratio[1]%i[1] != 0:
658
+ new_target_ratios.append(i)
659
+ else:
660
+ continue
661
+
662
+ # find the closest aspect ratio to the target
663
+ target_aspect_ratio = find_closest_aspect_ratio(
664
+ aspect_ratio, new_target_ratios, orig_width, orig_height, image_size)
665
+
666
+ # calculate the target width and height
667
+ target_width = image_size * target_aspect_ratio[0]
668
+ target_height = image_size * target_aspect_ratio[1]
669
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
670
+
671
+ # resize the image
672
+ resized_img = image.resize((target_width, target_height))
673
+ processed_images = []
674
+ for i in range(blocks):
675
+ box = (
676
+ (i % (target_width // image_size)) * image_size,
677
+ (i // (target_width // image_size)) * image_size,
678
+ ((i % (target_width // image_size)) + 1) * image_size,
679
+ ((i // (target_width // image_size)) + 1) * image_size
680
+ )
681
+ # split the image
682
+ split_img = resized_img.crop(box)
683
+ processed_images.append(split_img)
684
+ assert len(processed_images) == blocks
685
+ if use_thumbnail and len(processed_images) != 1:
686
+ thumbnail_img = image.resize((image_size, image_size))
687
+ processed_images.append(thumbnail_img)
688
+ return processed_images
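A small sketch of how dynamic_preprocess and build_transform above combine to tile a high-resolution page into fixed-size crops; the file path and tile budget are illustrative assumptions:

    import paddle
    from PIL import Image

    img = Image.open("doc_page.png").convert("RGB")  # assumed input path
    tiles = dynamic_preprocess(img, min_num=1, max_num=6, image_size=448, use_thumbnail=True)
    transform = build_transform(is_train=False, input_size=448)
    # up to max_num 448x448 crops plus an optional thumbnail, stacked into one tensor
    pixel_values = paddle.stack([transform(t) for t in tiles])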
PaddleMIX/paddlemix/datasets/laiondata.py ADDED
@@ -0,0 +1,139 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import base64
16
+ import gzip
17
+ import io
18
+ import json
19
+ import os
20
+ import random
21
+
22
+ from paddle.io import IterableDataset, get_worker_info
23
+ from PIL import Image
24
+
25
+
26
+ def paddle_worker_info(group=None):
27
+ """Return node and worker info for paddle and some distributed environments."""
28
+ rank = 0
29
+ world_size = 1
30
+ worker = 0
31
+ num_workers = 1
32
+
33
+ if "WORKER" in os.environ and "NUM_WORKERS" in os.environ:
34
+ worker = int(os.environ["WORKER"])
35
+ num_workers = int(os.environ["NUM_WORKERS"])
36
+ else:
37
+ try:
38
+ worker_info = get_worker_info()
39
+ if worker_info is not None:
40
+ worker = worker_info.id
41
+ num_workers = worker_info.num_workers
42
+ except ModuleNotFoundError:
43
+ pass
44
+ return rank, world_size, worker, num_workers
45
+
46
+
47
+ class LaionDataset(IterableDataset):
48
+ def __init__(
49
+ self,
50
+ file_list,
51
+ get_text_emb="",
52
+ data_world_rank=0,
53
+ data_world_size=1,
54
+ buffer_size=1,
55
+ shuffle_every_n_samples=1000,
56
+ total_seen_samples=None,
57
+ ):
58
+ with open(file_list, "r", encoding="utf-8") as f:
59
+ self.file_list = f.read().strip().split("\n")
60
+ self.get_text_emb = get_text_emb
61
+ self.buffer_size = buffer_size
62
+ self.shuffle_every_n_samples = shuffle_every_n_samples
63
+ self.min_size = 5
64
+ self.total_seen_samples = total_seen_samples
65
+ self.data_world_rank = data_world_rank
66
+ self.data_world_size = data_world_size
67
+
68
+ def parse_line(self, line, filename):
69
+ try:
70
+ vec = line.strip().split("\t")
71
+ text_json = json.loads(vec[2])
72
+ img_b64 = vec[5]
73
+ caption = text_json.get("caption_en", text_json.get("blip_caption_en", ""))
74
+
75
+ image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
76
+ return dict(image=image, text=caption)
77
+ except Exception:
78
+ print(f"error when parse file {filename}")
79
+ return None
80
+
81
+ def get_data(self, data):
82
+ w, h = data["image"].size
83
+ if w < self.min_size or h < self.min_size:
84
+ return None
85
+ return data
86
+
87
+ def __len__(self):
88
+ return self.total_seen_samples
89
+
90
+ def sample(self):
91
+ _, _, worker, num_workers = paddle_worker_info()
92
+ total_num_workers = num_workers * self.data_world_size
93
+ global_worker_id = self.data_world_rank * num_workers + worker
94
+
95
+ print("[CHECK ME] LaionDataset", global_worker_id, total_num_workers)
96
+ while True:
97
+ random.shuffle(self.file_list)
98
+ for i in range(len(self.file_list)):
99
+ if i % total_num_workers == global_worker_id:
100
+ filename = self.file_list[i].strip("\n")
101
+
102
+ with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f:
103
+ while True:
104
+ line = f.readline()
105
+
106
+ if line == b"":
107
+ break
108
+ try:
109
+ try:
110
+ line = line.decode(encoding="utf-8")
111
+ except:
112
+ line = line.decode(encoding="gb18030")
113
+ except:
114
+ print(f"error on file {filename}")
115
+ continue
116
+ data = self.parse_line(line, filename)
117
+
118
+ if data is None:
119
+ continue
120
+ else:
121
+ data = self.get_data(data)
122
+ if data is None:
123
+ continue
124
+ yield data
125
+
126
+ def shuffle(self, iterator):
127
+ buffer_list = []
128
+ for _ in range(self.buffer_size):
129
+ buffer_list.append(next(iterator))
130
+ i = 0
131
+ while True:
132
+ if i % self.shuffle_every_n_samples == 0:
133
+ random.shuffle(buffer_list)
134
+ yield buffer_list.pop()
135
+ buffer_list.append(next(iterator))
136
+ i += 1
137
+
138
+ def __iter__(self):
139
+ return self.shuffle(iter(self.sample()))
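For context, a minimal sketch of consuming the LaionDataset stream defined above; the file-list path and the sizes passed in are assumptions for illustration:

    dataset = LaionDataset(
        file_list="laion_parts.txt",        # assumed: one base64-encoded part file per line
        data_world_rank=0,
        data_world_size=1,
        buffer_size=100,
        shuffle_every_n_samples=1000,
        total_seen_samples=10000,
    )
    stream = iter(dataset)
    record = next(stream)                   # {"image": PIL.Image, "text": caption string}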
PaddleMIX/paddlemix/datasets/mixtoken_dataset.py ADDED
@@ -0,0 +1,131 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ from paddle.io import Dataset
17
+ from scipy.linalg import block_diag
18
+ from tqdm import tqdm
19
+
20
+
21
+ class MIXToken:
22
+ required_input_keys = ["input_ids", "labels"]
23
+ required_output_keys = ["input_ids", "labels", "attention_mask"]
24
+ # Only supported the following keys for MIXToken. Keys outside of the set will be ignored.
25
+ supported_input_keys = ["input_ids", "labels", "attention_mask", "position_ids", "images"]
26
+
27
+ @classmethod
28
+ def _pad_batch_records(cls, batch_records):
29
+ # Only consider supported input keys
30
+ input_keys = [key for key in batch_records[0].keys() if key in cls.supported_input_keys]
31
+
32
+ # Check required_keys
33
+ for key in cls.required_input_keys:
34
+ if key not in input_keys:
35
+ raise ValueError(f"feature `{key}` is required for MIXTokenDataset")
36
+ # Output features must include all required output keys
37
+ for key in cls.required_output_keys:
38
+ if key not in input_keys:
39
+ input_keys.append(key)
40
+
41
+ batched_features = {key: [] for key in input_keys}
42
+
43
+ for record in batch_records:
44
+ batched_features["input_ids"].extend(record["input_ids"])
45
+ batched_features["labels"].extend(record["labels"])
46
+ seq_length = len(record["input_ids"])
47
+ # If attention_mask is not given, assume it's causal mask
48
+ attention_mask = record.get("attention_mask", np.tril(np.ones([seq_length, seq_length], dtype=bool)))
49
+ batched_features["attention_mask"].append(attention_mask)
50
+ # NOTE: position_ids is optional and not required by every model
51
+ # We append instead of extend here to accomodate 2D position ids
52
+ if "position_ids" in record:
53
+ batched_features["position_ids"].append(record["position_ids"])
54
+ if "images" in record:
55
+ batched_features["images"].append(record["images"])
56
+
57
+ block_attention_mask = block_diag(*batched_features["attention_mask"])
58
+ # convert to 3-D [batch_size(1), seq_length, seq_length]
59
+ batched_features["attention_mask"] = np.expand_dims(block_attention_mask, axis=0)
60
+ if "position_ids" in batched_features:
61
+ # Accommodate both 1D and 2D position ids
62
+ batched_features["position_ids"] = np.concatenate(batched_features["position_ids"], axis=-1).tolist()
63
+ return batched_features
64
+
65
+
66
+ class MIXTokenMapDataset(MIXToken, Dataset):
67
+ """
68
+ MIXToken is a unique feature of PaddleMix training, which replaces traditional pad tokens by
69
+ concatenating effective tokens to increase the throughput of a single sample and improve training speed.
70
+
71
+ traditional pad tokens:
72
+ len( imageToken + query + paddingToken ) = max_length
73
+
74
+ MIXToken:
75
+ len( imageToken1 + query1 + imageToken2 + query2 + ... + paddingToken ) = max_length
76
+
77
+ """
78
+
79
+ def __init__(self, data, max_length, processor=None, tokenizer=None, mode="train"):
80
+ self.max_length = max_length
81
+ self.processor = processor
82
+ self.tokenizer = tokenizer
83
+ self.mode = mode
84
+ self.new_data = self._create_intokens_data(data)
85
+
86
+ def _create_intokens_data(self, data):
87
+ batch_records, max_len = [], 0
88
+ cur_len_so_far = 0
89
+
90
+ total_data = []
91
+
92
+ for i in tqdm(range(len(data))):
93
+ record = data[i]
94
+
95
+ if self.processor:
96
+ record = self.processor(record=record, mode=self.mode)
97
+
98
+ if getattr(self.tokenizer, "image_token_span", None) is not None and record["images"] is not None:
99
+ image_token_span = self.tokenizer.image_token_span - 1 # image token
100
+ else:
101
+ image_token_span = 0
102
+
103
+ max_len = max(max_len, len(record["input_ids"]))
104
+ to_append = (cur_len_so_far + int(image_token_span) + len(record["input_ids"])) <= self.max_length
105
+
106
+ if to_append:
107
+ batch_records.append(record)
108
+ cur_len_so_far += len(record["input_ids"]) + image_token_span
109
+ else:
110
+ # exceed max length
111
+ padded_list = self._pad_batch_records(batch_records)
112
+ total_data.append(padded_list)
113
+ # reset
114
+ batch_records, max_len = [], 0
115
+ cur_len_so_far = 0
116
+ # append current data
117
+ batch_records.append(record)
118
+ cur_len_so_far += len(record["input_ids"]) + image_token_span
119
+
120
+ # remaining data
121
+ if batch_records:
122
+ padded_list = self._pad_batch_records(batch_records)
123
+ total_data.append(padded_list)
124
+
125
+ return total_data
126
+
127
+ def __getitem__(self, idx):
128
+ return self.new_data[idx]
129
+
130
+ def __len__(self):
131
+ return len(self.new_data)
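A toy illustration of the packing behaviour described in the docstring above; the records and max_length are made-up values:

    records = [
        {"input_ids": [1, 2, 3], "labels": [-100, 2, 3]},
        {"input_ids": [4, 5], "labels": [4, 5]},
        {"input_ids": [6, 7, 8, 9], "labels": [6, 7, 8, 9]},
    ]
    packed = MIXTokenMapDataset(records, max_length=6, tokenizer=None)
    print(len(packed))                        # 2 packed samples: 3+2 tokens, then 4 tokens
    print(packed[0]["attention_mask"].shape)  # (1, 5, 5) block-diagonal causal mask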
PaddleMIX/paddlemix/datasets/vg_caption.py ADDED
@@ -0,0 +1,37 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import collections
16
+ import os
17
+
18
+ __all__ = ["VGCaption"]
19
+ from paddlemix.datasets.caption_dataset import CaptionDataset
20
+
21
+
22
+ class VGCaption(CaptionDataset):
23
+ """
24
+ VG Caption dataset.
25
+ """
26
+
27
+ URL = "https://bj.bcebos.com/paddlemix/datasets/vg.tar.gz"
28
+ META_INFO = collections.namedtuple("META_INFO", ("images", "annotations", "images_md5", "annotations_md5"))
29
+ MD5 = ""
30
+ SPLITS = {
31
+ "train": META_INFO(
32
+ os.path.join("coco", "images"),
33
+ os.path.join("coco", "annotations/vg_caption.json"),
34
+ "",
35
+ "",
36
+ ),
37
+ }
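A brief, hedged sketch of loading the dataset registered above, assuming the generic load_dataset entry point exported by paddlemix.datasets resolves the name "vg_caption":

    from paddlemix.datasets import load_dataset  # assumed entry point

    train_ds = load_dataset("vg_caption", splits="train")  # fetches vg.tar.gz on first use
    print(len(train_ds))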
PaddleMIX/paddlemix/demo_images/critic_img_seven.png ADDED
PaddleMIX/paddlemix/external_ops/setup.py ADDED
@@ -0,0 +1,107 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import multiprocessing
16
+ import os
17
+
18
+
19
+ def get_gencode_flags():
20
+ import paddle
21
+
22
+ prop = paddle.device.cuda.get_device_properties()
23
+ cc = prop.major * 10 + prop.minor
24
+ return ["-gencode", "arch=compute_{0},code=sm_{0}".format(cc)]
25
+
26
+
27
+ def run(func):
28
+ p = multiprocessing.Process(target=func)
29
+ p.start()
30
+ p.join()
31
+
32
+
33
+ def change_pwd():
34
+ path = os.path.dirname(__file__)
35
+ if path:
36
+ os.chdir(path)
37
+
38
+
39
+ def setup_fast_ln():
40
+ from paddle.utils.cpp_extension import CUDAExtension, setup
41
+
42
+ gencode_flags = get_gencode_flags()
43
+ change_pwd()
44
+ setup(
45
+ name="fast_ln",
46
+ ext_modules=CUDAExtension(
47
+ sources=[
48
+ "fast_ln/ln_api.cpp",
49
+ "fast_ln/ln_bwd_semi_cuda_kernel.cu",
50
+ "fast_ln/ln_fwd_cuda_kernel.cu",
51
+ ],
52
+ extra_compile_args={
53
+ "cxx": ["-O3"],
54
+ "nvcc": [
55
+ "-O3",
56
+ "-U__CUDA_NO_HALF_OPERATORS__",
57
+ "-U__CUDA_NO_HALF_CONVERSIONS__",
58
+ "-U__CUDA_NO_BFLOAT16_OPERATORS__",
59
+ "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
60
+ "-U__CUDA_NO_BFLOAT162_OPERATORS__",
61
+ "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
62
+ "-I./apex/contrib/csrc/layer_norm/",
63
+ "--expt-relaxed-constexpr",
64
+ "--expt-extended-lambda",
65
+ "--use_fast_math",
66
+ ]
67
+ + gencode_flags,
68
+ },
69
+ ),
70
+ )
71
+
72
+
73
+ def setup_fused_ln():
74
+ from paddle.utils.cpp_extension import CUDAExtension, setup
75
+
76
+ gencode_flags = get_gencode_flags()
77
+ change_pwd()
78
+ setup(
79
+ name="fused_ln",
80
+ ext_modules=CUDAExtension(
81
+ sources=[
82
+ "fused_ln/layer_norm_cuda.cu",
83
+ ],
84
+ extra_compile_args={
85
+ "cxx": ["-O3"],
86
+ "nvcc": [
87
+ "-O3",
88
+ "-U__CUDA_NO_HALF_OPERATORS__",
89
+ "-U__CUDA_NO_HALF_CONVERSIONS__",
90
+ "-U__CUDA_NO_BFLOAT16_OPERATORS__",
91
+ "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
92
+ "-U__CUDA_NO_BFLOAT162_OPERATORS__",
93
+ "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
94
+ "-I./apex/contrib/csrc/layer_norm/",
95
+ "--expt-relaxed-constexpr",
96
+ "--expt-extended-lambda",
97
+ "--use_fast_math",
98
+ "-maxrregcount=50",
99
+ ]
100
+ + gencode_flags,
101
+ },
102
+ ),
103
+ )
104
+
105
+
106
+ run(setup_fast_ln)
107
+ run(setup_fused_ln)
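Because the two `setup()` calls execute at import time, running this file once builds both extensions: `change_pwd()` makes the relative source paths resolve regardless of the caller's working directory, and `run()` isolates each `setup()` in its own subprocess so the two builds do not share interpreter state. A hedged sketch of driving the build, assuming a CUDA-capable machine with PaddlePaddle installed:

```python
# Sketch only: trigger both fast_ln and fused_ln builds with a single invocation.
# The cwd below assumes you launch from the repository root.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "setup.py", "install"],
    cwd="PaddleMIX/paddlemix/external_ops",
)
```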
PaddleMIX/paddlemix/metrics/clip_zero_shot.py ADDED
@@ -0,0 +1,146 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+
17
+ import paddle
18
+ import paddle.nn.functional as F
19
+ from tqdm import tqdm
20
+
21
+ from paddlemix.processors.tokenizer import tokenize
22
+
23
+
24
+ def zero_shot_classifier(model, classnames_filename, templates_filename, args, text_tower=None):
25
+ classnames = [i.strip() for i in open(classnames_filename).readlines()]
26
+ templates = [i.strip() for i in open(templates_filename).readlines()]
27
+
28
+ if text_tower is None:
29
+ if hasattr(model, "_layers"):
30
+ text_tower = model._layers.encode_text
31
+ else:
32
+ text_tower = model.encode_text
33
+ tokenizer = tokenize
34
+ with paddle.no_grad():
35
+ zeroshot_weights = []
36
+ for classname in tqdm(classnames):
37
+ texts = [template.format(classname) for template in templates] # format with class
38
+ texts = tokenizer(texts) # tokenize
39
+ class_embeddings = text_tower(texts)
40
+ class_embedding = F.normalize(class_embeddings, axis=-1).mean(0)
41
+ class_embedding /= class_embedding.norm()
42
+ zeroshot_weights.append(class_embedding)
43
+ zeroshot_weights = paddle.stack(zeroshot_weights, axis=1)
44
+ return zeroshot_weights
45
+
46
+
47
+ def accuracy(output, target, topk=(1,)):
48
+ """Computes the accuracy over the k top predictions for the specified values of k"""
49
+ maxk = min(max(topk), output.shape[1])
50
+ pred = output.topk(maxk, 1, True, True)[1].t()
51
+ correct = pred == target.reshape([1, -1]).expand_as(pred)
52
+ return [
53
+ float(correct[: min(k, maxk)].reshape([-1]).astype(paddle.float32).sum(0, keepdim=True).numpy() * 100.0)
54
+ for k in topk
55
+ ]
56
+
57
+
58
+ class DummyAutocast:
59
+ def __init__(self, *args, **kwargs):
60
+ return
61
+
62
+ def __enter__(self, *args, **kwargs):
63
+ return
64
+
65
+ def __exit__(self, *args, **kwargs):
66
+ return
67
+
68
+
69
+ def get_autocast(precision):
70
+ if precision == "float16":
71
+ return paddle.amp.auto_cast
72
+ elif precision == "bfloat16":
73
+ return lambda: paddle.amp.auto_cast(dtype="bfloat16")
74
+ else:
75
+ return DummyAutocast
76
+
77
+
78
+ def get_cast_dtype(args):
79
+ cast_dtype = None
80
+ if args.bf16:
81
+ cast_dtype = "bfloat16"
82
+ elif args.fp16:
83
+ cast_dtype = "float16"
84
+ return cast_dtype
85
+
86
+
87
+ class ClipZeroShot:
88
+ def __init__(self, model, args):
89
+ data_path = args.classification_eval.strip()
90
+ classname_filename = f"{data_path}/labels.txt"
91
+ template_filename = f"{data_path}/templates.txt"
92
+
93
+ self.data_name = os.path.basename(args.classification_eval)
94
+ classifier_filename = (
95
+ f"{os.path.dirname(classname_filename)}/{args.pretrained_text_model}_{self.data_name}_classifier.pdparams"
96
+ )
97
+ if os.path.exists(classifier_filename):
98
+ print("load classifier from disk")
99
+ classifier = paddle.load(classifier_filename)
100
+ else:
101
+ print("constructing classifier: {}.".format(classifier_filename))
102
+ classifier = zero_shot_classifier(model, classname_filename, template_filename, args)
103
+ paddle.save(classifier, classifier_filename)
104
+ print(f"zero-shot evaluating classification task: {self.data_name}")
105
+ if args.bf16:
106
+ self.classifier = classifier.astype(paddle.bfloat16)
107
+ elif args.fp16:
108
+ self.classifier = classifier.astype(paddle.float16)
109
+ else:
110
+ self.classifier = classifier
111
+ self.batch_size = args.per_device_eval_batch_size
112
+ self.cast_dtype = get_cast_dtype(args)
113
+
114
+ def zero_shot_eval(self, evalres):
115
+ results = {}
116
+ print("Extract features done, starting zero-shot classification evaluation.")
117
+ predictions, labels = evalres.predictions, evalres.label_ids
118
+ n = predictions.shape[0]
119
+ top1, top5 = 0.0, 0.0
120
+
121
+ autocast = get_autocast(self.cast_dtype)
122
+ with paddle.no_grad():
123
+ for step in tqdm(range((predictions.shape[0] + self.batch_size - 1) // self.batch_size)):
124
+ with autocast():
125
+ image_features = paddle.to_tensor(
126
+ predictions[step * self.batch_size : (step + 1) * self.batch_size]
127
+ )
128
+ target = paddle.to_tensor(labels[step * self.batch_size : (step + 1) * self.batch_size])
129
+ logits = 100.0 * image_features @ self.classifier
130
+ if logits.shape[-1] < 5:
131
+ (acc1,) = accuracy(logits, target, topk=(1,))
132
+ acc5 = -1
133
+ else:
134
+ acc1, acc5 = accuracy(logits, target, topk=(1, 5))
135
+ top1 += acc1
136
+ top5 += acc5
137
+ top1 = top1 / n
138
+ top5 = top5 / n
139
+ results["val/imagenet-zeroshot-val-top1"] = top1
140
+ results["val/imagenet-zeroshot-val-top5"] = top5
141
+
142
+ results["top1"] = top1
143
+ print(f"zero-shot classification task: {self.data_name}: top1: {top1}, top5: {top5}")
144
+ print("Finished zero-shot evaluation.")
145
+
146
+ return results
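A minimal sketch of wiring `ClipZeroShot` into an evaluation run. The `args` fields mirror the attributes read in `__init__` and `get_cast_dtype`, and the result object only needs `predictions` (pre-extracted image features) and `label_ids`, as consumed by `zero_shot_eval`; everything beyond those names (paths, model handle, feature width) is assumed for illustration.

```python
# Hedged sketch, assuming `model` is an already-loaded PaddleMIX CLIP model that
# exposes encode_text, and that labels.txt / templates.txt live under the data path.
from types import SimpleNamespace

import numpy as np

args = SimpleNamespace(
    classification_eval="data/imagenet-1k",   # hypothetical path holding labels.txt / templates.txt
    pretrained_text_model="clip-vit-large",   # only used to name the cached classifier file
    bf16=False,
    fp16=False,
    per_device_eval_batch_size=64,
)
scorer = ClipZeroShot(model, args)

# predictions: image features matching the text-tower embedding width (768 here is illustrative)
evalres = SimpleNamespace(
    predictions=np.random.rand(8, 768).astype("float32"),
    label_ids=np.zeros(8, dtype="int64"),
)
metrics = scorer.zero_shot_eval(evalres)
print(metrics["top1"])
```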