+
+Mask image | Reference image | Generated image
+
+#### text_guided_image_inpainting-kandinsky2_2
+```python
+import numpy as np
+import paddle
+
+from ppdiffusers import KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline
+from ppdiffusers.utils import load_image
+
+pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+ "kandinsky-community/kandinsky-2-2-prior", paddle_dtype=paddle.float16
+)
+prompt = "a hat"
+image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
+pipe = KandinskyV22InpaintPipeline.from_pretrained(
+ "kandinsky-community/kandinsky-2-2-decoder-inpaint", paddle_dtype=paddle.float16
+)
+init_image = load_image(
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
+)
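+# pixels set to 1 mark the region to be repainted; here the top strip of the cat image is masked so the hat is generated there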
+mask = np.zeros((768, 768), dtype=np.float32)
+mask[:250, 250:-250] = 1
+out = pipe(
+ image=init_image,
+ mask_image=mask,
+ image_embeds=image_emb,
+ negative_image_embeds=zero_image_emb,
+ height=768,
+ width=768,
+ num_inference_steps=50,
+)
+image = out.images[0]
+image.save("text_guided_image_inpainting-kandinsky2_2-result-cat_with_hat.png")
+```
+
+Original image | Generated image
+
+Image-to-Image Text-Guided Generation
+
+#### image_to_image_text_guided_generation-stable_diffusion
+```python
+import paddle
+
+from ppdiffusers import StableDiffusionImg2ImgPipeline
+from ppdiffusers.utils import load_image
+
+# Load the pipeline
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+
+# Download the initial image
+url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
+
+init_image = load_image(url).resize((768, 512))
+
+prompt = "A fantasy landscape, trending on artstation"
+# Use fp16 (AMP auto-cast) to speed up generation
+with paddle.amp.auto_cast(True):
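+    # strength=0.75 controls how far the result may deviate from the initial sketch (0 keeps it unchanged, 1 ignores it)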
+ image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0]
+
+image.save("fantasy_landscape.png")
+```
+
+Original image | Generated image
+
+#### image_to_image_text_guided_generation-stable_diffusion_xl
+```python
+import paddle
+from ppdiffusers import StableDiffusionXLImg2ImgPipeline
+from ppdiffusers.utils import load_image
+
+pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-xl-refiner-1.0",
+ paddle_dtype=paddle.float16,
+ # from_hf_hub=True,
+ # from_diffusers=True,
+ variant="fp16"
+)
+url = "https://paddlenlp.bj.bcebos.com/models/community/westfish/develop-0-19-3/000000009.png"
+init_image = load_image(url).convert("RGB")
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt, image=init_image).images[0]
+image.save('sdxl_image2image.png')
+```
+
+Original image | Generated image
+
+#### image_to_image_text_guided_generation-kandinsky2_2
+```python
+import paddle
+
+from ppdiffusers import KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline
+from ppdiffusers.utils import load_image
+
+pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+ "kandinsky-community/kandinsky-2-2-prior", paddle_dtype=paddle.float16
+)
+prompt = "A red cartoon frog, 4k"
+image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
+pipe = KandinskyV22Img2ImgPipeline.from_pretrained(
+ "kandinsky-community/kandinsky-2-2-decoder", paddle_dtype=paddle.float16
+)
+
+init_image = load_image(
+ "https://hf-mirror.com/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/frog.png"
+)
+image = pipe(
+ image=init_image,
+ image_embeds=image_emb,
+ negative_image_embeds=zero_image_emb,
+ height=768,
+ width=768,
+ num_inference_steps=100,
+ strength=0.2,
+).images
+image[0].save("image_to_image_text_guided_generation-kandinsky2_2-result-red_frog.png")
+```
+
+Original image | Generated image
+
+Dual Text and Image Guided Generation
+
+#### dual_text_and_image_guided_generation-versatile_diffusion
+```python
+from ppdiffusers import VersatileDiffusionDualGuidedPipeline
+from ppdiffusers.utils import load_image
+
+url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
+image = load_image(url)
+text = "a red car in the sun"
+
+pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion")
+pipe.remove_unused_weights()
+
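+# text_to_image_strength weights the two conditions: closer to 1 follows the text prompt more, closer to 0 follows the reference image more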
+text_to_image_strength = 0.75
+image = pipe(prompt=text, image=image, text_to_image_strength=text_to_image_strength).images[0]
+image.save("versatile-diffusion-red_car.png")
+```
+
+Original image | Generated image
+
+### Text-and-Video Multimodal
+
+Text-to-Video Generation
+
+#### text_to_video_generation-lvdm
+
+```python
+import paddle
+
+from ppdiffusers import LVDMTextToVideoPipeline
+
+# Load the model and scheduler
+pipe = LVDMTextToVideoPipeline.from_pretrained("westfish/lvdm_text2video_orig_webvid_2m")
+
+# Run the pipeline for inference
+seed = 2013
+generator = paddle.Generator().manual_seed(seed)
+samples = pipe(
+ prompt="cutting in kitchen",
+ num_frames=16,
+ height=256,
+ width=256,
+ num_inference_steps=50,
+ generator=generator,
+ guidance_scale=15,
+ eta=1,
+ save_dir=".",
+ save_name="text_to_video_generation-lvdm-result-ddim_lvdm_text_to_video_ucf",
+ encoder_type="2d",
+ scale_factor=0.18215,
+ shift_factor=0,
+)
+```
+
+#### text_to_video_generation-synth
+
+```python
+import imageio
+
+from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline
+
+pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+
+prompt = "An astronaut riding a horse."
+video_frames = pipe(prompt, num_inference_steps=25).frames
+imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8)
+```
+
+#### text_to_video_generation-synth with zeroscope_v2_XL
+
+```python
+import imageio
+
+from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline
+
+# from ppdiffusers.utils import export_to_video
+
+pipe = TextToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_XL")
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+
+prompt = "An astronaut riding a horse."
+video_frames = pipe(prompt, num_inference_steps=50, height=320, width=576, num_frames=24).frames
+imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8)
+```
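+
+As an alternative to calling `imageio.mimsave` yourself, the `export_to_video` helper hinted at by the commented import above can write the frames to an .mp4 for you. A minimal sketch, assuming `ppdiffusers.utils.export_to_video` exposes the same interface as its diffusers counterpart:
+
+```python
+from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline
+from ppdiffusers.utils import export_to_video  # assumed available, as hinted by the commented import above
+
+pipe = TextToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_XL")
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+
+prompt = "An astronaut riding a horse."
+video_frames = pipe(prompt, num_inference_steps=50, height=320, width=576, num_frames=24).frames
+# export_to_video writes the frames to an .mp4 file and returns its path
+video_path = export_to_video(video_frames, "text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4")
+print(video_path)
+```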
+
+#### text_to_video_generation-zero
+
+```python
+import imageio
+
+# pip install imageio[ffmpeg]
+import paddle
+
+from ppdiffusers import TextToVideoZeroPipeline
+
+model_id = "runwayml/stable-diffusion-v1-5"
+pipe = TextToVideoZeroPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16)
+
+prompt = "A panda is playing guitar on times square"
+result = pipe(prompt=prompt).images
+result = [(r * 255).astype("uint8") for r in result]
+imageio.mimsave("text_to_video_generation-zero-result-panda.mp4", result, fps=4)
+```
+
+### Text-and-Audio Multimodal
+
+Text-to-Audio Generation
+
+#### text_to_audio_generation-audio_ldm
+
+```python
+import paddle
+import scipy
+
+from ppdiffusers import AudioLDM2Pipeline
+
+pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", paddle_dtype=paddle.float16)
+
+prompt = "Musical constellations twinkling in the night sky, forming a cosmic melody."
+negative_prompt = "Low quality."
+audio = pipe(prompt, negative_prompt=negative_prompt, num_inference_steps=200, audio_length_in_s=10).audios[0]
+
+output_path = f"{prompt}.wav"
+# save the audio sample as a .wav file
+scipy.io.wavfile.write(output_path, rate=16000, data=audio)
+```
+
+You can convert the [huggingface](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2) model with the following code and then use it directly in Paddle:
+```python
+pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-music", from_hf_hub=True, from_diffusers=True).save_pretrained("cvssp/audioldm2-music")
+```
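+
+After this one-off conversion, the saved weights can be loaded from the local directory without the conversion flags. A minimal sketch (the path below is simply the directory written by `save_pretrained` above):
+
+```python
+import paddle
+
+from ppdiffusers import AudioLDM2Pipeline
+
+# load the converted Paddle weights from the local directory created above
+pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-music", paddle_dtype=paddle.float16)
+```
+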
+### Image
+
+Unconditional Image Generation
+
+#### unconditional_image_generation-latent_diffusion_uncond
+
+```python
+from ppdiffusers import LDMPipeline
+
+# Load the model and scheduler
+pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
+
+# Run the pipeline for inference
+image = pipe(num_inference_steps=200).images[0]
+
+# Save the image
+image.save("ldm_generated_image.png")
+```
+
+Super Resolution
+
+#### super_resolution-latent_diffusion
+```python
+import paddle
+
+from ppdiffusers import LDMSuperResolutionPipeline
+from ppdiffusers.utils import load_image
+
+# Load the pipeline
+pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
+
+# Download the initial image
+url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
+
+init_image = load_image(url).resize((128, 128))
+init_image.save("original-image.png")
+
+# Use fp16 (AMP auto-cast) to speed up generation
+with paddle.amp.auto_cast(True):
+ image = pipe(init_image, num_inference_steps=100, eta=1).images[0]
+
+image.save("super-resolution-image.png")
+```
+
+Original image | Generated image
+
+Image Inpainting
+
+#### image_inpainting-repaint
+```python
+from ppdiffusers import RePaintPipeline, RePaintScheduler
+from ppdiffusers.utils import load_image
+
+img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png"
+mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mask_256.png"
+
+# Load the original image and the mask as PIL images
+original_image = load_image(img_url).resize((256, 256))
+mask_image = load_image(mask_url).resize((256, 256))
+
+scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", subfolder="scheduler")
+pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
+
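+# jump_length and jump_n_sample set RePaint's resampling schedule: how far and how often the sampler jumps back in time to re-harmonize the inpainted region with the known pixels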
+output = pipe(
+ original_image=original_image,
+ mask_image=mask_image,
+ num_inference_steps=250,
+ eta=0.0,
+ jump_length=10,
+ jump_n_sample=10,
+)
+inpainted_image = output.images[0]
+
+inpainted_image.save("repaint-image.png")
+```
+
+Original image | Mask image | Generated image
+
+Image Variation
+
+#### image_variation-versatile_diffusion
+```python
+from ppdiffusers import VersatileDiffusionImageVariationPipeline
+from ppdiffusers.utils import load_image
+
+url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
+image = load_image(url)
+
+pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion")
+
+image = pipe(image).images[0]
+image.save("versatile-diffusion-car_variation.png")
+```
+
+Original image | Generated image
+
+### Audio
+
+Unconditional Audio Generation
+
+#### unconditional_audio_generation-audio_diffusion
+
+```python
+from scipy.io.wavfile import write
+from ppdiffusers import AudioDiffusionPipeline
+import paddle
+
+# Load the model and scheduler
+pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
+pipe.set_progress_bar_config(disable=None)
+generator = paddle.Generator().manual_seed(42)
+
+output = pipe(generator=generator)
+audio = output.audios[0]
+image = output.images[0]
+
+# Save each audio channel to a local .wav file
+for i, channel in enumerate(audio):
+    write(f"audio_diffusion_test{i}.wav", pipe.mel.config.sample_rate, channel.transpose())
+
+# Save the generated mel-spectrogram image
+image.save("audio_diffusion_test.png")
+```
+
+#### unconditional_audio_generation-spectrogram_diffusion
+
+```python
+import paddle
+import scipy
+
+from ppdiffusers import MidiProcessor, SpectrogramDiffusionPipeline
+from ppdiffusers.utils.download_utils import ppdiffusers_url_download
+
+# Download MIDI from: wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid
+mid_file_path = ppdiffusers_url_download(
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid", cache_dir="."
+)
+pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion", paddle_dtype=paddle.float16)
+processor = MidiProcessor()
+output = pipe(processor(mid_file_path))
+audio = output.audios[0]
+
+output_path = "unconditional_audio_generation-spectrogram_diffusion-result-beethoven_hammerklavier_2.wav"
+# save the audio sample as a .wav file
+scipy.io.wavfile.write(output_path, rate=16000, data=audio)
+```
+
+## License
+PPDiffusers is released under the [Apache-2.0 License](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/LICENSE).
+
+Stable Diffusion is released under [The CreativeML OpenRAIL M license](https://huggingface.co/spaces/CompVis/stable-diffusion-license).
+> The CreativeML OpenRAIL M is an [Open RAIL M license](https://www.licenses.ai/blog/2022/8/18/naming-convention-of-responsible-ai-licenses), adapted from the work that [BigScience](https://bigscience.huggingface.co/) and [the RAIL Initiative](https://www.licenses.ai/) are jointly carrying in the area of responsible AI licensing. See also [the article about the BLOOM Open RAIL license](https://bigscience.huggingface.co/blog/the-bigscience-rail-license) on which this license is based.
+
+Stable Diffusion 3 is released under the [Stability Community License](https://stability.ai/license).
+> Community License: Free for research, non-commercial, and commercial use for organisations or individuals with less than $1M annual revenue. You only need a paid Enterprise license if your yearly revenues exceed USD$1M and you use Stability AI models in commercial products or services. Read more: https://stability.ai/license
+
+## Acknowledgements
+We drew on 🤗 Hugging Face's [Diffusers](https://github.com/huggingface/diffusers) and its excellent design for working with pretrained diffusion models; we thank the Hugging Face authors and their open-source community.
+
+## Citation
+
+```bibtex
+@misc{ppdiffusers,
+ author = {PaddlePaddle Authors},
+ title = {PPDiffusers: State-of-the-art diffusion model toolkit based on PaddlePaddle},
+ year = {2022},
+ publisher = {GitHub},
+ journal = {GitHub repository},
+ howpublished = {\url{https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers}}
+}
+```
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/export_model.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/export_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..05a4c0e5335b5714dc1fc6f658431d0c36ec5a34
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/export_model.py
@@ -0,0 +1,263 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+# set USE_PPXFORMERS=False to avoid using ppxformers
+os.environ["USE_PPXFORMERS"] = "False"
+from pathlib import Path
+from types import MethodType
+
+import paddle
+
+from ppdiffusers import (
+ ControlNetModel,
+ PaddleInferRuntimeModel,
+ PaddleInferStableDiffusionControlNetPipeline,
+ StableDiffusionControlNetPipeline,
+ UNet2DConditionModel,
+)
+
+
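+# Wrap ControlNet and UNet in a single Layer so paddle.jit.to_static can export them as one static graph; forward() feeds the ControlNet residuals straight into the UNet.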
+class ControlNetWithUnetModel(paddle.nn.Layer):
+ def __init__(
+ self,
+ unet,
+ controlnet,
+ ):
+ super().__init__()
+ self.unet = unet
+ self.controlnet = controlnet
+
+ def forward(
+ self,
+ sample,
+ timestep,
+ encoder_hidden_states,
+ controlnet_cond,
+ controlnet_conditioning_scale,
+ return_dict=True,
+ ):
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
+ sample,
+ timestep,
+ encoder_hidden_states=encoder_hidden_states,
+ controlnet_cond=controlnet_cond,
+ conditioning_scale=controlnet_conditioning_scale,
+ return_dict=False,
+ )
+
+ noise_pred = self.unet(
+ sample,
+ timestep,
+ encoder_hidden_states=encoder_hidden_states,
+ down_block_additional_residuals=down_block_res_samples,
+ mid_block_additional_residual=mid_block_res_sample,
+ return_dict=return_dict,
+ )
+ return noise_pred
+
+
+def convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
+ model_path: str,
+ controlnet_model_path: str,
+ output_path: str,
+ sample: bool = False,
+ height: int = None,
+ width: int = None,
+):
+ unet_tmp = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=False, subfolder="unet")
+ controlnet_tmp = ControlNetModel.from_pretrained(controlnet_model_path, resnet_pre_temb_non_linearity=False)
+
+ pipeline = StableDiffusionControlNetPipeline.from_pretrained(
+ model_path,
+ unet=unet_tmp,
+ controlnet=controlnet_tmp,
+ safety_checker=None,
+ feature_extractor=None,
+ requires_safety_checker=False,
+ )
+ output_path = Path(output_path)
+ # calculate latent's H and W
+ latent_height = height // 8 if height is not None else None
+ latent_width = width // 8 if width is not None else None
+ # get arguments
+ cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280
+ unet_channels = pipeline.unet.config.in_channels # 4
+ vae_in_channels = pipeline.vae.config.in_channels # 3
+ vae_latent_channels = pipeline.vae.config.latent_channels # 4
+ print(
+ f"cross_attention_dim: {cross_attention_dim}\n",
+ f"unet_in_channels: {unet_channels}\n",
+ f"vae_encoder_in_channels: {vae_in_channels}\n",
+ f"vae_decoder_latent_channels: {vae_latent_channels}",
+ )
+ # 1. Convert text_encoder
+ text_encoder = paddle.jit.to_static(
+ pipeline.text_encoder,
+ input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids
+ )
+    save_path = os.path.join(output_path, "text_encoder", "inference")
+ paddle.jit.save(text_encoder, save_path)
+ print(f"Save text_encoder model in {save_path} successfully.")
+ del pipeline.text_encoder
+
+ # wrap unet + controlnet
+ new_unet = ControlNetWithUnetModel(unet=pipeline.unet, controlnet=pipeline.controlnet)
+
+ # 2. Convert unet
+ unet = paddle.jit.to_static(
+ new_unet,
+ input_spec=[
+ paddle.static.InputSpec(
+ shape=[None, unet_channels, latent_height, latent_width],
+ dtype="float32",
+ name="sample",
+ ), # sample
+ paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep
+ paddle.static.InputSpec(
+ shape=[None, None, cross_attention_dim],
+ dtype="float32",
+ name="encoder_hidden_states",
+ ), # encoder_hidden_states
+ paddle.static.InputSpec(
+ shape=[None, vae_in_channels, height, width],
+ dtype="float32",
+ name="controlnet_cond",
+ ), # controlnet_cond
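+            # one conditioning scale per ControlNet residual: 3 per down block plus 1 for the mid block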
+ paddle.static.InputSpec(
+ shape=[len(pipeline.unet.config.block_out_channels) * 3 + 1],
+ dtype="float32",
+ name="controlnet_conditioning_scale",
+ ), # controlnet_conditioning_scale
+ ],
+ )
+
+    save_path = os.path.join(output_path, "unet", "inference")
+ paddle.jit.save(unet, save_path)
+ print(f"Save unet model in {save_path} successfully.")
+ del pipeline.unet
+ del new_unet
+
+ def forward_vae_encoder_mode(self, z):
+ return self.encode(z, True).latent_dist.mode()
+
+ def forward_vae_encoder_sample(self, z):
+ return self.encode(z, True).latent_dist.sample()
+
+ # 3. Convert vae encoder
+ vae_encoder = pipeline.vae
+ if sample:
+ vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder)
+ else:
+ vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder)
+
+ vae_encoder = paddle.jit.to_static(
+ vae_encoder,
+ input_spec=[
+ paddle.static.InputSpec(
+ shape=[None, vae_in_channels, height, width],
+ dtype="float32",
+ name="sample", # N, C, H, W
+ ), # latent
+ ],
+ )
+ # Save vae_encoder in static graph model.
+    save_path = os.path.join(output_path, "vae_encoder", "inference")
+ paddle.jit.save(vae_encoder, save_path)
+ print(f"Save vae_encoder model in {save_path} successfully.")
+
+    # 4. Convert vae decoder
+ vae_decoder = pipeline.vae
+
+ def forward_vae_decoder(self, z):
+ return self.decode(z, True).sample
+
+ vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder)
+ vae_decoder = paddle.jit.to_static(
+ vae_decoder,
+ input_spec=[
+ paddle.static.InputSpec(
+ shape=[None, vae_latent_channels, latent_height, latent_width],
+ dtype="float32",
+ name="latent_sample",
+ ), # latent_sample
+ ],
+ )
+ # Save vae_decoder in static graph model.
+    save_path = os.path.join(output_path, "vae_decoder", "inference")
+ paddle.jit.save(vae_decoder, save_path)
+ print(f"Save vae_decoder model in {save_path} successfully.")
+ del pipeline.vae
+
+ paddleinfer_pipeline = PaddleInferStableDiffusionControlNetPipeline(
+ vae_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_encoder"),
+ vae_decoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_decoder"),
+ text_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "text_encoder"),
+ unet=PaddleInferRuntimeModel.from_pretrained(output_path / "unet"),
+ tokenizer=pipeline.tokenizer,
+ scheduler=pipeline.scheduler,
+ safety_checker=None,
+ feature_extractor=None,
+ image_encoder=None,
+ requires_safety_checker=False,
+ )
+ paddleinfer_pipeline.save_pretrained(str(output_path))
+ print("PaddleInfer pipeline saved to", output_path)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ "--pretrained_model_name_or_path",
+ type=str,
+ default="runwayml/stable-diffusion-v1-5",
+ help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).",
+ )
+ parser.add_argument(
+ "--controlnet_pretrained_model_name_or_path",
+ type=str,
+ default="lllyasviel/sd-controlnet-canny",
+ help="Path to the `ppdiffusers` controlnet_pretrained_model_name_or_path checkpoint to convert (either a local directory or on the bos).",
+ )
+ parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
+ parser.add_argument(
+ "--sample",
+ action="store_true",
+ default=False,
+        help="If set, export the vae encoder using latent_dist.sample(); otherwise latent_dist.mode() is used.",
+ )
+ parser.add_argument(
+ "--height",
+ type=int,
+ default=None,
+ help="The height of output images. Default: None",
+ )
+ parser.add_argument(
+ "--width",
+ type=int,
+ default=None,
+ help="The width of output images. Default: None",
+ )
+ args = parser.parse_args()
+
+ convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
+ args.pretrained_model_name_or_path,
+ args.controlnet_pretrained_model_name_or_path,
+ args.output_path,
+ args.sample,
+ args.height,
+ args.width,
+ )
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_paddle.sh b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_paddle.sh
new file mode 100644
index 0000000000000000000000000000000000000000..babde7cd92a54bcb31ab4e4c89e1c7c2017e33f4
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_paddle.sh
@@ -0,0 +1,32 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# attention raw fp16
+python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
+
+# attention cutlass fp16
+python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type cutlass --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
+
+# attention flash fp16
+python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type flash --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
+
+
+# attention raw fp32
+python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
+
+# attention cutlass fp32
+python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type cutlass --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
+
+# attention flash fp32
+python infer_dygraph_paddle.py --scheduler "ddim" --task_name all --attention_type flash --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_torch.sh b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_torch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..40eb9bc45707a567eb68415727060bdf1344c5cc
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/benchmark_torch.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# attention raw fp16
+python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
+
+# attention sdp fp16
+python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type sdp --use_fp16 True --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
+
+
+# attention raw fp32
+python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
+
+# attention sdp fp32
+python infer_dygraph_torch.py --scheduler "ddim" --task_name all --attention_type sdp --use_fp16 False --inference_steps 50 --height 512 --width 512 --benchmark_steps 10
\ No newline at end of file
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/inference.sh b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..24541c8f5297b87a28c1c343f1addd9608a558e8
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/controlnet/scripts/inference.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ==============================================================================
+# using the plain paddle backend to run the static model is not fast;
+# this script is only meant to verify that the inference is correct.
+# ==============================================================================
+# text2img
+python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name text2img
+
+# img2img
+python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name img2img
+
+# inpaint
+python infer.py --model_dir static_model/stable-diffusion-v1-5-canny/ --scheduler "ddim" --backend paddle --device gpu --task_name inpaint_legacy
\ No newline at end of file
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/export_model.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/export_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..796b2c99ac368056563c192e6e92cf18c46ccb3e
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/export_model.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+# set USE_PPXFORMERS=False to avoid using ppxformers
+os.environ["USE_PPXFORMERS"] = "False"
+from pathlib import Path
+from types import MethodType
+
+import paddle
+from unet_2d_condition_housing import UNet2DConditionModelSDHousing
+
+from ppdiffusers import (
+ PaddleInferRuntimeModel,
+ PaddleInferStableDiffusionInpaintPipeline,
+ PaddleInferStableDiffusionMegaPipeline,
+ StableDiffusionPipeline,
+)
+
+
+def convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
+ model_path: str,
+ output_path: str,
+ sample: bool = False,
+ height: int = None,
+ width: int = None,
+):
+    # load the deployment-specific UNet variant (UNet2DConditionModelSDHousing); resnet_pre_temb_non_linearity is left disabled here.
+ unet_model = UNet2DConditionModelSDHousing.from_pretrained(
+ model_path, resnet_pre_temb_non_linearity=False, subfolder="unet"
+ )
+ pipeline = StableDiffusionPipeline.from_pretrained(
+ model_path,
+ unet=unet_model,
+ safety_checker=None,
+ )
+ output_path = Path(output_path)
+ # calculate latent's H and W
+ latent_height = height // 8 if height is not None else None
+ latent_width = width // 8 if width is not None else None
+ # get arguments
+ cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280
+ unet_channels = pipeline.unet.config.in_channels # 4 or 9
+ vae_in_channels = pipeline.vae.config.in_channels # 3
+ vae_latent_channels = pipeline.vae.config.latent_channels # 4
+ print(
+ f"cross_attention_dim: {cross_attention_dim}\n",
+ f"unet_in_channels: {unet_channels}\n",
+ f"vae_encoder_in_channels: {vae_in_channels}\n",
+ f"vae_decoder_latent_channels: {vae_latent_channels}",
+ )
+ # 1. Convert text_encoder
+ text_encoder = paddle.jit.to_static(
+ pipeline.text_encoder,
+ input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids
+ )
+    save_path = os.path.join(output_path, "text_encoder", "inference")
+ paddle.jit.save(text_encoder, save_path)
+ print(f"Save text_encoder model in {save_path} successfully.")
+ del pipeline.text_encoder
+
+ # 2. Convert unet
+ unet = paddle.jit.to_static(
+ pipeline.unet,
+ input_spec=[
+ paddle.static.InputSpec(
+ shape=[None, unet_channels, latent_height, latent_width],
+ dtype="float32",
+ name="sample",
+ ), # sample
+ paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep
+ paddle.static.InputSpec(
+ shape=[None, None, cross_attention_dim],
+ dtype="float32",
+ name="encoder_hidden_states",
+ ), # encoder_hidden_states
+ ],
+ )
+    save_path = os.path.join(output_path, "unet", "inference")
+ paddle.jit.save(unet, save_path)
+ print(f"Save unet model in {save_path} successfully.")
+ del pipeline.unet
+
+ def forward_vae_encoder_mode(self, z):
+ return self.encode(z, True).latent_dist.mode()
+
+ def forward_vae_encoder_sample(self, z):
+ return self.encode(z, True).latent_dist.sample()
+
+ # 3. Convert vae encoder
+ vae_encoder = pipeline.vae
+ if sample:
+ vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder)
+ else:
+ vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder)
+
+ vae_encoder = paddle.jit.to_static(
+ vae_encoder,
+ input_spec=[
+ paddle.static.InputSpec(
+ shape=[None, vae_in_channels, height, width],
+ dtype="float32",
+ name="sample", # N, C, H, W
+ ), # latent
+ ],
+ )
+ # Save vae_encoder in static graph model.
+    save_path = os.path.join(output_path, "vae_encoder", "inference")
+ paddle.jit.save(vae_encoder, save_path)
+ print(f"Save vae_encoder model in {save_path} successfully.")
+
+    # 4. Convert vae decoder
+ vae_decoder = pipeline.vae
+
+ def forward_vae_decoder(self, z):
+ return self.decode(z, True).sample
+
+ vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder)
+ vae_decoder = paddle.jit.to_static(
+ vae_decoder,
+ input_spec=[
+ paddle.static.InputSpec(
+ shape=[None, vae_latent_channels, latent_height, latent_width],
+ dtype="float32",
+ name="latent_sample",
+ ), # latent_sample
+ ],
+ )
+ # Save vae_decoder in static graph model.
+    save_path = os.path.join(output_path, "vae_decoder", "inference")
+ paddle.jit.save(vae_decoder, save_path)
+ print(f"Save vae_decoder model in {save_path} successfully.")
+ del pipeline.vae
+
+ if "inpainting" in model_path:
+ fd_pipe_cls = PaddleInferStableDiffusionInpaintPipeline
+ else:
+ fd_pipe_cls = PaddleInferStableDiffusionMegaPipeline
+
+ paddleinfer_pipeline = fd_pipe_cls(
+ vae_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_encoder"),
+ vae_decoder=PaddleInferRuntimeModel.from_pretrained(output_path / "vae_decoder"),
+ text_encoder=PaddleInferRuntimeModel.from_pretrained(output_path / "text_encoder"),
+ unet=PaddleInferRuntimeModel.from_pretrained(output_path / "unet"),
+ tokenizer=pipeline.tokenizer,
+ scheduler=pipeline.scheduler,
+ feature_extractor=pipeline.feature_extractor,
+ image_encoder=None,
+ safety_checker=None,
+ requires_safety_checker=False,
+ )
+ paddleinfer_pipeline.save_pretrained(str(output_path))
+ print("PaddleInfer pipeline saved to", output_path)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ "--pretrained_model_name_or_path",
+ type=str,
+ required=True,
+ help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).",
+ )
+ parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
+ parser.add_argument(
+ "--sample",
+ action="store_true",
+ default=False,
+        help="If set, export the vae encoder using latent_dist.sample(); otherwise latent_dist.mode() is used.",
+ )
+ parser.add_argument(
+ "--height",
+ type=int,
+ default=None,
+ help="The height of output images. Default: None",
+ )
+ parser.add_argument(
+ "--width",
+ type=int,
+ default=None,
+ help="The width of output images. Default: None",
+ )
+ args = parser.parse_args()
+
+ convert_ppdiffusers_pipeline_to_paddleinfer_pipeline(
+ args.pretrained_model_name_or_path,
+ args.output_path,
+ args.sample,
+ args.height,
+ args.width,
+ )
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ad63f98f50ed7cdd9f4f9c23476db3346fff131
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer.py
@@ -0,0 +1,408 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+
+# isort: split
+import paddle
+import paddle.inference as paddle_infer
+
+# isort: split
+import numpy as np
+from paddlenlp.trainer.argparser import strtobool
+from tqdm.auto import trange
+
+from ppdiffusers import ( # noqa
+ DiffusionPipeline,
+ PaddleInferStableDiffusionMegaPipeline,
+)
+from ppdiffusers.utils import load_image
+
+
+def parse_arguments():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--model_dir",
+ default="runwayml/stable-diffusion-v1-5@paddleinfer",
+ help="The model directory of diffusion_model.",
+ )
+ parser.add_argument(
+ "--inference_steps",
+ type=int,
+ default=50,
+ help="The number of unet inference steps.",
+ )
+ parser.add_argument(
+ "--benchmark_steps",
+ type=int,
+ default=10,
+ help="The number of performance benchmark steps.",
+ )
+ parser.add_argument(
+ "--backend",
+ type=str,
+ default="paddle_tensorrt",
+ choices=["paddle", "paddle_tensorrt"],
+ help="The inference runtime backend of unet model and text encoder model.",
+ )
+ parser.add_argument(
+ "--device",
+ type=str,
+ default="gpu",
+ choices=[
+ "cpu",
+ "gpu",
+ "huawei_ascend_npu",
+ "kunlunxin_xpu",
+ ],
+ help="The inference runtime device of models.",
+ )
+ parser.add_argument(
+ "--task_name",
+ type=str,
+ default="text2img",
+ choices=[
+ "text2img",
+ "img2img",
+ "inpaint_legacy",
+ "all",
+ ],
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
+ )
+ parser.add_argument(
+ "--parse_prompt_type",
+ type=str,
+ default="lpw",
+ choices=[
+ "raw",
+ "lpw",
+ ],
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
+ )
+    parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
+ parser.add_argument(
+ "--scheduler",
+ type=str,
+ default="preconfig-euler-ancestral",
+ choices=[
+ "pndm",
+ "lms",
+ "euler",
+ "euler-ancestral",
+ "preconfig-euler-ancestral",
+ "dpm-multi",
+ "dpm-single",
+ "unipc-multi",
+ "ddim",
+ "ddpm",
+ "deis-multi",
+ "heun",
+ "kdpm2-ancestral",
+ "kdpm2",
+ ],
+ help="The scheduler type of stable diffusion.",
+ )
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
+ parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image")
+ parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image")
+ parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?")
+ parser.add_argument(
+ "--tune",
+ type=strtobool,
+ default=False,
+ help="Whether to tune the shape of tensorrt engine.",
+ )
+
+ return parser.parse_args()
+
+
+def create_paddle_inference_runtime(
+ model_dir="",
+ model_name="",
+ use_trt=False,
+ precision_mode=paddle_infer.PrecisionType.Half,
+ device_id=0,
+ disable_paddle_trt_ops=[],
+ disable_paddle_pass=[],
+ workspace=24 * 1024 * 1024 * 1024,
+ tune=False,
+):
+ config = paddle_infer.Config()
+ config.enable_memory_optim()
+ shape_file = f"{model_dir}/{model_name}/shape_range_info.pbtxt"
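+    # when tune=True, collect dynamic shape ranges into shape_file; a later TensorRT run reuses them via enable_tuned_tensorrt_dynamic_shape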
+ if tune:
+ config.collect_shape_range_info(shape_file)
+ config.switch_ir_optim(False)
+ else:
+ config.enable_new_executor()
+ if str(os.environ.get("FLAGS_enable_pir_in_executor")).lower() in ("true", "1"):
+ config.enable_new_ir()
+ if str(os.environ.get("FLAGS_use_cinn")).lower() in ("true", "1"):
+ config.enable_cinn()
+
+ if device_id != -1:
+ config.use_gpu()
+ config.enable_use_gpu(memory_pool_init_size_mb=2000, device_id=device_id, precision_mode=precision_mode)
+ for pass_name in disable_paddle_pass:
+ config.delete_pass(pass_name)
+ if use_trt:
+ config.enable_tensorrt_engine(
+ workspace_size=workspace,
+ precision_mode=precision_mode,
+ max_batch_size=1,
+ min_subgraph_size=3,
+ use_static=True,
+ )
+ config.enable_tensorrt_memory_optim()
+ config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
+ cache_file = os.path.join(model_dir, model_name, "_opt_cache/")
+ config.set_optim_cache_dir(cache_file)
+ if precision_mode != paddle_infer.PrecisionType.Half:
+ only_fp16_passes = [
+ "trt_cross_multihead_matmul_fuse_pass",
+ "trt_flash_multihead_matmul_fuse_pass",
+ "preln_elementwise_groupnorm_act_pass",
+ "elementwise_groupnorm_act_pass",
+ ]
+ for curr_pass in only_fp16_passes:
+ config.delete_pass(curr_pass)
+ return config
+
+
+def main(args):
+ if args.device_id == -1:
+ paddle.set_device("cpu")
+ else:
+ paddle.set_device(f"gpu:{args.device_id}")
+
+ seed = 1024
+ min_image_size = 512
+ max_image_size = 768
+ max_image_size = max(min_image_size, max_image_size)
+
+    # init paddle inference runtime configs
+ only_fp16_passes = [
+ "trt_cross_multihead_matmul_fuse_pass",
+ "trt_flash_multihead_matmul_fuse_pass",
+ "preln_elementwise_groupnorm_act_pass",
+ "elementwise_groupnorm_act_pass",
+ ]
+ no_need_passes = [
+ "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass",
+ "add_support_int8_pass",
+ "elementwise_groupnorm_act_pass",
+ "groupnorm_act_pass",
+ "preln_elementwise_groupnorm_act_pass",
+ ]
+ paddle_delete_passes = dict(
+ text_encoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
+ text_encoder_2=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
+ vae_encoder=only_fp16_passes + [] if args.use_fp16 else [],
+ vae_decoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
+ unet=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
+ image_encoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes,
+ )
+ args.use_trt = args.backend == "paddle_tensorrt"
+ precision_mode = paddle_infer.PrecisionType.Half if args.use_fp16 else paddle_infer.PrecisionType.Float32
+ infer_configs = dict(
+ text_encoder=create_paddle_inference_runtime(
+ model_dir=args.model_dir,
+ use_trt=False,
+ model_name="text_encoder",
+ precision_mode=paddle_infer.PrecisionType.Half,
+ device_id=args.device_id,
+ disable_paddle_trt_ops=["range", "lookup_table_v2"],
+ disable_paddle_pass=paddle_delete_passes.get("text_encoder", []),
+ tune=False,
+ ),
+ vae_encoder=create_paddle_inference_runtime(
+ model_dir=args.model_dir,
+ model_name="vae_encoder",
+ use_trt=False,
+ precision_mode=paddle_infer.PrecisionType.Half,
+ device_id=args.device_id,
+ disable_paddle_pass=paddle_delete_passes.get("vae_encoder", []),
+ tune=False,
+ ),
+ vae_decoder=create_paddle_inference_runtime(
+ model_dir=args.model_dir,
+ model_name="vae_decoder",
+ use_trt=False,
+ precision_mode=paddle_infer.PrecisionType.Float32,
+ device_id=args.device_id,
+ disable_paddle_pass=paddle_delete_passes.get("vae_decoder", []),
+ tune=False,
+ ),
+ unet=create_paddle_inference_runtime(
+ model_dir=args.model_dir,
+ model_name="unet",
+ use_trt=args.use_trt,
+ precision_mode=precision_mode,
+ device_id=args.device_id,
+ disable_paddle_pass=no_need_passes,
+ tune=args.tune,
+ ),
+ )
+ pipe = PaddleInferStableDiffusionMegaPipeline.from_pretrained(
+ args.model_dir,
+ infer_configs=infer_configs,
+ use_optim_cache=False,
+ )
+ pipe.set_progress_bar_config(disable=False)
+ pipe.change_scheduler(args.scheduler)
+ parse_prompt_type = args.parse_prompt_type
+ width = args.width
+ height = args.height
+
+ folder = f"results-{args.backend}"
+ os.makedirs(folder, exist_ok=True)
+ if args.task_name in ["text2img", "all"]:
+ # text2img
+ prompt = "a photo of an astronaut riding a horse on mars"
+ time_costs = []
+ # warmup
+ pipe.text2img(
+ prompt,
+ num_inference_steps=20,
+ height=height,
+ width=width,
+ # parse_prompt_type=parse_prompt_type,
+ )
+ print("==> Test text2img performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ paddle.seed(seed)
+ images = pipe.text2img(
+ prompt,
+ output_type="pil",
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ # parse_prompt_type=parse_prompt_type,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+ images[0].save(f"{folder}/text2img.png")
+
+ if args.task_name in ["img2img", "all"]:
+ # img2img
+ img_url = (
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
+ )
+ init_image = load_image(img_url)
+ prompt = "A fantasy landscape, trending on artstation"
+ time_costs = []
+ # warmup
+ pipe.img2img(
+ prompt,
+ image=init_image,
+ num_inference_steps=20,
+ height=height,
+ width=width,
+ strength=args.strength,
+ # parse_prompt_type=parse_prompt_type,
+ )
+ print("==> Test img2img performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ paddle.seed(seed)
+ images = pipe.img2img(
+ prompt,
+ image=init_image,
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ strength=args.strength,
+ # parse_prompt_type=parse_prompt_type,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+ images[0].save(f"{folder}/img2img.png")
+
+ if args.task_name in ["inpaint", "inpaint_legacy", "all"]:
+ img_url = (
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
+ )
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
+ init_image = load_image(img_url)
+ mask_image = load_image(mask_url)
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+ time_costs = []
+ # warmup
+ if args.task_name in ["inpaint_legacy", "all"]:
+ call_fn = pipe.inpaint_legacy
+ task_name = "inpaint_legacy"
+ else:
+ call_fn = pipe.inpaint
+ task_name = "inpaint"
+ call_fn(
+ prompt,
+ image=init_image,
+ mask_image=mask_image,
+ num_inference_steps=20,
+ height=height,
+ width=width,
+ strength=args.strength,
+ parse_prompt_type=parse_prompt_type,
+ )
+ print(f"==> Test {task_name} performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ paddle.seed(seed)
+ images = call_fn(
+ prompt,
+ image=init_image,
+ mask_image=mask_image,
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ strength=args.strength,
+ parse_prompt_type=parse_prompt_type,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+
+ images[0].save(f"{folder}/{task_name}.png")
+
+
+if __name__ == "__main__":
+ args = parse_arguments()
+ main(args)
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_paddle.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_paddle.py
new file mode 100644
index 0000000000000000000000000000000000000000..06ffde0f7ddd1b75c3ada2a5f62c8e6165ae9056
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_paddle.py
@@ -0,0 +1,357 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+import warnings
+
+import cv2
+import numpy as np
+import paddle
+from PIL import Image
+from tqdm.auto import trange
+
+from ppdiffusers import (
+ DDIMScheduler,
+ DDPMScheduler,
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ KDPM2AncestralDiscreteScheduler,
+ KDPM2DiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionImg2ImgPipeline,
+ StableDiffusionInpaintPipeline,
+ StableDiffusionPipeline,
+ UniPCMultistepScheduler,
+)
+from ppdiffusers.utils import load_image
+
+
+def get_canny_image(image, args):
+ if isinstance(image, Image.Image):
+ image = np.array(image)
+ image = cv2.Canny(image, args.low_threshold, args.high_threshold)
+ image = image[:, :, None]
+ image = np.concatenate([image, image, image], axis=2)
+ canny_image = Image.fromarray(image)
+ return canny_image
+
+
+def strtobool(v):
+ if isinstance(v, bool):
+ return v
+ if v.lower() in ("yes", "true", "t", "y", "1"):
+ return True
+ elif v.lower() in ("no", "false", "f", "n", "0"):
+ return False
+ else:
+ raise ValueError(
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
+ )
+
+
+def change_scheduler(self, scheduler_type="ddim"):
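+    # rebuild a scheduler of the requested type from the pipeline's original scheduler config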
+ self.orginal_scheduler_config = self.scheduler.config
+ scheduler_type = scheduler_type.lower()
+ if scheduler_type == "pndm":
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
+ elif scheduler_type == "lms":
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "heun":
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "euler":
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "euler-ancestral":
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "dpm-multi":
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "dpm-single":
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "kdpm2-ancestral":
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "kdpm2":
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "unipc-multi":
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "ddim":
+ scheduler = DDIMScheduler.from_config(
+ self.orginal_scheduler_config,
+ steps_offset=1,
+ clip_sample=False,
+ set_alpha_to_one=False,
+ )
+ elif scheduler_type == "ddpm":
+ scheduler = DDPMScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
+ elif scheduler_type == "deis-multi":
+ scheduler = DEISMultistepScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
+ else:
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+ return scheduler
+
+
+def parse_arguments():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--pretrained_model_name_or_path",
+ type=str,
+ default="runwayml/stable-diffusion-v1-5",
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
+ )
+ parser.add_argument(
+ "--inference_steps",
+ type=int,
+ default=50,
+ help="The number of unet inference steps.",
+ )
+ parser.add_argument(
+ "--benchmark_steps",
+ type=int,
+ default=10,
+ help="The number of performance benchmark steps.",
+ )
+ parser.add_argument(
+ "--task_name",
+ type=str,
+ default="all",
+ choices=[
+ "text2img",
+ "img2img",
+ "inpaint_legacy",
+ "all",
+ ],
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
+ )
+ parser.add_argument(
+ "--parse_prompt_type",
+ type=str,
+ default="raw",
+ choices=[
+ "raw",
+ "lpw",
+ ],
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
+ )
+    parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
+ parser.add_argument(
+ "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type."
+ )
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
+ parser.add_argument(
+ "--scheduler",
+ type=str,
+ default="euler-ancestral",
+ choices=[
+ "pndm",
+ "lms",
+ "euler",
+ "euler-ancestral",
+ "dpm-multi",
+ "dpm-single",
+ "unipc-multi",
+ "ddim",
+ "ddpm",
+ "deis-multi",
+ "heun",
+ "kdpm2-ancestral",
+ "kdpm2",
+ ],
+ help="The scheduler type of stable diffusion.",
+ )
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
+ return parser.parse_args()
+
+
+def main(args):
+
+ seed = 1024
+ paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32
+ pipe = StableDiffusionPipeline.from_pretrained(
+ args.pretrained_model_name_or_path,
+ safety_checker=None,
+ feature_extractor=None,
+ requires_safety_checker=False,
+ paddle_dtype=paddle_dtype,
+ )
+ scheduler = change_scheduler(pipe, args.scheduler)
+ pipe.scheduler = scheduler
+
+ if args.attention_type == "all":
+ args.attention_type = ["raw", "cutlass", "flash"]
+ else:
+ args.attention_type = [args.attention_type]
+
+ for attention_type in args.attention_type:
+ if attention_type == "raw":
+ pipe.disable_xformers_memory_efficient_attention()
+ else:
+ try:
+ pipe.enable_xformers_memory_efficient_attention(attention_type)
+ except Exception as e:
+ if attention_type == "flash":
+ warnings.warn(
+                        "Attention type flash is not supported on your GPU! It requires a GPU such as a 3060/3070/3080/3090/4060/4070/4080/4090/A30/A100."
+ )
+ continue
+ else:
+ raise ValueError(e)
+
+ if not args.use_fp16 and attention_type == "flash":
+            print("Flash attention does not support dtype=float32! Please use float16 or bfloat16. Skipping this attention type.")
+ continue
+
+ width = args.width
+ height = args.height
+ pipe.set_progress_bar_config(disable=False)
+
+ folder = f"paddle_attn_{attention_type}_fp16" if args.use_fp16 else f"paddle_attn_{attention_type}_fp32"
+ os.makedirs(folder, exist_ok=True)
+ if args.task_name in ["text2img", "all"]:
+            # text2img
+ prompt = "bird"
+ time_costs = []
+ # warmup
+ pipe(
+ prompt,
+ num_inference_steps=10,
+ height=height,
+ width=width,
+ )
+ print("==> Test text2img performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ paddle.seed(seed)
+ images = pipe(
+ prompt,
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Attention type: {attention_type}, "
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+ images[0].save(f"{folder}/text2img.png")
+
+ if args.task_name in ["img2img", "all"]:
+ pipe_img2img = StableDiffusionImg2ImgPipeline(**pipe.components)
+ pipe_img2img.set_progress_bar_config(disable=False)
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
+ init_image = load_image(img_url).resize((width, height))
+ prompt = "A fantasy landscape, trending on artstation"
+ time_costs = []
+ # warmup
+ pipe_img2img(
+ prompt,
+ image=init_image,
+ num_inference_steps=20,
+ height=height,
+ width=width,
+ strength=args.strength,
+ )
+ print("==> Test img2img performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ paddle.seed(seed)
+ images = pipe_img2img(
+ prompt,
+ image=init_image,
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ strength=args.strength,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Attention type: {attention_type}, "
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+ images[0].save(f"{folder}/img2img.png")
+
+ if args.task_name in ["inpaint_legacy", "all"]:
+ pipe_inpaint = StableDiffusionInpaintPipeline(**pipe.components)
+ pipe_inpaint.set_progress_bar_config(disable=False)
+ img_url = (
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
+ )
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
+ init_image = load_image(img_url).resize((width, height))
+ mask_image = load_image(mask_url).resize((width, height))
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+ time_costs = []
+ task_name = "inpaint_legacy"
+ pipe_inpaint(
+ prompt,
+ image=init_image,
+ mask_image=mask_image,
+ num_inference_steps=20,
+ height=height,
+ width=width,
+ strength=args.strength,
+ )
+ print(f"==> Test {task_name} performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ paddle.seed(seed)
+ images = pipe_inpaint(
+ prompt,
+ image=init_image,
+ mask_image=mask_image,
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ strength=args.strength,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Attention type: {attention_type}, "
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+ images[0].save(f"{folder}/{task_name}.png")
+
+
+if __name__ == "__main__":
+ args = parse_arguments()
+ main(args)
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_torch.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_torch.py
new file mode 100644
index 0000000000000000000000000000000000000000..febc46610eca3d524d182c8bc39495a202fdaaca
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd15/infer_dygraph_torch.py
@@ -0,0 +1,417 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+
+import torch
+
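+# Stash PyTorch's native scaled_dot_product_attention and remove it so diffusers falls back to the
+# "raw" attention processor; the "sdp" branch below restores it before benchmarking.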
+torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention
+delattr(torch.nn.functional, "scaled_dot_product_attention")
+
+import cv2
+import numpy as np
+from diffusers import (
+ DDIMScheduler,
+ DDPMScheduler,
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ KDPM2AncestralDiscreteScheduler,
+ KDPM2DiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusionImg2ImgPipeline,
+ StableDiffusionInpaintPipeline,
+ StableDiffusionPipeline,
+ UniPCMultistepScheduler,
+)
+from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0
+from diffusers.utils import load_image
+from PIL import Image
+from tqdm.auto import trange
+
+
+def get_canny_image(image, args):
+ if isinstance(image, Image.Image):
+ image = np.array(image)
+ image = cv2.Canny(image, args.low_threshold, args.high_threshold)
+ image = image[:, :, None]
+ image = np.concatenate([image, image, image], axis=2)
+ canny_image = Image.fromarray(image)
+ return canny_image
+
+
+def strtobool(v):
+ if isinstance(v, bool):
+ return v
+ if v.lower() in ("yes", "true", "t", "y", "1"):
+ return True
+ elif v.lower() in ("no", "false", "f", "n", "0"):
+ return False
+ else:
+ raise ValueError(
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
+ )
+
+
+def change_scheduler(self, scheduler_type="ddim"):
+ self.orginal_scheduler_config = self.scheduler.config
+ scheduler_type = scheduler_type.lower()
+ if scheduler_type == "pndm":
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
+ elif scheduler_type == "lms":
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "heun":
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "euler":
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "euler-ancestral":
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "dpm-multi":
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "dpm-single":
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "kdpm2-ancestral":
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "kdpm2":
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "unipc-multi":
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "ddim":
+ scheduler = DDIMScheduler.from_config(
+ self.orginal_scheduler_config,
+ steps_offset=1,
+ clip_sample=False,
+ set_alpha_to_one=False,
+ )
+ elif scheduler_type == "ddpm":
+ scheduler = DDPMScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
+ elif scheduler_type == "deis-multi":
+ scheduler = DEISMultistepScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
+ else:
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+ return scheduler
+
+
+def parse_arguments():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--pretrained_model_name_or_path",
+ type=str,
+ default="runwayml/stable-diffusion-v1-5",
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
+ )
+ parser.add_argument(
+ "--inference_steps",
+ type=int,
+ default=50,
+ help="The number of unet inference steps.",
+ )
+ parser.add_argument(
+ "--benchmark_steps",
+ type=int,
+ default=10,
+ help="The number of performance benchmark steps.",
+ )
+ parser.add_argument(
+ "--task_name",
+ type=str,
+ default="all",
+ choices=[
+ "text2img",
+ "img2img",
+ "inpaint_legacy",
+ "all",
+ ],
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
+ )
+ parser.add_argument(
+ "--parse_prompt_type",
+ type=str,
+ default="raw",
+ choices=[
+ "raw",
+ "lpw",
+ ],
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
+ )
+ parser.add_argument(
+ "--channels_last",
+ type=strtobool,
+ default=False,
+        help="Whether to use channels_last",
+ )
+    parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
+    parser.add_argument("--tf32", type=strtobool, default=True, help="Whether to allow TF32 matmul")
+    parser.add_argument("--compile", type=strtobool, default=False, help="Whether to run torch.compile on the unet")
+ parser.add_argument(
+ "--attention_type",
+ type=str,
+ default="sdp",
+ choices=[
+ "raw",
+ "sdp",
+ ],
+ help="attention_type.",
+ )
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
+ parser.add_argument(
+ "--scheduler",
+ type=str,
+ default="euler-ancestral",
+ choices=[
+ "pndm",
+ "lms",
+ "euler",
+ "euler-ancestral",
+ "dpm-multi",
+ "dpm-single",
+ "unipc-multi",
+ "ddim",
+ "ddpm",
+ "deis-multi",
+ "heun",
+ "kdpm2-ancestral",
+ "kdpm2",
+ ],
+ help="The scheduler type of stable diffusion.",
+ )
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
+ return parser.parse_args()
+
+
+def attn_processors(self):
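+    # Recursively collect every attention processor registered on this module and its submodules.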
+ processors = {}
+
+ def fn_recursive_add_processors(name: str, module, processors):
+ if hasattr(module, "set_processor"):
+ processors[f"{name}.processor"] = module.processor
+
+ for sub_name, child in module.named_children():
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+ return processors
+
+ for name, module in self.named_children():
+ fn_recursive_add_processors(name, module, processors)
+
+ return processors
+
+
+def set_attn_processor(self, processor):
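+    # Recursively set the attention processor(s) on this module; accepts a single processor instance
+    # or a dict keyed by the names returned from attn_processors().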
+ count = len(attn_processors(self).keys())
+
+ if isinstance(processor, dict) and len(processor) != count:
+ raise ValueError(
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+ )
+
+ def fn_recursive_attn_processor(name: str, module, processor):
+ if hasattr(module, "set_processor"):
+ if not isinstance(processor, dict):
+ module.set_processor(processor)
+ else:
+ module.set_processor(processor.pop(f"{name}.processor"))
+
+ for sub_name, child in module.named_children():
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+ for name, module in self.named_children():
+ fn_recursive_attn_processor(name, module, processor)
+
+
+def main(args):
+ if args.tf32:
+ torch.backends.cuda.matmul.allow_tf32 = True
+ else:
+ torch.backends.cuda.matmul.allow_tf32 = False
+
+ seed = 1024
+ torch_dtype = torch.float16 if args.use_fp16 else torch.float32
+ pipe = StableDiffusionPipeline.from_pretrained(
+ args.pretrained_model_name_or_path,
+ safety_checker=None,
+ feature_extractor=None,
+ requires_safety_checker=False,
+ torch_dtype=torch_dtype,
+ )
+ scheduler = change_scheduler(pipe, args.scheduler)
+ pipe.scheduler = scheduler
+ if args.device_id >= 0:
+ pipe.to(f"cuda:{args.device_id}")
+
+ if args.attention_type == "all":
+ args.attention_type = ["raw", "sdp"]
+ else:
+ args.attention_type = [args.attention_type]
+
+ for attention_type in args.attention_type:
+        attn_processor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0
+        if attention_type == "sdp":
+            torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_
+        set_attn_processor(pipe.unet, attn_processor_cls())
+        set_attn_processor(pipe.vae, attn_processor_cls())
+
+ if args.channels_last:
+ pipe.unet.to(memory_format=torch.channels_last)
+
+ if args.compile:
+ print("Run torch compile")
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+ width = args.width
+ height = args.height
+ pipe.set_progress_bar_config(disable=False)
+
+ folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32"
+ os.makedirs(folder, exist_ok=True)
+ if args.task_name in ["text2img", "all"]:
+ init_image = load_image(
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
+ )
+ # text2img
+ prompt = "bird"
+ time_costs = []
+ # warmup
+ pipe(
+ prompt,
+ num_inference_steps=10,
+ height=height,
+ width=width,
+ )
+ print("==> Test text2img performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ torch.cuda.manual_seed(seed)
+ images = pipe(
+ prompt,
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Attention type: {attention_type}, "
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+ images[0].save(f"{folder}/text2img.png")
+
+ if args.task_name in ["img2img", "all"]:
+ pipe_img2img = StableDiffusionImg2ImgPipeline(**pipe.components)
+ pipe_img2img.set_progress_bar_config(disable=False)
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
+ init_image = load_image(img_url).resize((width, height))
+ prompt = "A fantasy landscape, trending on artstation"
+ time_costs = []
+ # warmup
+ pipe_img2img(
+ prompt,
+ image=init_image,
+ num_inference_steps=20,
+ height=height,
+ width=width,
+ strength=args.strength,
+ )
+ print("==> Test img2img performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ torch.cuda.manual_seed(seed)
+ images = pipe_img2img(
+ prompt,
+ image=init_image,
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ strength=args.strength,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Attention type: {attention_type}, "
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+ images[0].save(f"{folder}/img2img.png")
+
+ if args.task_name in ["inpaint_legacy", "all"]:
+ pipe_inpaint = StableDiffusionInpaintPipeline(**pipe.components)
+ pipe_inpaint.set_progress_bar_config(disable=False)
+ img_url = (
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
+ )
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
+ init_image = load_image(img_url).resize((width, height))
+ mask_image = load_image(mask_url).resize((width, height))
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+ time_costs = []
+ task_name = "inpaint_legacy"
+ pipe_inpaint(
+ prompt,
+ image=init_image,
+ mask_image=mask_image,
+ num_inference_steps=20,
+ height=height,
+ width=width,
+ strength=args.strength,
+ )
+ print(f"==> Test {task_name} performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ torch.cuda.manual_seed(seed)
+ images = pipe_inpaint(
+ prompt,
+ image=init_image,
+ mask_image=mask_image,
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ strength=args.strength,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Attention type: {attention_type}, "
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+ images[0].save(f"{folder}/{task_name}.png")
+
+
+if __name__ == "__main__":
+ args = parse_arguments()
+ main(args)
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/README.md b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2804832d904d250bf1806d52c6f285f0652555d
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/README.md
@@ -0,0 +1,77 @@
+# Stable Diffusion 3 High-Performance Inference
+
+- Paddle Inference provides a high-performance inference implementation of the Stable Diffusion 3 model, delivering a 70%+ inference speedup.
+
+Environment setup:
+```shell
+# Install triton and make it compatible with paddle
+python -m pip install triton
+python -m pip install git+https://github.com/zhoutianzi666/UseTritonInPaddle.git
+python -c "import use_triton_in_paddle; use_triton_in_paddle.make_triton_compatible_with_paddle()"
+
+# Install the develop (nightly) build of paddle; pick the wheel matching your CUDA version (CUDA 12.3 here)
+python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu123/
+
+# Install the paddlemix package to use the custom operators integrated in it
+python -m pip install paddlemix
+
+# Point LD_LIBRARY_PATH at libCutlassGemmEpilogue.so
+# See https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/README.md for details
+export LD_LIBRARY_PATH=/your_dir/Paddle/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/build:$LD_LIBRARY_PATH
+```
+- Note: this step lets static-graph inference use the fused Cutlass operators for better performance, but it is optional.
+If you do not use Cutlass, set `exp_enable_use_cutlass` to False in `./text_to_image_generation-stable_diffusion_3.py`.
+
+High-performance inference command:
+```shell
+# Run FP16 inference
+python text_to_image_generation-stable_diffusion_3.py --dtype float16 --height 512 --width 512 \
+--num-inference-steps 50 --inference_optimize 1 \
+--benchmark 1
+```
+Note: `--inference_optimize 1` enables the inference optimizations and `--benchmark 1` enables the performance benchmark.
+
+
+- Performance measured on an NVIDIA A100-SXM4-40GB:
+
+| Paddle Inference | PyTorch | Paddle dygraph |
+| --------------- | ------------ | ------------ |
+| 1.2 s | 1.78 s | 4.202 s |
+
+
+## Multi-GPU Inference for the Paddle Stable Diffusion 3 Model
+### How Data Parallel Works
+- In SD3, a single prompt with CFG requires generating both the unconditional and the text-guided branches, so the MM-DiT blocks see an input with batch_size=2.
+In the multi-GPU scheme we therefore split this batch of 2 across two cards, halving the floating-point computation each card has to carry.
+Once both halves are computed, the results from the two cards are gathered back together, and the output is identical to the single-card computation (see the sketch below).
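+
+A minimal, hypothetical sketch of this idea follows (it is not the actual PaddleMIX implementation; the tensor shapes and the stand-in forward pass are made up). Each rank keeps one half of the CFG batch and `all_gather` restores the full batch afterwards:
+
+```python
+# Hypothetical data-parallel sketch, assuming two GPUs launched via
+#   python -m paddle.distributed.launch --gpus "0,1" this_script.py
+import paddle
+import paddle.distributed as dist
+
+dist.init_parallel_env()
+rank = dist.get_rank()
+
+# CFG batch of 2: row 0 = unconditional branch, row 1 = text-guided branch
+latents = paddle.randn([2, 16, 128, 128])
+
+local_in = latents[rank:rank + 1]           # each card keeps one sample -> half the FLOPs
+local_out = local_in * 2.0                  # placeholder for the per-card MM-DiT forward pass
+
+gathered = []
+dist.all_gather(gathered, local_out)        # collect both halves again
+full_out = paddle.concat(gathered, axis=0)  # same result as running batch_size=2 on one card
+```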
+
+### How Model Parallel Works
+- In SD3, the Linear and Attention layers contain a large number of GEMMs (General Matrix Multiplies); when generating high-resolution images the GEMM computation grows rapidly, and these layers also hold most of the model's pretrained weights.
+In the multi-GPU scheme we therefore split these GEMMs across two cards, so each card holds half the weights and performs half the computation, which lowers both the floating-point load and the memory footprint per card (see the sketch below).
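+
+A minimal, hypothetical sketch of the column-split idea (the shapes are made up; the real implementation shards the pipeline's Linear layers across devices):
+
+```python
+# Hypothetical model-parallel sketch: split a Linear weight by output columns
+# across two shards; concatenating the partial GEMMs reproduces the full GEMM.
+import paddle
+
+paddle.seed(42)
+x = paddle.randn([4, 1024])             # activations entering a Linear in an MM-DiT block
+w = paddle.randn([1024, 4096])          # full weight: 4096 output features
+
+w0, w1 = paddle.split(w, 2, axis=1)     # each shard stores half the weight
+y0 = paddle.matmul(x, w0)               # computed on card 0
+y1 = paddle.matmul(x, w1)               # computed on card 1 (simulated here on one device)
+
+y = paddle.concat([y0, y1], axis=1)     # gather along the feature dimension
+print(bool(paddle.allclose(y, paddle.matmul(x, w), atol=1e-4)))  # True
+```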
+
+### Enabling Multi-GPU Inference
+- Paddle Inference provides multi-GPU inference for the SD3 model: set `mp_size 2` to enable Model Parallel and `dp_size 2` to enable Data Parallel.
+Use `python -m paddle.distributed.launch --gpus "0,1,2,3"` to choose the cards used for inference, where `--gpus "0,1,2,3"` lists the GPU ids to enable.
+If two-card inference is enough, simply specify two cards, e.g. `python -m paddle.distributed.launch --gpus "0,1"`, together with the parallel method and degree, e.g. `mp_size 2` or `dp_size 2`.
+
+- Note that `mp_size` must not exceed the input batch_size, and `mp_size` × `dp_size` must not exceed the total number of GPUs in the machine.
+- High-performance multi-GPU inference command:
+```shell
+# Run multi-GPU inference
+python -m paddle.distributed.launch --gpus "0,1,2,3" text_to_image_generation-stable_diffusion_3.py \
+--dtype float16 \
+--height 1024 \
+--width 1024 \
+--num-inference-steps 20 \
+--inference_optimize 1 \
+--mp_size 2 \
+--dp_size 2 \
+--benchmark 1
+```
+Note: `--inference_optimize 1` enables the inference optimizations and `--benchmark 1` enables the performance benchmark.
+
+## Performance Measured on an NVIDIA A800-SXM4-80GB
+
+| Paddle mp_size=2 & dp_size=2 | Paddle mp_size=2 | Paddle dp_size=2 | Paddle Single Card | Paddle dygraph |
+| ---------------------------- | ---------------- | ---------------- | ------------------ | -------------- |
+| 0.99 s | 1.581 s | 1.319 s | 2.376 s | 3.2 s |
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py
new file mode 100644
index 0000000000000000000000000000000000000000..14d1f5f24683cf98ff48ce2978666e3e7f91fb5d
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py
@@ -0,0 +1,264 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+import warnings
+
+import cv2
+import numpy as np
+import paddle
+from PIL import Image
+from tqdm.auto import trange
+
+from ppdiffusers import (
+ FlowMatchEulerDiscreteScheduler,
+ DDIMScheduler,
+ DDPMScheduler,
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ KDPM2AncestralDiscreteScheduler,
+ KDPM2DiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusion3Pipeline,
+ UniPCMultistepScheduler,
+)
+from ppdiffusers.utils import load_image
+
+
+
+def strtobool(v):
+ if isinstance(v, bool):
+ return v
+ if v.lower() in ("yes", "true", "t", "y", "1"):
+ return True
+ elif v.lower() in ("no", "false", "f", "n", "0"):
+ return False
+ else:
+ raise ValueError(
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
+ )
+
+
+def change_scheduler(self, scheduler_type="ddim"):
+ self.orginal_scheduler_config = self.scheduler.config
+ scheduler_type = scheduler_type.lower()
+ if scheduler_type == "flow":
+        scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "pndm":
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
+ elif scheduler_type == "lms":
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "heun":
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "euler":
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "euler-ancestral":
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "dpm-multi":
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "dpm-single":
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "kdpm2-ancestral":
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "kdpm2":
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "unipc-multi":
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "ddim":
+ scheduler = DDIMScheduler.from_config(
+ self.orginal_scheduler_config,
+ steps_offset=1,
+ clip_sample=False,
+ set_alpha_to_one=False,
+ )
+ elif scheduler_type == "ddpm":
+ scheduler = DDPMScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
+ elif scheduler_type == "deis-multi":
+ scheduler = DEISMultistepScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
+ else:
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+ return scheduler
+
+
+def parse_arguments():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--pretrained_model_name_or_path",
+ type=str,
+ default="stabilityai/stable-diffusion-3-medium-diffusers",
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
+ )
+ parser.add_argument(
+ "--inference_steps",
+ type=int,
+ default=50,
+        help="The number of inference steps for the diffusion transformer.",
+ )
+ parser.add_argument(
+ "--benchmark_steps",
+ type=int,
+ default=10,
+ help="The number of performance benchmark steps.",
+ )
+ parser.add_argument(
+ "--task_name",
+ type=str,
+ default="all",
+ choices=[
+ "text2img",
+ "img2img",
+ "inpaint_legacy",
+ "all",
+ ],
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
+ )
+ parser.add_argument(
+ "--parse_prompt_type",
+ type=str,
+ default="raw",
+ choices=[
+ "raw",
+ "lpw",
+ ],
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
+ )
+    parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
+ parser.add_argument(
+ "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type."
+ )
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
+ parser.add_argument(
+ "--scheduler",
+ type=str,
+ default="euler-ancestral",
+ choices=[
+ "flow",
+ "pndm",
+ "lms",
+ "euler",
+ "euler-ancestral",
+ "dpm-multi",
+ "dpm-single",
+ "unipc-multi",
+ "ddim",
+ "ddpm",
+ "deis-multi",
+ "heun",
+ "kdpm2-ancestral",
+ "kdpm2",
+ ],
+ help="The scheduler type of stable diffusion.",
+ )
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
+ return parser.parse_args()
+
+
+def main(args):
+
+ seed = 1024
+ paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32
+ pipe = StableDiffusion3Pipeline.from_pretrained(
+ args.pretrained_model_name_or_path,
+ safety_checker=None,
+ feature_extractor=None,
+ requires_safety_checker=False,
+ paddle_dtype=paddle_dtype,
+ )
+ scheduler = change_scheduler(pipe, args.scheduler)
+ pipe.scheduler = scheduler
+
+ if args.attention_type == "all":
+ args.attention_type = ["raw", "cutlass", "flash"]
+ else:
+ args.attention_type = [args.attention_type]
+
+ for attention_type in args.attention_type:
+ if attention_type == "raw":
+ pipe.disable_xformers_memory_efficient_attention()
+ else:
+ try:
+ pipe.enable_xformers_memory_efficient_attention(attention_type)
+ except Exception as e:
+ if attention_type == "flash":
+ warnings.warn(
+                        "Attention type flash is not supported on your GPU! It requires a GPU such as the RTX 3060/3070/3080/3090, RTX 4060/4070/4080/4090, A30, or A100."
+ )
+ continue
+ else:
+ raise ValueError(e)
+
+ if not args.use_fp16 and attention_type == "flash":
+            print("Flash attention does not support dtype=float32! Please use float16 or bfloat16. Skipping this attention type.")
+ continue
+
+ width = args.width
+ height = args.height
+ pipe.set_progress_bar_config(disable=False)
+
+ folder = f"paddle_attn_{attention_type}_fp16" if args.use_fp16 else f"paddle_attn_{attention_type}_fp32"
+ os.makedirs(folder, exist_ok=True)
+ if args.task_name in ["text2img", "all"]:
+ init_image = load_image(
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
+ )
+ # text2img
+ prompt = "bird"
+ time_costs = []
+ # warmup
+ pipe(
+ prompt,
+ num_inference_steps=10,
+ height=height,
+ width=width,
+ )
+ print("==> Test text2img performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ paddle.seed(seed)
+ images = pipe(
+ prompt,
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Attention type: {attention_type}, "
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+ images[0].save(f"{folder}/text2img.png")
+
+
+if __name__ == "__main__":
+ args = parse_arguments()
+ main(args)
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_torch.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_torch.py
new file mode 100644
index 0000000000000000000000000000000000000000..14c547b5605833d2c25b775136cea0b4112ee94d
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/infer_dygraph_torch.py
@@ -0,0 +1,325 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+
+import torch
+
+# torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention
+# delattr(torch.nn.functional, "scaled_dot_product_attention")
+
+import cv2
+import numpy as np
+from diffusers import (
+ FlowMatchEulerDiscreteScheduler,
+ DDIMScheduler,
+ DDPMScheduler,
+ DEISMultistepScheduler,
+ DPMSolverMultistepScheduler,
+ DPMSolverSinglestepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ HeunDiscreteScheduler,
+ KDPM2AncestralDiscreteScheduler,
+ KDPM2DiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+ StableDiffusion3Pipeline,
+ UniPCMultistepScheduler,
+)
+from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0
+from diffusers.utils import load_image
+from PIL import Image
+from tqdm.auto import trange
+
+
+
+def strtobool(v):
+ if isinstance(v, bool):
+ return v
+ if v.lower() in ("yes", "true", "t", "y", "1"):
+ return True
+ elif v.lower() in ("no", "false", "f", "n", "0"):
+ return False
+ else:
+ raise ValueError(
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
+ )
+
+
+def change_scheduler(self, scheduler_type="ddim"):
+ self.orginal_scheduler_config = self.scheduler.config
+ scheduler_type = scheduler_type.lower()
+ if scheduler_type == "flow":
+        scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "pndm":
+ scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True)
+ elif scheduler_type == "lms":
+ scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "heun":
+ scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "euler":
+ scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "euler-ancestral":
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "dpm-multi":
+ scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "dpm-single":
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "kdpm2-ancestral":
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "kdpm2":
+ scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "unipc-multi":
+ scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config)
+ elif scheduler_type == "ddim":
+ scheduler = DDIMScheduler.from_config(
+ self.orginal_scheduler_config,
+ steps_offset=1,
+ clip_sample=False,
+ set_alpha_to_one=False,
+ )
+ elif scheduler_type == "ddpm":
+ scheduler = DDPMScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
+ elif scheduler_type == "deis-multi":
+ scheduler = DEISMultistepScheduler.from_config(
+ self.orginal_scheduler_config,
+ )
+ else:
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+ return scheduler
+
+
+def parse_arguments():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--pretrained_model_name_or_path",
+ type=str,
+ default="stabilityai/stable-diffusion-3-medium-diffusers",
+ help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).",
+ )
+ parser.add_argument(
+ "--inference_steps",
+ type=int,
+ default=50,
+        help="The number of inference steps for the diffusion transformer.",
+ )
+ parser.add_argument(
+ "--benchmark_steps",
+ type=int,
+ default=10,
+ help="The number of performance benchmark steps.",
+ )
+ parser.add_argument(
+ "--task_name",
+ type=str,
+ default="all",
+ choices=[
+ "text2img",
+ "img2img",
+ "inpaint_legacy",
+ "all",
+ ],
+ help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ",
+ )
+ parser.add_argument(
+ "--parse_prompt_type",
+ type=str,
+ default="raw",
+ choices=[
+ "raw",
+ "lpw",
+ ],
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
+ )
+ parser.add_argument(
+ "--channels_last",
+ type=strtobool,
+ default=False,
+        help="Whether to use channels_last",
+ )
+    parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
+    parser.add_argument("--tf32", type=strtobool, default=True, help="Whether to allow TF32 matmul")
+    parser.add_argument("--compile", type=strtobool, default=False, help="Whether to use torch.compile")
+ parser.add_argument(
+ "--attention_type",
+ type=str,
+ default="sdp",
+ choices=[
+ "raw",
+ "sdp",
+ ],
+ help="attention_type.",
+ )
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
+ parser.add_argument(
+ "--scheduler",
+ type=str,
+ default="euler-ancestral",
+ choices=[
+ "flow",
+ "pndm",
+ "lms",
+ "euler",
+ "euler-ancestral",
+ "dpm-multi",
+ "dpm-single",
+ "unipc-multi",
+ "ddim",
+ "ddpm",
+ "deis-multi",
+ "heun",
+ "kdpm2-ancestral",
+ "kdpm2",
+ ],
+ help="The scheduler type of stable diffusion.",
+ )
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
+ parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint")
+ return parser.parse_args()
+
+
+def attn_processors(self):
+ processors = {}
+
+ def fn_recursive_add_processors(name: str, module, processors):
+ if hasattr(module, "set_processor"):
+ processors[f"{name}.processor"] = module.processor
+
+ for sub_name, child in module.named_children():
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+ return processors
+
+ for name, module in self.named_children():
+ fn_recursive_add_processors(name, module, processors)
+
+ return processors
+
+
+def set_attn_processor(self, processor):
+ count = len(attn_processors(self).keys())
+
+ if isinstance(processor, dict) and len(processor) != count:
+ raise ValueError(
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+ )
+
+ def fn_recursive_attn_processor(name: str, module, processor):
+ if hasattr(module, "set_processor"):
+ if not isinstance(processor, dict):
+ module.set_processor(processor)
+ else:
+ module.set_processor(processor.pop(f"{name}.processor"))
+
+ for sub_name, child in module.named_children():
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+ for name, module in self.named_children():
+ fn_recursive_attn_processor(name, module, processor)
+
+
+def main(args):
+ if args.tf32:
+ torch.backends.cuda.matmul.allow_tf32 = True
+ else:
+ torch.backends.cuda.matmul.allow_tf32 = False
+
+ seed = 1024
+ torch_dtype = torch.float16 if args.use_fp16 else torch.float32
+ pipe = StableDiffusion3Pipeline.from_pretrained(
+ args.pretrained_model_name_or_path,
+ safety_checker=None,
+ feature_extractor=None,
+ requires_safety_checker=False,
+ torch_dtype=torch_dtype,
+ )
+ scheduler = change_scheduler(pipe, args.scheduler)
+ pipe.scheduler = scheduler
+ if args.device_id >= 0:
+ pipe.to(f"cuda:{args.device_id}")
+
+ if args.attention_type == "all":
+ args.attention_type = ["raw", "sdp"]
+ else:
+ args.attention_type = [args.attention_type]
+
+ for attention_type in args.attention_type:
+ # attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0
+ # if attention_type == "sdp":
+ # torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_
+ # set_attn_processor(pipe.transformer, attn_prrocessor_cls())
+ # set_attn_processor(pipe.vae, attn_prrocessor_cls())
+
+ # if args.channels_last:
+ # pipe.transformer.to(memory_format=torch.channels_last)
+
+ # if args.compile:
+ # print("Run torch compile")
+ # pipe.unet = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
+
+ width = args.width
+ height = args.height
+ pipe.set_progress_bar_config(disable=False)
+
+ folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32"
+ os.makedirs(folder, exist_ok=True)
+ if args.task_name in ["text2img", "all"]:
+ init_image = load_image(
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
+ )
+ # text2img
+ prompt = "bird"
+ time_costs = []
+ # warmup
+ pipe(
+ prompt,
+ num_inference_steps=10,
+ height=height,
+ width=width,
+ )
+ print("==> Test text2img performance.")
+ for step in trange(args.benchmark_steps):
+ start = time.time()
+ torch.cuda.manual_seed(seed)
+ images = pipe(
+ prompt,
+ num_inference_steps=args.inference_steps,
+ height=height,
+ width=width,
+ ).images
+ latency = time.time() - start
+ time_costs += [latency]
+ # print(f"No {step:3d} time cost: {latency:2f} s")
+ print(
+ f"Attention type: {attention_type}, "
+ f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
+ f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+ )
+ images[0].save(f"{folder}/text2img.png")
+
+
+
+if __name__ == "__main__":
+ args = parse_arguments()
+ main(args)
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a0c2d8d45763db9d01e9a0245c02d55c6c0925ae
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh
@@ -0,0 +1,32 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# attention raw fp16
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention cutlass fp16
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention flash fp16
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+
+# attention raw fp32
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention cutlass fp32
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention flash fp32
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..020c54969a75651f919585dab0e67beaf016306e
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh
@@ -0,0 +1,21 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# sd3 does not support attention raw
+
+# attention sdp fp16
+python infer_dygraph_torch.py --scheduler "flow" --task_name all --attention_type sdp --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention sdp fp32
+python infer_dygraph_torch.py --scheduler "flow" --task_name all --attention_type sdp --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
\ No newline at end of file
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..61d490d683af75b2fc0af87435f7656e9e6d9b42
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+        description="Use PaddleMIX to accelerate the Stable Diffusion 3 image generation model."
+ )
+ parser.add_argument(
+ "--benchmark",
+ type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
+ default=False,
+ help="if set to True, measure inference performance",
+ )
+ parser.add_argument(
+ "--inference_optimize",
+ type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
+ default=False,
+        help="If set to True, the inference optimizations (including the Triton kernels) are enabled.",
+ )
+
+ parser.add_argument("--height", type=int, default=512, help="Height of the generated image.")
+ parser.add_argument("--width", type=int, default=512, help="Width of the generated image.")
+ parser.add_argument("--num-inference-steps", type=int, default=50, help="Number of inference steps.")
+ parser.add_argument("--dtype", type=str, default="float32", help="Inference data types.")
+ parser.add_argument(
+ "--mp_size", type=int, default=1, help="This size refers to the degree of parallelism using model parallel."
+ )
+ parser.add_argument(
+ "--dp_size", type=int, default=1, help="This size refers to the degree of parallelism using data parallel."
+ )
+
+ return parser.parse_args()
+
+
+args = parse_args()
+
+if args.inference_optimize:
+ os.environ["INFERENCE_OPTIMIZE"] = "True"
+ os.environ["INFERENCE_OPTIMIZE_TRITON"] = "True"
+ os.environ["INFERENCE_MP_SIZE"] = str(args.mp_size)
+ os.environ["INFERENCE_DP_SIZE"] = str(args.dp_size)
+if args.dtype == "float32":
+ inference_dtype = paddle.float32
+elif args.dtype == "float16":
+ inference_dtype = paddle.float16
+
+
+import paddle.distributed as dist
+import paddle.distributed.fleet as fleet
+
+if args.mp_size > 1 or args.dp_size > 1:
+ strategy = fleet.DistributedStrategy()
+ model_parallel_size = args.mp_size
+ data_parallel_size = args.dp_size
+ strategy.hybrid_configs = {"dp_degree": data_parallel_size, "mp_degree": model_parallel_size, "pp_degree": 1}
+ fleet.init(is_collective=True, strategy=strategy)
+ hcg = fleet.get_hybrid_communicate_group()
+ mp_id = hcg.get_model_parallel_rank()
+ dp_id = hcg.get_data_parallel_rank()
+ rank_id = dist.get_rank()
+ mp_degree = hcg.get_model_parallel_world_size()
+ dp_degree = hcg.get_data_parallel_world_size()
+ assert mp_degree == args.mp_size
+ assert dp_degree == args.dp_size
+
+ # this is for triton kernel cache for dynamic graph
+ # os.environ["TRITON_KERNEL_CACHE_DIR"] = f"./tmp/sd3_parallel/{rank_id}"
+
+import datetime
+
+from ppdiffusers import StableDiffusion3Pipeline
+
+pipe = StableDiffusion3Pipeline.from_pretrained(
+ "stabilityai/stable-diffusion-3-medium-diffusers",
+ paddle_dtype=inference_dtype,
+)
+
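+# Convert the MM-DiT transformer into a cached static graph for inference. The
+# exp_enable_use_cutlass flag enables the Cutlass gemm_epilogue fused kernels (optional, see the
+# README), and add_norm_fuse_pass is removed from the pass list.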
+pipe.transformer = paddle.incubate.jit.inference(
+ pipe.transformer,
+ save_model_dir="./tmp/sd3",
+ enable_new_ir=True,
+ cache_static_model=True,
+ exp_enable_use_cutlass=True,
+ delete_pass_lists=["add_norm_fuse_pass"],
+)
+
+generator = paddle.Generator().manual_seed(42)
+prompt = "A cat holding a sign that says hello world"
+
+
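+# The first call triggers static-graph capture/compilation; subsequent calls reuse the cached model.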
+image = pipe(
+ prompt, num_inference_steps=args.num_inference_steps, width=args.width, height=args.height, generator=generator
+).images[0]
+
+if args.benchmark:
+ # warmup
+ for i in range(3):
+ image = pipe(
+ prompt,
+ num_inference_steps=args.num_inference_steps,
+ width=args.width,
+ height=args.height,
+ generator=generator,
+ ).images[0]
+
+ repeat_times = 10
+ sumtime = 0.0
+ for i in range(repeat_times):
+ paddle.device.synchronize()
+ starttime = datetime.datetime.now()
+ image = pipe(
+ prompt,
+ num_inference_steps=args.num_inference_steps,
+ width=args.width,
+ height=args.height,
+ generator=generator,
+ ).images[0]
+ paddle.device.synchronize()
+ endtime = datetime.datetime.now()
+ duringtime = endtime - starttime
+ duringtime = duringtime.seconds * 1000 + duringtime.microseconds / 1000.0
+ sumtime += duringtime
+ print("SD3 end to end time : ", duringtime, "ms")
+
+ print("SD3 ave end to end time : ", sumtime / repeat_times, "ms")
+
+ cuda_mem_after_used = paddle.device.cuda.max_memory_allocated() / (1024**3)
+ print(f"Max used CUDA memory : {cuda_mem_after_used:.3f} GiB")
+
+
+rank_id = dist.get_rank()
+if rank_id == 0:
+ image.save("text_to_image_generation-stable_diffusion_3-result.png")
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sdxl/README.md b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sdxl/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..991712e0582c5dad1598e450eaef1c0b09873be1
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/deploy/sdxl/README.md
@@ -0,0 +1,44 @@
+# High-Performance Deployment of the Stable Diffusion XL Model with PaddleInfer
+
+ **Table of Contents**
+ * [Environment Requirements](#environment-requirements)
+ * [Static Graph Model Export](#static-graph-model-export)
+ * [Text-to-Image Generation](#text-to-image-generation)
+ * [Image-to-Image Text-Guided Generation](#image-to-image-text-guided-generation)
+ * [Text-Guided Image Inpainting](#text-guided-image-inpainting)
+
+⚡️[PaddleInfer] is an all-scenario, easy-to-use, flexible and highly efficient AI inference deployment tool that lets developers target multiple hardware platforms and inference-engine backends, switching between them with a single line of code. This example shows how to use PaddleInfer to deploy a Stable Diffusion XL model trained with PPDiffusers for high-performance inference across multiple hardware platforms and inference-engine backends.
+
+
+
+## Environment Requirements
+
+This example uses PaddleInfer; run the following command to install the dependencies.
+
+```shell
+python -m pip install paddlepaddle-gpu==2.6.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
+```
+
+
+
+## Static Graph Model Export
+```
+export USE_PPXFORMERS=False
+python export_model.py --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 --output_path static_model/stable-diffusion-xl-base-1.0
+```
+The exported model is placed under the static_model/stable-diffusion-xl-base-1.0 directory.
+
+### Text-to-Image Generation
+```
+python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name text2img
+```
+
+### Image-to-Image Text-Guided Generation
+```
+python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name img2img
+```
+
+### Text-Guided Image Inpainting
+```
+python infer.py --model_dir static_model/stable-diffusion-xl-base-1.0 --scheduler "preconfig-euler-ancestral" --backend paddle --device gpu --task_name inpaint
+```
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..afe6428281af43f57efb59b68bd1f918bf3bbd4c
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .value_guided_sampling import ValueGuidedRLPipeline
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..7024c5c94358fb40b62f653b1d7891dff12cd762
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py
@@ -0,0 +1,153 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+
+from ...models.unet_1d import UNet1DModel
+from ...pipelines import DiffusionPipeline
+from ...utils.dummy_paddle_objects import DDPMScheduler
+from ...utils.paddle_utils import randn_tensor
+
+
+class ValueGuidedRLPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for value-guided sampling from a diffusion model trained to predict sequences of states.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+ Parameters:
+ value_function ([`UNet1DModel`]):
+            A specialized UNet for fine-tuning trajectories based on reward.
+ unet ([`UNet1DModel`]):
+ UNet architecture to denoise the encoded trajectories.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded trajectories. Default for this
+ application is [`DDPMScheduler`].
+ env ():
+ An environment following the OpenAI gym API to act in. For now only Hopper has pretrained models.
+ """
+
+ def __init__(
+ self,
+ value_function: UNet1DModel,
+ unet: UNet1DModel,
+ scheduler: DDPMScheduler,
+ env,
+ ):
+ super().__init__()
+ self.value_function = value_function
+ self.unet = unet
+ self.scheduler = scheduler
+ self.env = env
+ self.data = env.get_dataset()
+ self.means = {}
+ for key in self.data.keys():
+ try:
+ self.means[key] = self.data[key].mean()
+ except Exception:
+ pass
+ self.stds = {}
+ for key in self.data.keys():
+ try:
+ self.stds[key] = self.data[key].std()
+ except Exception:
+ pass
+ self.state_dim = env.observation_space.shape[0]
+ self.action_dim = env.action_space.shape[0]
+
+ def normalize(self, x_in, key):
+ return (x_in - self.means[key]) / self.stds[key]
+
+ def de_normalize(self, x_in, key):
+ return x_in * self.stds[key] + self.means[key]
+
+ def to_paddle(self, x_in):
+ if isinstance(x_in, dict):
+ return {k: self.to_paddle(v) for k, v in x_in.items()}
+ elif paddle.is_tensor(x_in):
+ return x_in
+ return paddle.to_tensor(x_in)
+
+ def reset_x0(self, x_in, cond, act_dim):
+ for key, val in cond.items():
+ x_in[:, key, act_dim:] = val.clone()
+ return x_in
+
+ def run_diffusion(self, x, conditions, n_guide_steps, scale):
+ batch_size = x.shape[0]
+ y = None
+ for i in self.progress_bar(self.scheduler.timesteps):
+ # create batch of timesteps to pass into model
+ timesteps = paddle.full((batch_size,), i, dtype=paddle.int64)
+ for _ in range(n_guide_steps):
+ with paddle.set_grad_enabled(True):
+ x.stop_gradient = False
+
+ # permute to match dimension for pre-trained models
+ y = self.value_function(x.transpose([0, 2, 1]), timesteps).sample
+ grad = paddle.autograd.grad([y.sum()], [x])[0]
+
+ posterior_variance = self.scheduler._get_variance(i)
+ model_std = paddle.exp(0.5 * posterior_variance)
+ grad = model_std * grad
+
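+                    # zero out the guidance gradient for the final timesteps (t < 2)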
+ grad[timesteps < 2] = 0
+ x = x.detach()
+ x = x + scale * grad
+ x = self.reset_x0(x, conditions, self.action_dim)
+
+ prev_x = self.unet(x.transpose([0, 2, 1]), timesteps).sample.transpose([0, 2, 1])
+
+ # TODO: verify deprecation of this kwarg
+ x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"]
+
+ # apply conditions to the trajectory (set the initial state)
+ x = self.reset_x0(x, conditions, self.action_dim)
+ x = self.to_paddle(x)
+ return x, y
+
+ def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1):
+ # normalize the observations and create batch dimension
+ obs = self.normalize(obs, "observations")
+ obs = obs[None].repeat(batch_size, axis=0)
+
+ conditions = {0: self.to_paddle(obs)}
+ shape = (batch_size, planning_horizon, self.state_dim + self.action_dim)
+
+ # generate initial noise and apply our conditions (to make the trajectories start at current state)
+ x1 = randn_tensor(shape, dtype=self.unet.dtype)
+ x = self.reset_x0(x1, conditions, self.action_dim)
+ x = self.to_paddle(x)
+
+ # run the diffusion process
+ x, y = self.run_diffusion(x, conditions, n_guide_steps, scale)
+
+ # sort output trajectories by value
+ sorted_idx = paddle.argsort(y, 0, descending=True).squeeze()
+ sorted_values = x[sorted_idx]
+ actions = sorted_values[:, :, : self.action_dim]
+ actions = actions.detach().cpu().numpy()
+ denorm_actions = self.de_normalize(actions, key="actions")
+
+ # select the action with the highest value
+ if y is not None:
+ selected_index = 0
+ else:
+ # if we didn't run value guiding, select a random action
+ selected_index = np.random.randint(0, batch_size)
+
+ denorm_actions = denorm_actions[selected_index, 0]
+ return denorm_actions
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/pose_guider.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/pose_guider.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bd36a6caa677ae6910f01acbb87777d8cfc1430
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/pose_guider.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Tuple
+
+import paddle
+
+from ppdiffusers.models.animate_anyone.motion_module import zero_module
+from ppdiffusers.models.animate_anyone.resnet import InflatedConv3d
+from ppdiffusers.models.modeling_utils import ContextManagers, ModelMixin
+
+
+class PoseGuider(ModelMixin):
+ def __init__(
+ self,
+ conditioning_embedding_channels: int,
+ conditioning_channels: int = 3,
+ block_out_channels: Tuple[int] = (16, 32, 64, 128),
+ weight_dtype=None,
+ ):
+ super().__init__()
+
+ init_contexts = []
+ if weight_dtype is not None:
+ init_contexts.append(paddle.dtype_guard(weight_dtype))
+
+ with ContextManagers(init_contexts):
+ self.conv_in = InflatedConv3d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
+
+ self.blocks = paddle.nn.LayerList(sublayers=[])
+
+ for i in range(len(block_out_channels) - 1):
+ channel_in = block_out_channels[i]
+ channel_out = block_out_channels[i + 1]
+ self.blocks.append(InflatedConv3d(channel_in, channel_in, kernel_size=3, padding=1))
+ self.blocks.append(InflatedConv3d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
+
+ self.conv_out = zero_module(
+ InflatedConv3d(
+ block_out_channels[-1],
+ conditioning_embedding_channels,
+ kernel_size=3,
+ padding=1,
+ )
+ )
+
+ def forward(self, conditioning):
+ embedding = self.conv_in(conditioning)
+ embedding = paddle.nn.functional.silu(x=embedding)
+
+ for block in self.blocks:
+ embedding = block(embedding)
+ embedding = paddle.nn.functional.silu(x=embedding)
+
+ embedding = self.conv_out(embedding)
+
+ return embedding
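+
+
+# Shape sketch (hedged): with the default block_out_channels of (16, 32, 64, 128), the
+# three stride-2 InflatedConv3d layers downsample a pose sequence of shape
+# (batch, conditioning_channels, frames, H, W) to a guidance embedding of shape
+# (batch, conditioning_embedding_channels, frames, H / 8, W / 8), which is intended to
+# match the latent resolution consumed by the denoising UNet.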
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/resnet.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aff93940aef9e99752f86fada1ce7cf8f96d69d
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/resnet.py
@@ -0,0 +1,235 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py
+
+import paddle
+from einops import rearrange
+
+
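+# The "inflated" layers below reuse 2D image operators on 5D video tensors by folding the
+# frame axis into the batch axis, applying the 2D op, and unfolding afterwards: e.g. an
+# activation of shape (2, 320, 16, 64, 64) is reshaped to (32, 320, 64, 64), convolved or
+# normalized in 2D, then reshaped back to (2, 320, 16, 64, 64).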
+class InflatedConv3d(paddle.nn.Conv2D):
+ def forward(self, x):
+ video_length = x.shape[2]
+ x = rearrange(x, "b c f h w -> (b f) c h w")
+ x = super().forward(x)
+
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
+
+ return x
+
+
+class InflatedGroupNorm(paddle.nn.GroupNorm):
+ def forward(self, x):
+ video_length = x.shape[2]
+
+ x = rearrange(x, "b c f h w -> (b f) c h w")
+ x = super().forward(x)
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
+
+ return x
+
+
+class Upsample3D(paddle.nn.Layer):
+ def __init__(
+ self,
+ channels,
+ use_conv=False,
+ use_conv_transpose=False,
+ out_channels=None,
+ name="conv",
+ ):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.use_conv_transpose = use_conv_transpose
+ self.name = name
+
+ if use_conv_transpose:
+ raise NotImplementedError
+ elif use_conv:
+ self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1)
+
+ def forward(self, hidden_states, output_size=None):
+ assert hidden_states.shape[1] == self.channels
+
+ if self.use_conv_transpose:
+ raise NotImplementedError
+
+        # Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
+ dtype = hidden_states.dtype
+ if dtype == "bfloat16":
+ hidden_states = hidden_states.to("float32")
+
+        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+ if hidden_states.shape[0] >= 64:
+ hidden_states = hidden_states.contiguous()
+
+ if output_size is None:
+ hidden_states = paddle.nn.functional.interpolate(
+ x=hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest", data_format="NCDHW"
+ )
+ else:
+ hidden_states = paddle.nn.functional.interpolate(
+ x=hidden_states, size=output_size, mode="nearest", data_format="NCDHW"
+ )
+
+ # If the input is bfloat16, we cast back to bfloat16
+ if dtype == "bfloat16":
+ hidden_states = hidden_states.to(dtype)
+
+ hidden_states = self.conv(hidden_states)
+
+ return hidden_states
+
+
+class Downsample3D(paddle.nn.Layer):
+ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.padding = padding
+ stride = 2
+ self.name = name
+
+ if use_conv:
+ self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
+ else:
+ raise NotImplementedError
+
+ def forward(self, hidden_states):
+ assert hidden_states.shape[1] == self.channels
+ if self.use_conv and self.padding == 0:
+ raise NotImplementedError
+
+ assert hidden_states.shape[1] == self.channels
+ hidden_states = self.conv(hidden_states)
+
+ return hidden_states
+
+
+class ResnetBlock3D(paddle.nn.Layer):
+ def __init__(
+ self,
+ *,
+ in_channels,
+ out_channels=None,
+ conv_shortcut=False,
+ dropout=0.0,
+ temb_channels=512,
+ groups=32,
+ groups_out=None,
+ pre_norm=True,
+ eps=1e-6,
+ non_linearity="swish",
+ time_embedding_norm="default",
+ output_scale_factor=1.0,
+ use_in_shortcut=None,
+ use_inflated_groupnorm=None,
+ ):
+ super().__init__()
+ self.pre_norm = pre_norm
+ self.pre_norm = True
+ self.in_channels = in_channels
+ out_channels = in_channels if out_channels is None else out_channels
+ self.out_channels = out_channels
+ self.use_conv_shortcut = conv_shortcut
+ self.time_embedding_norm = time_embedding_norm
+ self.output_scale_factor = output_scale_factor
+
+ if groups_out is None:
+ groups_out = groups
+
+ assert use_inflated_groupnorm is not None
+ if use_inflated_groupnorm:
+ self.norm1 = InflatedGroupNorm(num_groups=groups, num_channels=in_channels, epsilon=eps)
+ else:
+ self.norm1 = paddle.nn.GroupNorm(
+ num_groups=groups, num_channels=in_channels, epsilon=eps, weight_attr=True, bias_attr=True
+ )
+
+ self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+ if temb_channels is not None:
+ if self.time_embedding_norm == "default":
+ time_emb_proj_out_channels = out_channels
+ elif self.time_embedding_norm == "scale_shift":
+ time_emb_proj_out_channels = out_channels * 2
+ else:
+ raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+
+ self.time_emb_proj = paddle.nn.Linear(in_features=temb_channels, out_features=time_emb_proj_out_channels)
+ else:
+ self.time_emb_proj = None
+
+ if use_inflated_groupnorm:
+ self.norm2 = InflatedGroupNorm(num_groups=groups_out, num_channels=out_channels, epsilon=eps)
+ else:
+ self.norm2 = paddle.nn.GroupNorm(
+ num_groups=groups_out, num_channels=out_channels, epsilon=eps, weight_attr=True, bias_attr=True
+ )
+ self.dropout = paddle.nn.Dropout(p=dropout)
+ self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+ if non_linearity == "swish":
+ self.nonlinearity = lambda x: paddle.nn.functional.silu(x=x)
+ elif non_linearity == "mish":
+ self.nonlinearity = Mish()
+ elif non_linearity == "silu":
+ self.nonlinearity = paddle.nn.Silu()
+
+ self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
+
+ self.conv_shortcut = None
+ if self.use_in_shortcut:
+ self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+ def forward(self, input_tensor, temb):
+ hidden_states = input_tensor
+
+ hidden_states = self.norm1(hidden_states)
+ hidden_states = self.nonlinearity(hidden_states)
+
+ hidden_states = self.conv1(hidden_states)
+
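+        # Time-embedding injection: with time_embedding_norm == "default" the projected
+        # embedding is added to the features before norm2; with "scale_shift" it is split
+        # into (scale, shift) and applied after norm2 as a FiLM-style modulation.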
+ if temb is not None:
+ temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]
+
+ if temb is not None and self.time_embedding_norm == "default":
+ hidden_states = hidden_states + temb
+
+ hidden_states = self.norm2(hidden_states)
+
+ if temb is not None and self.time_embedding_norm == "scale_shift":
+ scale, shift = paddle.chunk(x=temb, chunks=2, axis=1)
+ hidden_states = hidden_states * (1 + scale) + shift
+
+ hidden_states = self.nonlinearity(hidden_states)
+
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.conv2(hidden_states)
+
+ if self.conv_shortcut is not None:
+ input_tensor = self.conv_shortcut(input_tensor)
+
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+
+ return output_tensor
+
+
+class Mish(paddle.nn.Layer):
+ def forward(self, hidden_states):
+ return hidden_states * paddle.nn.functional.tanh(x=paddle.nn.functional.softplus(x=hidden_states))
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_3d.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f294f9afec90a6699bda8b9f7dc994cbad46654
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_3d.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional
+
+import paddle
+from einops import rearrange, repeat
+
+from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
+from ppdiffusers.models import ModelMixin
+from ppdiffusers.utils import BaseOutput
+
+from .attention import TemporalBasicTransformerBlock
+
+
+@dataclass
+class Transformer3DModelOutput(BaseOutput):
+ sample: paddle.Tensor
+
+
+class Transformer3DModel(ModelMixin, ConfigMixin):
+ _supports_gradient_checkpointing = True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_attention_heads: int = 16,
+ attention_head_dim: int = 88,
+ in_channels: Optional[int] = None,
+ num_layers: int = 1,
+ dropout: float = 0.0,
+ norm_num_groups: int = 32,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ use_linear_projection: bool = False,
+ only_cross_attention: bool = False,
+ upcast_attention: bool = False,
+ unet_use_cross_frame_attention=None,
+ unet_use_temporal_attention=None,
+ ):
+ super().__init__()
+ self.use_linear_projection = use_linear_projection
+ self.num_attention_heads = num_attention_heads
+ self.attention_head_dim = attention_head_dim
+ inner_dim = num_attention_heads * attention_head_dim
+
+ # Define input layers
+ self.in_channels = in_channels
+
+ self.norm = paddle.nn.GroupNorm(
+ num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06, weight_attr=True, bias_attr=True
+ )
+ if use_linear_projection:
+ self.proj_in = paddle.nn.Linear(in_features=in_channels, out_features=inner_dim)
+ else:
+ self.proj_in = paddle.nn.Conv2D(
+ in_channels=in_channels, out_channels=inner_dim, kernel_size=1, stride=1, padding=0
+ )
+ self.transformer_blocks = paddle.nn.LayerList(
+ sublayers=[
+ TemporalBasicTransformerBlock(
+ inner_dim,
+ num_attention_heads,
+ attention_head_dim,
+ dropout=dropout,
+ cross_attention_dim=cross_attention_dim,
+ activation_fn=activation_fn,
+ num_embeds_ada_norm=num_embeds_ada_norm,
+ attention_bias=attention_bias,
+ only_cross_attention=only_cross_attention,
+ upcast_attention=upcast_attention,
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+ unet_use_temporal_attention=unet_use_temporal_attention,
+ )
+ for d in range(num_layers)
+ ]
+ )
+ if use_linear_projection:
+ self.proj_out = paddle.nn.Linear(in_features=in_channels, out_features=inner_dim)
+ else:
+ self.proj_out = paddle.nn.Conv2D(
+ in_channels=inner_dim, out_channels=in_channels, kernel_size=1, stride=1, padding=0
+ )
+
+ self.gradient_checkpointing = False
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if hasattr(module, "gradient_checkpointing"):
+ module.gradient_checkpointing = value
+
+ def forward(
+ self,
+ hidden_states,
+ encoder_hidden_states=None,
+ timestep=None,
+ return_dict: bool = True,
+ ):
+ # Input
+ assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
+ video_length = hidden_states.shape[2]
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
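+        # Frames are folded into the batch axis above, so the text conditioning is broadcast
+        # to one copy per frame: every (b f) pseudo-batch element attends to the same prompt.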
+ if encoder_hidden_states.shape[0] != hidden_states.shape[0]:
+ encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=video_length)
+
+ batch, channel, height, weight = hidden_states.shape
+ residual = hidden_states
+
+ hidden_states = self.norm(hidden_states)
+ if not self.use_linear_projection:
+ hidden_states = self.proj_in(hidden_states)
+ inner_dim = hidden_states.shape[1]
+ hidden_states = hidden_states.transpose(perm=[0, 2, 3, 1]).reshape((batch, height * weight, inner_dim))
+ else:
+ inner_dim = hidden_states.shape[1]
+ hidden_states = hidden_states.transpose(perm=[0, 2, 3, 1]).reshape((batch, height * weight, inner_dim))
+ hidden_states = self.proj_in(hidden_states)
+
+ # Blocks
+ for i, block in enumerate(self.transformer_blocks):
+ hidden_states = block(
+ hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ timestep=timestep,
+ video_length=video_length,
+ )
+
+ # Output
+ if not self.use_linear_projection:
+ hidden_states = hidden_states.reshape((batch, height, weight, inner_dim)).transpose(perm=[0, 3, 1, 2])
+ hidden_states = self.proj_out(hidden_states)
+ else:
+ hidden_states = self.proj_out(hidden_states)
+ hidden_states = hidden_states.reshape((batch, height, weight, inner_dim)).transpose(perm=[0, 3, 1, 2])
+
+ output = hidden_states + residual
+
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
+ if not return_dict:
+ return (output,)
+
+ return Transformer3DModelOutput(sample=output)
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5e5645e7abe55191e5dfe004a3446270aa22df1
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d.py
@@ -0,0 +1,615 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Adapted from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/unet_blocks.py
+
+from dataclasses import dataclass
+from os import PathLike
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+
+import paddle
+
+from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
+from ppdiffusers.models.attention_processor import AttentionProcessor
+from ppdiffusers.models.embeddings import TimestepEmbedding, Timesteps
+from ppdiffusers.models.modeling_utils import ContextManagers, ModelMixin
+from ppdiffusers.utils import BaseOutput, logging
+
+from .resnet import InflatedConv3d, InflatedGroupNorm
+from .unet_3d_blocks import UNetMidBlock3DCrossAttn, get_down_block, get_up_block
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+@dataclass
+class UNet3DConditionOutput(BaseOutput):
+ sample: paddle.Tensor
+
+
+class UNet3DConditionModel(ModelMixin, ConfigMixin):
+ _supports_gradient_checkpointing = True
+
+ @register_to_config
+ def __init__(
+ self,
+ sample_size: Optional[int] = None,
+ in_channels: int = 4,
+ out_channels: int = 4,
+ center_input_sample: bool = False,
+ flip_sin_to_cos: bool = True,
+ freq_shift: int = 0,
+ down_block_types: Tuple[str] = (
+ "CrossAttnDownBlock3D",
+ "CrossAttnDownBlock3D",
+ "CrossAttnDownBlock3D",
+ "DownBlock3D",
+ ),
+ mid_block_type: str = "UNetMidBlock3DCrossAttn",
+ up_block_types: Tuple[str] = (
+ "UpBlock3D",
+ "CrossAttnUpBlock3D",
+ "CrossAttnUpBlock3D",
+ "CrossAttnUpBlock3D",
+ ),
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+ layers_per_block: int = 2,
+ downsample_padding: int = 1,
+ mid_block_scale_factor: float = 1,
+ act_fn: str = "silu",
+ norm_num_groups: int = 32,
+ norm_eps: float = 1e-5,
+ cross_attention_dim: int = 1280,
+ attention_head_dim: Union[int, Tuple[int]] = 8,
+ dual_cross_attention: bool = False,
+ use_linear_projection: bool = False,
+ class_embed_type: Optional[str] = None,
+ num_class_embeds: Optional[int] = None,
+ upcast_attention: bool = False,
+ resnet_time_scale_shift: str = "default",
+ use_inflated_groupnorm=False,
+ # Additional
+ use_motion_module=False,
+ motion_module_resolutions=(1, 2, 4, 8),
+ motion_module_mid_block=False,
+ motion_module_decoder_only=False,
+ motion_module_type=None,
+ motion_module_kwargs={},
+ unet_use_cross_frame_attention=None,
+ unet_use_temporal_attention=None,
+ ):
+ super().__init__()
+
+ self.sample_size = sample_size
+ time_embed_dim = block_out_channels[0] * 4
+
+ # input
+ self.conv_in = InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
+
+ # time
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+ timestep_input_dim = block_out_channels[0]
+
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+ # class embedding
+ if class_embed_type is None and num_class_embeds is not None:
+ self.class_embedding = paddle.nn.Embedding(num_embeddings=num_class_embeds, embedding_dim=time_embed_dim)
+ elif class_embed_type == "timestep":
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+ elif class_embed_type == "identity":
+ self.class_embedding = paddle.nn.Identity(time_embed_dim, time_embed_dim)
+ else:
+ self.class_embedding = None
+
+ self.down_blocks = paddle.nn.LayerList(sublayers=[])
+ self.mid_block = None
+ self.up_blocks = paddle.nn.LayerList(sublayers=[])
+
+ if isinstance(only_cross_attention, bool):
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
+
+ if isinstance(attention_head_dim, int):
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
+ # down
+ output_channel = block_out_channels[0]
+ for i, down_block_type in enumerate(down_block_types):
+ res = 2**i
+ input_channel = output_channel
+ output_channel = block_out_channels[i]
+ is_final_block = i == len(block_out_channels) - 1
+
+ down_block = get_down_block(
+ down_block_type,
+ num_layers=layers_per_block,
+ in_channels=input_channel,
+ out_channels=output_channel,
+ temb_channels=time_embed_dim,
+ add_downsample=not is_final_block,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ cross_attention_dim=cross_attention_dim,
+ attn_num_head_channels=attention_head_dim[i],
+ downsample_padding=downsample_padding,
+ dual_cross_attention=dual_cross_attention,
+ use_linear_projection=use_linear_projection,
+ only_cross_attention=only_cross_attention[i],
+ upcast_attention=upcast_attention,
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+ unet_use_temporal_attention=unet_use_temporal_attention,
+ use_inflated_groupnorm=use_inflated_groupnorm,
+ use_motion_module=use_motion_module
+ and (res in motion_module_resolutions)
+ and (not motion_module_decoder_only),
+ motion_module_type=motion_module_type,
+ motion_module_kwargs=motion_module_kwargs,
+ )
+ self.down_blocks.append(down_block)
+
+ # mid
+ if mid_block_type == "UNetMidBlock3DCrossAttn":
+ self.mid_block = UNetMidBlock3DCrossAttn(
+ in_channels=block_out_channels[-1],
+ temb_channels=time_embed_dim,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ output_scale_factor=mid_block_scale_factor,
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ cross_attention_dim=cross_attention_dim,
+ attn_num_head_channels=attention_head_dim[-1],
+ resnet_groups=norm_num_groups,
+ dual_cross_attention=dual_cross_attention,
+ use_linear_projection=use_linear_projection,
+ upcast_attention=upcast_attention,
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+ unet_use_temporal_attention=unet_use_temporal_attention,
+ use_inflated_groupnorm=use_inflated_groupnorm,
+ use_motion_module=use_motion_module and motion_module_mid_block,
+ motion_module_type=motion_module_type,
+ motion_module_kwargs=motion_module_kwargs,
+ )
+ else:
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+
+ # count how many layers upsample the videos
+ self.num_upsamplers = 0
+
+ # up
+ reversed_block_out_channels = list(reversed(block_out_channels))
+ reversed_attention_head_dim = list(reversed(attention_head_dim))
+ only_cross_attention = list(reversed(only_cross_attention))
+ output_channel = reversed_block_out_channels[0]
+ for i, up_block_type in enumerate(up_block_types):
+ res = 2 ** (3 - i)
+ is_final_block = i == len(block_out_channels) - 1
+
+ prev_output_channel = output_channel
+ output_channel = reversed_block_out_channels[i]
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+ # add upsample block for all BUT final layer
+ if not is_final_block:
+ add_upsample = True
+ self.num_upsamplers += 1
+ else:
+ add_upsample = False
+
+ up_block = get_up_block(
+ up_block_type,
+ num_layers=layers_per_block + 1,
+ in_channels=input_channel,
+ out_channels=output_channel,
+ prev_output_channel=prev_output_channel,
+ temb_channels=time_embed_dim,
+ add_upsample=add_upsample,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ cross_attention_dim=cross_attention_dim,
+ attn_num_head_channels=reversed_attention_head_dim[i],
+ dual_cross_attention=dual_cross_attention,
+ use_linear_projection=use_linear_projection,
+ only_cross_attention=only_cross_attention[i],
+ upcast_attention=upcast_attention,
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+ unet_use_temporal_attention=unet_use_temporal_attention,
+ use_inflated_groupnorm=use_inflated_groupnorm,
+ use_motion_module=use_motion_module and (res in motion_module_resolutions),
+ motion_module_type=motion_module_type,
+ motion_module_kwargs=motion_module_kwargs,
+ )
+ self.up_blocks.append(up_block)
+ prev_output_channel = output_channel
+
+ # out
+ if use_inflated_groupnorm:
+ self.conv_norm_out = InflatedGroupNorm(
+ num_channels=block_out_channels[0],
+ num_groups=norm_num_groups,
+ epsilon=norm_eps,
+ )
+ else:
+ self.conv_norm_out = paddle.nn.GroupNorm(
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps
+ )
+ self.conv_act = paddle.nn.Silu()
+ self.conv_out = InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
+
+ @property
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
+ r"""
+ Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model,
+            indexed by their weight names.
+ """
+ # set recursively
+ processors = {}
+
+ def fn_recursive_add_processors(
+ name: str,
+ module: paddle.nn.Layer,
+ processors: Dict[str, AttentionProcessor],
+ ):
+ if hasattr(module, "set_processor"):
+ processors[f"{name}.processor"] = module.processor
+
+ for sub_name, child in module.named_children():
+ if "temporal_transformer" not in sub_name:
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+ return processors
+
+ for name, module in self.named_children():
+ if "temporal_transformer" not in name:
+ fn_recursive_add_processors(name, module, processors)
+
+ return processors
+
+ def set_attention_slice(self, slice_size):
+ r"""
+ Enable sliced attention computation.
+
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+ Args:
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                `"max"`, the maximum amount of memory will be saved by running only one slice at a time. If a number is
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+ must be a multiple of `slice_size`.
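+
+        Example (hedged sketch; `unet` names an already constructed `UNet3DConditionModel`):
+
+            unet.set_attention_slice("auto")  # split each attention head dimension in half
+            unet.set_attention_slice("max")   # one slice at a time, lowest peak memory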
+ """
+ sliceable_head_dims = []
+
+ def fn_recursive_retrieve_slicable_dims(module: paddle.nn.Layer):
+ if hasattr(module, "set_attention_slice"):
+ sliceable_head_dims.append(module.sliceable_head_dim)
+
+ for child in module.children():
+ fn_recursive_retrieve_slicable_dims(child)
+
+ # retrieve number of attention layers
+ for module in self.children():
+ fn_recursive_retrieve_slicable_dims(module)
+
+ num_slicable_layers = len(sliceable_head_dims)
+
+ if slice_size == "auto":
+ # half the attention head size is usually a good trade-off between
+ # speed and memory
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
+ elif slice_size == "max":
+ # make smallest slice possible
+ slice_size = num_slicable_layers * [1]
+
+ slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+
+ if len(slice_size) != len(sliceable_head_dims):
+ raise ValueError(
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+ )
+
+ for i in range(len(slice_size)):
+ size = slice_size[i]
+ dim = sliceable_head_dims[i]
+ if size is not None and size > dim:
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+ # Recursively walk through all the children.
+ # Any children which exposes the set_attention_slice method
+ # gets the message
+ def fn_recursive_set_attention_slice(module: paddle.nn.Layer, slice_size: List[int]):
+ if hasattr(module, "set_attention_slice"):
+ module.set_attention_slice(slice_size.pop())
+
+ for child in module.children():
+ fn_recursive_set_attention_slice(child, slice_size)
+
+ reversed_slice_size = list(reversed(slice_size))
+ for module in self.children():
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if hasattr(module, "gradient_checkpointing"):
+ module.gradient_checkpointing = value
+
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+ r"""
+ Sets the attention processor to use to compute attention.
+
+ Parameters:
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
+ for **all** `Attention` layers.
+
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+ processor. This is strongly recommended when setting trainable attention processors.
+
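+        Example (hedged sketch, assuming `AttnProcessor` is exported by
+        `ppdiffusers.models.attention_processor` as in upstream diffusers):
+
+            from ppdiffusers.models.attention_processor import AttnProcessor
+            unet.set_attn_processor(AttnProcessor())
+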
+ """
+ count = len(self.attn_processors.keys())
+
+ if isinstance(processor, dict) and len(processor) != count:
+ raise ValueError(
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+ )
+
+ def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer, processor):
+ if hasattr(module, "set_processor"):
+ if not isinstance(processor, dict):
+ module.set_processor(processor)
+ else:
+ module.set_processor(processor.pop(f"{name}.processor"))
+
+ for sub_name, child in module.named_children():
+ if "temporal_transformer" not in sub_name:
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+ for name, module in self.named_children():
+ if "temporal_transformer" not in name:
+ fn_recursive_attn_processor(name, module, processor)
+
+ def forward(
+ self,
+ sample: paddle.Tensor,
+ timestep: Union[paddle.Tensor, float, int],
+ encoder_hidden_states: paddle.Tensor,
+ class_labels: Optional[paddle.Tensor] = None,
+ pose_cond_fea: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
+ mid_block_additional_residual: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ ) -> Union[UNet3DConditionOutput, Tuple]:
+ r"""
+ Args:
+            sample (`paddle.Tensor`): (batch, channel, num_frames, height, width) noisy inputs tensor
+ timestep (`paddle.Tensor` or `float` or `int`): (batch) timesteps
+ encoder_hidden_states (`paddle.Tensor`): (batch, sequence_length, feature_dim) encoder hidden states
+ return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`UNet3DConditionOutput`] instead of a plain tuple.
+
+ Returns:
+            [`UNet3DConditionOutput`] or `tuple`:
+            [`UNet3DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+ """
+        # By default, samples have to be at least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (number of upsampling layers).
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
+ # on the fly if necessary.
+ default_overall_up_factor = 2**self.num_upsamplers
+
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+ forward_upsample_size = False
+ upsample_size = None
+
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+ logger.info("Forward upsample size to force interpolation output size.")
+ forward_upsample_size = True
+
+ # prepare attention_mask
+ if attention_mask is not None:
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+ attention_mask = attention_mask.unsqueeze(1)
+
+ # center input if necessary
+ if self.config.center_input_sample:
+ sample = 2 * sample - 1.0
+
+ # time
+ timesteps = timestep
+ if not paddle.is_tensor(timesteps):
+            # Paddle has no MPS backend, so a plain float32/int64 scalar tensor is sufficient here.
+            dtype = "float32" if isinstance(timestep, float) else "int64"
+            timesteps = paddle.to_tensor([timesteps], dtype=dtype)
+ elif len(timesteps.shape) == 0:
+ timesteps = timesteps[None]
+
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand([sample.shape[0]])
+
+ t_emb = self.time_proj(timesteps)
+
+ # timesteps does not contain any weights and will always return f32 tensors
+ # but time_embedding might actually be running in fp16. so we need to cast here.
+ # there might be better ways to encapsulate this.
+ t_emb = t_emb.to(dtype=self.dtype)
+ emb = self.time_embedding(t_emb)
+
+ if self.class_embedding is not None:
+ if class_labels is None:
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+ if self.config.class_embed_type == "timestep":
+ class_labels = self.time_proj(class_labels)
+
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+ emb = emb + class_emb
+
+ # pre-process
+
+ sample = self.conv_in(sample)
+
+ if pose_cond_fea is not None:
+ sample = sample + pose_cond_fea
+
+ # down
+ down_block_res_samples = (sample,)
+ for downsample_block in self.down_blocks:
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+
+ sample, res_samples = downsample_block(
+ hidden_states=sample,
+ temb=emb,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ )
+
+ else:
+ sample, res_samples = downsample_block(
+ hidden_states=sample,
+ temb=emb,
+ encoder_hidden_states=encoder_hidden_states,
+ )
+
+ down_block_res_samples += res_samples
+
+ if down_block_additional_residuals is not None:
+ new_down_block_res_samples = ()
+
+ for down_block_res_sample, down_block_additional_residual in zip(
+ down_block_res_samples, down_block_additional_residuals
+ ):
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
+ new_down_block_res_samples += (down_block_res_sample,)
+
+ down_block_res_samples = new_down_block_res_samples
+
+ # mid
+ sample = self.mid_block(
+ sample,
+ emb,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ )
+
+ if mid_block_additional_residual is not None:
+ sample = sample + mid_block_additional_residual
+
+ # up
+ for i, upsample_block in enumerate(self.up_blocks):
+ is_final_block = i == len(self.up_blocks) - 1
+
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+ # if we have not reached the final block and need to forward the
+ # upsample size, we do it here
+ if not is_final_block and forward_upsample_size:
+ upsample_size = down_block_res_samples[-1].shape[2:]
+
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+ sample = upsample_block(
+ hidden_states=sample,
+ temb=emb,
+ res_hidden_states_tuple=res_samples,
+ encoder_hidden_states=encoder_hidden_states,
+ upsample_size=upsample_size,
+ attention_mask=attention_mask,
+ )
+ else:
+ sample = upsample_block(
+ hidden_states=sample,
+ temb=emb,
+ res_hidden_states_tuple=res_samples,
+ upsample_size=upsample_size,
+ encoder_hidden_states=encoder_hidden_states,
+ )
+
+ # post-process
+ sample = self.conv_norm_out(sample)
+ sample = self.conv_act(sample)
+ sample = self.conv_out(sample)
+
+ if not return_dict:
+ return (sample,)
+
+ return UNet3DConditionOutput(sample=sample)
+
+ @classmethod
+ def from_pretrained_2d(
+ cls,
+ denoising_unet_config_path: Optional[Union[str, PathLike]],
+ base_model_path: Optional[Union[str, PathLike]] = None,
+ motion_module_path: Optional[Union[str, PathLike]] = None,
+ weight_dtype=None,
+ unet_additional_kwargs=None,
+ ):
+
+ config_file = denoising_unet_config_path
+ if not (Path(config_file).exists() and Path(config_file).is_file()):
+ raise RuntimeError(f"{config_file} does not exist or is not a file")
+
+ unet_config = cls.load_config(config_file)
+ unet_config["_class_name"] = cls.__name__
+ unet_config["down_block_types"] = [
+ "CrossAttnDownBlock3D",
+ "CrossAttnDownBlock3D",
+ "CrossAttnDownBlock3D",
+ "DownBlock3D",
+ ]
+ unet_config["up_block_types"] = [
+ "UpBlock3D",
+ "CrossAttnUpBlock3D",
+ "CrossAttnUpBlock3D",
+ "CrossAttnUpBlock3D",
+ ]
+ unet_config["mid_block_type"] = "UNetMidBlock3DCrossAttn"
+
+ init_contexts = []
+ if weight_dtype is not None:
+ init_contexts.append(paddle.dtype_guard(weight_dtype))
+
+ with ContextManagers(init_contexts):
+            model = cls.from_config(unet_config, **(unet_additional_kwargs or {}))
+
+ state_dict = paddle.load(base_model_path)
+
+ # motion module updating
+ if motion_module_path is not None:
+ motion_state_dict = paddle.load(motion_module_path)
+ state_dict.update(motion_state_dict)
+
+ if weight_dtype is not None:
+ for k in state_dict.keys():
+ state_dict[k] = state_dict[k].astype(weight_dtype)
+
+ m, u = model.set_state_dict(state_dict)
+ print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
+
+ return model
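+
+
+# Hedged usage sketch; the paths below are placeholders for a 2D denoising-UNet config,
+# its base weights, and an optional motion-module checkpoint:
+#
+#   unet = UNet3DConditionModel.from_pretrained_2d(
+#       denoising_unet_config_path="./config.json",
+#       base_model_path="./denoising_unet.pdparams",
+#       motion_module_path="./motion_module.pdparams",
+#       weight_dtype=paddle.float16,
+#       unet_additional_kwargs={"use_motion_module": True},
+#   )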
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef05224cf6aff170028d4e2e50ce4f3572bc9387
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/__init__.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Union
+
+import numpy as np
+import paddle
+
+import ppdiffusers
+
+from .unet import UNet3DConditionModel # noqa: *
+
+
+@dataclass
+class HotshotPipelineXLOutput(ppdiffusers.utils.BaseOutput):
+ videos: Union[paddle.Tensor, np.ndarray]
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/resnet.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..85f2f60e155b2094be815f83b900548368027939
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/resnet.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from einops import rearrange
+
+import ppdiffusers
+from ppdiffusers.models import resnet
+
+
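+# The 3D layers below wrap their 2D ppdiffusers counterparts: the frame axis is folded
+# into the batch axis via einops.rearrange, the unchanged 2D forward pass runs, and the
+# result is unfolded back to (b, c, f, h, w), so pretrained 2D weights load directly.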
+class Upsample3D(resnet.Upsample2D):
+ def forward(self, hidden_states, output_size=None, scale: float = 1.0):
+ f = tuple(hidden_states.shape)[2]
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+ hidden_states = super(Upsample3D, self).forward(hidden_states, output_size, scale)
+ return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+
+
+class Downsample3D(ppdiffusers.models.resnet.Downsample2D):
+ def forward(self, hidden_states, scale: float = 1.0):
+ f = tuple(hidden_states.shape)[2]
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+ hidden_states = super(Downsample3D, self).forward(hidden_states, scale)
+ return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+
+
+class Conv3d(ppdiffusers.models.resnet.LoRACompatibleConv):
+ def forward(self, hidden_states, scale: float = 1.0):
+ f = tuple(hidden_states.shape)[2]
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+ hidden_states = super().forward(hidden_states, scale)
+ return rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+
+
+class ResnetBlock3D(paddle.nn.Layer):
+ def __init__(
+ self,
+ *,
+ in_channels,
+ out_channels=None,
+ conv_shortcut=False,
+ dropout=0.0,
+ temb_channels=512,
+ groups=32,
+ groups_out=None,
+ pre_norm=True,
+ eps=1e-06,
+ non_linearity="silu",
+ time_embedding_norm="default",
+ output_scale_factor=1.0,
+ use_in_shortcut=None,
+ conv_shortcut_bias: bool = True
+ ):
+ super().__init__()
+ self.pre_norm = pre_norm
+ self.pre_norm = True
+ self.in_channels = in_channels
+ out_channels = in_channels if out_channels is None else out_channels
+ self.out_channels = out_channels
+ self.use_conv_shortcut = conv_shortcut
+ self.time_embedding_norm = time_embedding_norm
+ self.output_scale_factor = output_scale_factor
+ if groups_out is None:
+ groups_out = groups
+ self.norm1 = paddle.nn.GroupNorm(
+ num_groups=groups, num_channels=in_channels, epsilon=eps, weight_attr=True, bias_attr=True
+ )
+ self.conv1 = Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+ if temb_channels is not None:
+ if self.time_embedding_norm == "default":
+ time_emb_proj_out_channels = out_channels
+ elif self.time_embedding_norm == "scale_shift":
+ time_emb_proj_out_channels = out_channels * 2
+ else:
+ raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+ self.time_emb_proj = paddle.nn.Linear(in_features=temb_channels, out_features=time_emb_proj_out_channels)
+ else:
+ self.time_emb_proj = None
+ self.norm2 = paddle.nn.GroupNorm(
+ num_groups=groups_out, num_channels=out_channels, epsilon=eps, weight_attr=True, bias_attr=True
+ )
+ self.dropout = paddle.nn.Dropout(p=dropout)
+ self.conv2 = Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+ assert non_linearity == "silu"
+ self.nonlinearity = paddle.nn.Silu()
+ self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
+ self.conv_shortcut = None
+ if self.use_in_shortcut:
+ self.conv_shortcut = Conv3d(
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias_attr=conv_shortcut_bias
+ )
+
+ def forward(self, input_tensor, temb):
+ hidden_states = input_tensor
+ hidden_states = self.norm1(hidden_states)
+ hidden_states = self.nonlinearity(hidden_states)
+ hidden_states = self.conv1(hidden_states)
+ if temb is not None:
+ temb = self.nonlinearity(temb)
+ temb = self.time_emb_proj(temb)[:, :, None, None, None]
+ if temb is not None and self.time_embedding_norm == "default":
+ hidden_states = hidden_states + temb
+ hidden_states = self.norm2(hidden_states)
+ if temb is not None and self.time_embedding_norm == "scale_shift":
+ scale, shift = paddle.chunk(x=temb, chunks=2, axis=1)
+ hidden_states = hidden_states * (1 + scale) + shift
+ hidden_states = self.nonlinearity(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.conv2(hidden_states)
+ if self.conv_shortcut is not None:
+ input_tensor = self.conv_shortcut(input_tensor)
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+ return output_tensor
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_3d.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c387c4a905e5f7207cf576fc4a06bc88066d9ba
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_3d.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+import paddle
+from einops import rearrange, repeat
+
+import ppdiffusers
+
+
+@dataclass
+class Transformer3DModelOutput(ppdiffusers.utils.BaseOutput):
+ """
+ The output of [`Transformer3DModel`].
+
+ Args:
+        sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`):
+ The hidden states output conditioned on the `encoder_hidden_states` input.
+ """
+
+    sample: paddle.Tensor
+
+
+class Transformer3DModel(ppdiffusers.models.transformer_2d.Transformer2DModel):
+ def __init__(self, *args, **kwargs):
+ super(Transformer3DModel, self).__init__(*args, **kwargs)
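+        # Zero-initialize the output projection so this temporal transformer starts as an
+        # identity mapping (its residual contribution is zero) and does not perturb the
+        # pretrained 2D behaviour at the start of fine-tuning.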
+ init_Constant = paddle.nn.initializer.Constant(value=0.0)
+ init_Constant(self.proj_out.weight.data)
+ init_Constant = paddle.nn.initializer.Constant(value=0.0)
+ init_Constant(self.proj_out.bias.data)
+
+ def forward(
+ self,
+ hidden_states: paddle.Tensor,
+ encoder_hidden_states: Optional[paddle.Tensor] = None,
+ timestep: Optional[int] = None,
+ class_labels: Optional[int] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ encoder_attention_mask: Optional[paddle.Tensor] = None,
+ enable_temporal_layers: bool = True,
+ positional_embedding: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ ):
+ is_video = len(tuple(hidden_states.shape)) == 5
+ if is_video:
+ f = tuple(hidden_states.shape)[2]
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+ encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=f)
+ hidden_states = super(Transformer3DModel, self).forward(
+ hidden_states,
+ encoder_hidden_states,
+ timestep,
+ class_labels,
+ cross_attention_kwargs,
+ attention_mask,
+ encoder_attention_mask,
+ return_dict=False,
+ )[0]
+ if is_video:
+ hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=f)
+ if not return_dict:
+ return (hidden_states,)
+ return Transformer3DModelOutput(sample=hidden_states)
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..39fae6fe6ecdd4a6e619a415836d0aa57c0196ac
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet.py
@@ -0,0 +1,778 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import paddle
+
+import ppdiffusers
+from ppdiffusers import loaders, transformers # noqa: *
+
+from .resnet import Conv3d
+from .unet_blocks import (
+ CrossAttnDownBlock3D,
+ CrossAttnUpBlock3D,
+ DownBlock3D,
+ UNetMidBlock3DCrossAttn,
+ UpBlock3D,
+ get_down_block,
+ get_up_block,
+)
+
+logger = ppdiffusers.utils.logging.get_logger(__name__)
+
+
+@dataclass
+class UNet3DConditionOutput(ppdiffusers.utils.BaseOutput):
+ """
+    The output of [`UNet3DConditionModel`].
+
+    Args:
+        sample (`paddle.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
+            The hidden states output conditioned on `encoder_hidden_states` input. Output of the last layer of the model.
+    """
+
+    sample: paddle.Tensor = None
+
+
+class UNet3DConditionModel(
+ ppdiffusers.models.modeling_utils.ModelMixin,
+ ppdiffusers.configuration_utils.ConfigMixin,
+ loaders.UNet2DConditionLoadersMixin,
+):
+ _supports_gradient_checkpointing = True
+
+ @ppdiffusers.configuration_utils.register_to_config
+ def __init__(
+ self,
+ sample_size: Optional[int] = None,
+ in_channels: int = 4,
+ out_channels: int = 4,
+ center_input_sample: bool = False,
+ flip_sin_to_cos: bool = True,
+ freq_shift: int = 0,
+ down_block_types: Tuple[str] = ("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
+ mid_block_type: Optional[str] = "UNetMidBlock3DCrossAttn",
+ up_block_types: Tuple[str] = ("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+ layers_per_block: Union[int, Tuple[int]] = 2,
+ downsample_padding: int = 1,
+ mid_block_scale_factor: float = 1,
+ act_fn: str = "silu",
+ norm_num_groups: Optional[int] = 32,
+ norm_eps: float = 1e-05,
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+ encoder_hid_dim: Optional[int] = None,
+ encoder_hid_dim_type: Optional[str] = None,
+ attention_head_dim: Union[int, Tuple[int]] = 8,
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+ dual_cross_attention: bool = False,
+ use_linear_projection: bool = False,
+ class_embed_type: Optional[str] = None,
+ addition_embed_type: Optional[str] = None,
+ addition_time_embed_dim: Optional[int] = None,
+ num_class_embeds: Optional[int] = None,
+ upcast_attention: bool = False,
+ resnet_time_scale_shift: str = "default",
+ resnet_skip_time_act: bool = False,
+ resnet_out_scale_factor: int = 1.0,
+ time_embedding_type: str = "positional",
+ time_embedding_dim: Optional[int] = None,
+ time_embedding_act_fn: Optional[str] = None,
+ timestep_post_act: Optional[str] = None,
+ time_cond_proj_dim: Optional[int] = None,
+ conv_in_kernel: int = 3,
+ conv_out_kernel: int = 3,
+ projection_class_embeddings_input_dim: Optional[int] = None,
+ class_embeddings_concat: bool = False,
+ mid_block_only_cross_attention: Optional[bool] = None,
+ cross_attention_norm: Optional[str] = None,
+ addition_embed_type_num_heads=64,
+ ):
+ super().__init__()
+ self.sample_size = sample_size
+ if num_attention_heads is not None:
+ raise ValueError(
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+ )
+ num_attention_heads = num_attention_heads or attention_head_dim
+ if len(down_block_types) != len(up_block_types):
+ raise ValueError(
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+ )
+ if len(block_out_channels) != len(down_block_types):
+ raise ValueError(
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+ )
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+ raise ValueError(
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+ )
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+ raise ValueError(
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+ )
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
+ raise ValueError(
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+ )
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+ raise ValueError(
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+ )
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+ raise ValueError(
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+ )
+ conv_in_padding = (conv_in_kernel - 1) // 2
+ self.conv_in = Conv3d(in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding)
+ if time_embedding_type == "fourier":
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
+ if time_embed_dim % 2 != 0:
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
+ self.time_proj = ppdiffusers.models.embeddings.GaussianFourierProjection(
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+ )
+ timestep_input_dim = time_embed_dim
+ elif time_embedding_type == "positional":
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
+ self.time_proj = ppdiffusers.models.embeddings.Timesteps(
+ block_out_channels[0], flip_sin_to_cos, freq_shift
+ )
+ timestep_input_dim = block_out_channels[0]
+ else:
+ raise ValueError(
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
+ )
+ self.time_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
+ timestep_input_dim,
+ time_embed_dim,
+ act_fn=act_fn,
+ post_act_fn=timestep_post_act,
+ cond_proj_dim=time_cond_proj_dim,
+ )
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+ encoder_hid_dim_type = "text_proj"
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+ raise ValueError(
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+ )
+ if encoder_hid_dim_type == "text_proj":
+ self.encoder_hid_proj = paddle.nn.Linear(in_features=encoder_hid_dim, out_features=cross_attention_dim)
+ elif encoder_hid_dim_type == "text_image_proj":
+ self.encoder_hid_proj = ppdiffusers.models.embeddings.TextImageProjection(
+ text_embed_dim=encoder_hid_dim,
+ image_embed_dim=cross_attention_dim,
+ cross_attention_dim=cross_attention_dim,
+ )
+ elif encoder_hid_dim_type == "image_proj":
+ self.encoder_hid_proj = ppdiffusers.models.embeddings.ImageProjection(
+ image_embed_dim=encoder_hid_dim, cross_attention_dim=cross_attention_dim
+ )
+ elif encoder_hid_dim_type is not None:
+ raise ValueError(
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+ )
+ else:
+ self.encoder_hid_proj = None
+ if class_embed_type is None and num_class_embeds is not None:
+ self.class_embedding = paddle.nn.Embedding(num_embeddings=num_class_embeds, embedding_dim=time_embed_dim)
+ elif class_embed_type == "timestep":
+ self.class_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
+ timestep_input_dim, time_embed_dim, act_fn=act_fn
+ )
+ elif class_embed_type == "identity":
+ self.class_embedding = paddle.nn.Identity(time_embed_dim, time_embed_dim)
+ elif class_embed_type == "projection":
+ if projection_class_embeddings_input_dim is None:
+ raise ValueError(
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+ )
+ self.class_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
+ projection_class_embeddings_input_dim, time_embed_dim
+ )
+ elif class_embed_type == "simple_projection":
+ if projection_class_embeddings_input_dim is None:
+ raise ValueError(
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
+ )
+ self.class_embedding = paddle.nn.Linear(
+ in_features=projection_class_embeddings_input_dim, out_features=time_embed_dim
+ )
+ else:
+ self.class_embedding = None
+ if addition_embed_type == "text":
+ if encoder_hid_dim is not None:
+ text_time_embedding_from_dim = encoder_hid_dim
+ else:
+ text_time_embedding_from_dim = cross_attention_dim
+ self.add_embedding = ppdiffusers.models.embeddings.TextTimeEmbedding(
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+ )
+ elif addition_embed_type == "text_image":
+ self.add_embedding = ppdiffusers.models.embeddings.TextImageTimeEmbedding(
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+ )
+ elif addition_embed_type == "text_time":
+ self.add_time_proj = ppdiffusers.models.embeddings.Timesteps(
+ addition_time_embed_dim, flip_sin_to_cos, freq_shift
+ )
+ self.add_embedding = ppdiffusers.models.embeddings.TimestepEmbedding(
+ projection_class_embeddings_input_dim, time_embed_dim
+ )
+ elif addition_embed_type == "image":
+ self.add_embedding = ppdiffusers.models.embeddings.ImageTimeEmbedding(
+ image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
+ )
+ elif addition_embed_type == "image_hint":
+ self.add_embedding = ppdiffusers.models.embeddings.ImageHintTimeEmbedding(
+ image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
+ )
+ elif addition_embed_type is not None:
+            raise ValueError(
+                f"`addition_embed_type`: {addition_embed_type} must be None, 'text', 'text_image', 'text_time', 'image' or 'image_hint'."
+            )
+ if time_embedding_act_fn is None:
+ self.time_embed_act = None
+ else:
+ self.time_embed_act = ppdiffusers.models.activations.get_activation(time_embedding_act_fn)
+ self.down_blocks = paddle.nn.LayerList(sublayers=[])
+ self.up_blocks = paddle.nn.LayerList(sublayers=[])
+ if isinstance(only_cross_attention, bool):
+ if mid_block_only_cross_attention is None:
+ mid_block_only_cross_attention = only_cross_attention
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
+ if mid_block_only_cross_attention is None:
+ mid_block_only_cross_attention = False
+ if isinstance(num_attention_heads, int):
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
+ if isinstance(attention_head_dim, int):
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
+ if isinstance(cross_attention_dim, int):
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+ if isinstance(layers_per_block, int):
+ layers_per_block = [layers_per_block] * len(down_block_types)
+ if isinstance(transformer_layers_per_block, int):
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+ if class_embeddings_concat:
+ blocks_time_embed_dim = time_embed_dim * 2
+ else:
+ blocks_time_embed_dim = time_embed_dim
+ output_channel = block_out_channels[0]
+ for i, down_block_type in enumerate(down_block_types):
+ res = 2**i
+ input_channel = output_channel
+ output_channel = block_out_channels[i]
+ is_final_block = i == len(block_out_channels) - 1
+ down_block = get_down_block(
+ down_block_type,
+ num_layers=layers_per_block[i],
+ transformer_layers_per_block=transformer_layers_per_block[i],
+ in_channels=input_channel,
+ out_channels=output_channel,
+ temb_channels=blocks_time_embed_dim,
+ add_downsample=not is_final_block,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ cross_attention_dim=cross_attention_dim[i],
+ num_attention_heads=num_attention_heads[i],
+ downsample_padding=downsample_padding,
+ dual_cross_attention=dual_cross_attention,
+ use_linear_projection=use_linear_projection,
+ only_cross_attention=only_cross_attention[i],
+ upcast_attention=upcast_attention,
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ resnet_skip_time_act=resnet_skip_time_act,
+ resnet_out_scale_factor=resnet_out_scale_factor,
+ cross_attention_norm=cross_attention_norm,
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+ )
+ self.down_blocks.append(down_block)
+ if mid_block_type == "UNetMidBlock3DCrossAttn":
+ self.mid_block = UNetMidBlock3DCrossAttn(
+ transformer_layers_per_block=transformer_layers_per_block[-1],
+ in_channels=block_out_channels[-1],
+ temb_channels=blocks_time_embed_dim,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ output_scale_factor=mid_block_scale_factor,
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ cross_attention_dim=cross_attention_dim[-1],
+ num_attention_heads=num_attention_heads[-1],
+ resnet_groups=norm_num_groups,
+ dual_cross_attention=dual_cross_attention,
+ use_linear_projection=use_linear_projection,
+ upcast_attention=upcast_attention,
+ )
+ elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
+ raise ValueError("UNetMidBlock2DSimpleCrossAttn not supported")
+ elif mid_block_type is None:
+ self.mid_block = None
+ else:
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+ self.num_upsamplers = 0
+ reversed_block_out_channels = list(reversed(block_out_channels))
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
+ reversed_layers_per_block = list(reversed(layers_per_block))
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
+ only_cross_attention = list(reversed(only_cross_attention))
+ output_channel = reversed_block_out_channels[0]
+ for i, up_block_type in enumerate(up_block_types):
+            res = 2 ** (len(up_block_types) - 1 - i)  # noqa: F841
+ is_final_block = i == len(block_out_channels) - 1
+ prev_output_channel = output_channel
+ output_channel = reversed_block_out_channels[i]
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+ if not is_final_block:
+ add_upsample = True
+ self.num_upsamplers += 1
+ else:
+ add_upsample = False
+ up_block = get_up_block(
+ up_block_type,
+ num_layers=reversed_layers_per_block[i] + 1,
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
+ in_channels=input_channel,
+ out_channels=output_channel,
+ prev_output_channel=prev_output_channel,
+ temb_channels=blocks_time_embed_dim,
+ add_upsample=add_upsample,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ cross_attention_dim=reversed_cross_attention_dim[i],
+ num_attention_heads=reversed_num_attention_heads[i],
+ dual_cross_attention=dual_cross_attention,
+ use_linear_projection=use_linear_projection,
+ only_cross_attention=only_cross_attention[i],
+ upcast_attention=upcast_attention,
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ resnet_skip_time_act=resnet_skip_time_act,
+ resnet_out_scale_factor=resnet_out_scale_factor,
+ cross_attention_norm=cross_attention_norm,
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+ )
+ self.up_blocks.append(up_block)
+ prev_output_channel = output_channel
+ if norm_num_groups is not None:
+ self.conv_norm_out = paddle.nn.GroupNorm(
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps
+ )
+ self.conv_act = ppdiffusers.models.activations.get_activation(act_fn)
+ else:
+ self.conv_norm_out = None
+ self.conv_act = None
+ conv_out_padding = (conv_out_kernel - 1) // 2
+ self.conv_out = Conv3d(
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+ )
+
+ def temporal_parameters(self) -> list:
+ output = []
+ all_blocks = list(self.down_blocks) + list(self.up_blocks) + [self.mid_block]
+ for block in all_blocks:
+ output.extend(block.temporal_parameters())
+ return output
+
+ @property
+ def attn_processors(self) -> Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor]:
+ return self.get_attn_processors(include_temporal_layers=False)
+
+ def get_attn_processors(
+ self, include_temporal_layers=True
+ ) -> Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor]:
+ """
+ Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model,
+            indexed by their weight names.
+ """
+ processors = {}
+
+ def fn_recursive_add_processors(
+ name: str,
+ module: paddle.nn.Layer,
+ processors: Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor],
+ ):
+ if not include_temporal_layers:
+ if "temporal" in name:
+ return processors
+ if hasattr(module, "set_processor"):
+ processors[f"{name}.processor"] = module.processor
+ for sub_name, child in module.named_children():
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+ return processors
+
+ for name, module in self.named_children():
+ fn_recursive_add_processors(name, module, processors)
+ return processors
+
+ def set_attn_processor(
+ self,
+ processor: Union[
+ ppdiffusers.models.attention_processor.AttentionProcessor,
+ Dict[str, ppdiffusers.models.attention_processor.AttentionProcessor],
+ ],
+ include_temporal_layers=False,
+ ):
+ """
+ Sets the attention processor to use to compute attention.
+
+ Parameters:
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
+ for **all** `Attention` layers.
+
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+ processor. This is strongly recommended when setting trainable attention processors.
+
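+        Example:
+            A minimal sketch; `unet` is assumed to be an already constructed model:
+
+            ```python
+            from ppdiffusers.models.attention_processor import AttnProcessor
+
+            # use the default processor for every spatial attention layer
+            unet.set_attn_processor(AttnProcessor())
+            ```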
+ """
+ count = len(self.get_attn_processors(include_temporal_layers=include_temporal_layers).keys())
+ if isinstance(processor, dict) and len(processor) != count:
+ raise ValueError(
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the number of attention layers: {count}. Please make sure to pass {count} processor classes."
+ )
+
+ def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer, processor):
+ if not include_temporal_layers:
+ if "temporal" in name:
+ return
+ if hasattr(module, "set_processor"):
+ if not isinstance(processor, dict):
+ module.set_processor(processor)
+ else:
+ module.set_processor(processor.pop(f"{name}.processor"))
+ for sub_name, child in module.named_children():
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+ for name, module in self.named_children():
+ fn_recursive_attn_processor(name, module, processor)
+
+ def set_default_attn_processor(self):
+ """
+ Disables custom attention processors and sets the default attention implementation.
+ """
+ self.set_attn_processor(ppdiffusers.models.attention_processor.AttnProcessor())
+
+ def set_attention_slice(self, slice_size):
+ """
+ Enable sliced attention computation.
+
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+ Args:
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+ must be a multiple of `slice_size`.
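+
+        Example:
+            Illustrative only; `unet` is assumed to be an already constructed model:
+
+            ```python
+            # halve the head dimension of every sliceable attention layer
+            unet.set_attention_slice("auto")
+            ```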
+ """
+ sliceable_head_dims = []
+
+ def fn_recursive_retrieve_sliceable_dims(module: paddle.nn.Layer):
+ if hasattr(module, "set_attention_slice"):
+ sliceable_head_dims.append(module.sliceable_head_dim)
+ for child in module.children():
+ fn_recursive_retrieve_sliceable_dims(child)
+
+ for module in self.children():
+ fn_recursive_retrieve_sliceable_dims(module)
+ num_sliceable_layers = len(sliceable_head_dims)
+ if slice_size == "auto":
+ slice_size = [(dim // 2) for dim in sliceable_head_dims]
+ elif slice_size == "max":
+ slice_size = num_sliceable_layers * [1]
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+ if len(slice_size) != len(sliceable_head_dims):
+ raise ValueError(
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+ )
+ for i in range(len(slice_size)):
+ size = slice_size[i]
+ dim = sliceable_head_dims[i]
+ if size is not None and size > dim:
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+ def fn_recursive_set_attention_slice(module: paddle.nn.Layer, slice_size: List[int]):
+ if hasattr(module, "set_attention_slice"):
+ module.set_attention_slice(slice_size.pop())
+ for child in module.children():
+ fn_recursive_set_attention_slice(child, slice_size)
+
+ reversed_slice_size = list(reversed(slice_size))
+ for module in self.children():
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
+ module.gradient_checkpointing = value
+
+ def forward(
+ self,
+        sample: paddle.Tensor,
+ timestep: Union[paddle.Tensor, float, int],
+ encoder_hidden_states: paddle.Tensor,
+ class_labels: Optional[paddle.Tensor] = None,
+ timestep_cond: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ added_cond_kwargs: Optional[Dict[str, paddle.Tensor]] = None,
+ down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
+ mid_block_additional_residual: Optional[paddle.Tensor] = None,
+ encoder_attention_mask: Optional[paddle.Tensor] = None,
+ return_dict: bool = True,
+ enable_temporal_attentions: bool = True,
+ ) -> Union[UNet3DConditionOutput, Tuple]:
+ """
+        The [`UNet3DConditionModel`] forward method.
+
+ Args:
+ sample (`paddle.FloatTensor`):
+                The noisy input tensor with the following shape `(batch, channel, num_frames, height, width)`.
+ timestep (`paddle.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+ encoder_hidden_states (`paddle.FloatTensor`):
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+ encoder_attention_mask (`paddle.Tensor`):
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`UNet3DConditionOutput`] instead of a plain tuple.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+ added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+ are passed along to the UNet blocks.
+
+ Returns:
+            [`UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, a [`UNet3DConditionOutput`] is returned, otherwise a `tuple` is returned
+                where the first element is the sample tensor.
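+
+        Example:
+            A minimal sketch; `unet` is assumed to be an already constructed `UNet3DConditionModel`, and the
+            shapes below are illustrative (they must match the model's config):
+
+            ```python
+            import paddle
+
+            sample = paddle.randn([1, 4, 8, 64, 64])  # (batch, channel, num_frames, height, width)
+            encoder_hidden_states = paddle.randn([1, 77, 1024])  # last dim must equal cross_attention_dim
+            out = unet(sample, timestep=10, encoder_hidden_states=encoder_hidden_states)
+            video_latents = out.sample
+            ```
+
+            SDXL-style configs (`addition_embed_type="text_time"`) additionally require `added_cond_kwargs`
+            with `text_embeds` and `time_ids`.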
+ """
+ default_overall_up_factor = 2**self.num_upsamplers
+ forward_upsample_size = False
+ upsample_size = None
+ if any(s % default_overall_up_factor != 0 for s in tuple(sample.shape)[-2:]):
+ logger.info("Forward upsample size to force interpolation output size.")
+ forward_upsample_size = True
+ if attention_mask is not None:
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+ attention_mask = attention_mask.unsqueeze(axis=1)
+ if encoder_attention_mask is not None:
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(axis=1)
+ if self.config.center_input_sample:
+ sample = 2 * sample - 1.0
+ timesteps = timestep
+ if not paddle.is_tensor(x=timesteps):
+            # Paddle has no MPS backend and its tensors expose `.place` rather than torch's
+            # `.device`, so the dtype is chosen from the Python type of `timestep` alone.
+            dtype = "float64" if isinstance(timestep, float) else "int64"
+            timesteps = paddle.to_tensor(data=[timesteps], dtype=dtype, place=sample.place)
+ elif len(tuple(timesteps.shape)) == 0:
+ timesteps = timesteps[None].to(sample.place)
+        timesteps = timesteps.expand(shape=[tuple(sample.shape)[0]])
+ t_emb = self.time_proj(timesteps)
+ t_emb = t_emb.to(dtype=sample.dtype)
+ emb = self.time_embedding(t_emb, timestep_cond)
+ aug_emb = None
+ if self.class_embedding is not None:
+ if class_labels is None:
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
+ if self.config.class_embed_type == "timestep":
+ class_labels = self.time_proj(class_labels)
+ class_labels = class_labels.to(dtype=sample.dtype)
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+ if self.config.class_embeddings_concat:
+ emb = paddle.concat(x=[emb, class_emb], axis=-1)
+ else:
+ emb = emb + class_emb
+ if self.config.addition_embed_type == "text":
+ aug_emb = self.add_embedding(encoder_hidden_states)
+ elif self.config.addition_embed_type == "text_image":
+ if "image_embeds" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+ )
+ image_embs = added_cond_kwargs.get("image_embeds")
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+ aug_emb = self.add_embedding(text_embs, image_embs)
+ elif self.config.addition_embed_type == "text_time":
+ if "text_embeds" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+ )
+ text_embeds = added_cond_kwargs.get("text_embeds")
+ if "time_ids" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+ )
+ time_ids = added_cond_kwargs.get("time_ids")
+ time_embeds = self.add_time_proj(time_ids.flatten())
+ time_embeds = time_embeds.reshape((tuple(text_embeds.shape)[0], -1))
+ add_embeds = paddle.concat(x=[text_embeds, time_embeds], axis=-1)
+ add_embeds = add_embeds.to(emb.dtype)
+ aug_emb = self.add_embedding(add_embeds)
+ elif self.config.addition_embed_type == "image":
+ if "image_embeds" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+ )
+ image_embs = added_cond_kwargs.get("image_embeds")
+ aug_emb = self.add_embedding(image_embs)
+ elif self.config.addition_embed_type == "image_hint":
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+ )
+ image_embs = added_cond_kwargs.get("image_embeds")
+ hint = added_cond_kwargs.get("hint")
+ aug_emb, hint = self.add_embedding(image_embs, hint)
+ sample = paddle.concat(x=[sample, hint], axis=1)
+ emb = emb + aug_emb if aug_emb is not None else emb
+ if self.time_embed_act is not None:
+ emb = self.time_embed_act(emb)
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+ if "image_embeds" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
+ )
+ image_embeds = added_cond_kwargs.get("image_embeds")
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+ if "image_embeds" not in added_cond_kwargs:
+ raise ValueError(
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
+ )
+ image_embeds = added_cond_kwargs.get("image_embeds")
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+ sample = self.conv_in(sample)
+ down_block_res_samples = (sample,)
+ for downsample_block in self.down_blocks:
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+ sample, res_samples = downsample_block(
+ hidden_states=sample,
+ temb=emb,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ cross_attention_kwargs=cross_attention_kwargs,
+ enable_temporal_attentions=enable_temporal_attentions,
+ )
+ else:
+ sample, res_samples = downsample_block(
+ hidden_states=sample,
+ temb=emb,
+ encoder_hidden_states=encoder_hidden_states,
+ enable_temporal_attentions=enable_temporal_attentions,
+ )
+ down_block_res_samples += res_samples
+ if down_block_additional_residuals is not None:
+ new_down_block_res_samples = ()
+ for down_block_res_sample, down_block_additional_residual in zip(
+ down_block_res_samples, down_block_additional_residuals
+ ):
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+ down_block_res_samples = new_down_block_res_samples
+ if self.mid_block is not None:
+ sample = self.mid_block(
+ sample,
+ emb,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ cross_attention_kwargs=cross_attention_kwargs,
+ enable_temporal_attentions=enable_temporal_attentions,
+ )
+ if mid_block_additional_residual is not None:
+ sample = sample + mid_block_additional_residual
+ for i, upsample_block in enumerate(self.up_blocks):
+ is_final_block = i == len(self.up_blocks) - 1
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+ if not is_final_block and forward_upsample_size:
+ upsample_size = tuple(down_block_res_samples[-1].shape)[2:]
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+ sample = upsample_block(
+ hidden_states=sample,
+ temb=emb,
+ res_hidden_states_tuple=res_samples,
+ encoder_hidden_states=encoder_hidden_states,
+ cross_attention_kwargs=cross_attention_kwargs,
+ upsample_size=upsample_size,
+ attention_mask=attention_mask,
+ enable_temporal_attentions=enable_temporal_attentions,
+ )
+ else:
+ sample = upsample_block(
+ hidden_states=sample,
+ temb=emb,
+ res_hidden_states_tuple=res_samples,
+ upsample_size=upsample_size,
+ encoder_hidden_states=encoder_hidden_states,
+ enable_temporal_attentions=enable_temporal_attentions,
+ )
+ if self.conv_norm_out:
+ sample = self.conv_norm_out(sample)
+ sample = self.conv_act(sample)
+ sample = self.conv_out(sample)
+ if not return_dict:
+ return (sample,)
+ return UNet3DConditionOutput(sample=sample)
+
+ @classmethod
+ def from_pretrained_spatial(cls, pretrained_model_path, subfolder=None):
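+        """
+        Instantiate a `UNet3DConditionModel` from a spatial (2D) UNet checkpoint directory.
+
+        The checkpoint's `config.json` is rewritten to use the 3D block classes, the spatial weights are
+        loaded with `set_state_dict`, and the temporal layers keep their freshly initialized parameters
+        (trainable via `temporal_parameters()`).
+
+        A minimal sketch (the local path is illustrative):
+
+        ```python
+        unet = UNet3DConditionModel.from_pretrained_spatial("./sdxl-base-unet", subfolder="unet")
+        ```
+        """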
+ import json
+
+ if subfolder is not None:
+ pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
+ config_file = os.path.join(pretrained_model_path, "config.json")
+ with open(config_file, "r") as f:
+ config = json.load(f)
+ config["_class_name"] = "UNet3DConditionModel"
+ config["down_block_types"] = ["DownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D"]
+ config["up_block_types"] = ["CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "UpBlock3D"]
+ config["mid_block_type"] = "UNetMidBlock3DCrossAttn"
+ model = cls.from_config(config)
+ model_files = [
+ os.path.join(pretrained_model_path, "diffusion_paddle_model.bin"),
+ os.path.join(pretrained_model_path, "diffusion_paddle_model.safetensors"),
+ ]
+ model_file = None
+ for fp in model_files:
+ if os.path.exists(fp):
+ model_file = fp
+        if model_file is None:
+            raise RuntimeError(f"no model weights found under {pretrained_model_path}; expected one of {model_files}")
+ if model_file.split(".")[-1] == "safetensors":
+ from safetensors import safe_open
+
+ state_dict = {}
+            # Read tensors as numpy arrays so they can be fed directly to Paddle's set_state_dict.
+            with safe_open(model_file, framework="np") as f:
+ for key in f.keys():
+ state_dict[key] = f.get_tensor(key)
+ else:
+ state_dict = paddle.load(path=model_file)
+ model.set_state_dict(state_dict=state_dict, use_structured_name=False)
+ return model
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet_blocks.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet_blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..093c3b912d7c5d4e382848fba1a984d7450bd1ad
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/unet_blocks.py
@@ -0,0 +1,717 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.distributed.fleet.utils import recompute
+
+from .resnet import Downsample3D, ResnetBlock3D, Upsample3D
+from .transformer_3d import Transformer3DModel
+from .transformer_temporal import TransformerTemporal
+
+
+def get_down_block(
+ down_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ temb_channels,
+ add_downsample,
+ resnet_eps,
+ resnet_act_fn,
+ transformer_layers_per_block=1,
+ num_attention_heads=None,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ downsample_padding=None,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ resnet_time_scale_shift="default",
+ resnet_skip_time_act=False,
+ resnet_out_scale_factor=1.0,
+ cross_attention_norm=None,
+ attention_head_dim=None,
+ downsample_type=None,
+):
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+ if down_block_type == "DownBlock3D":
+ return DownBlock3D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ add_downsample=add_downsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ downsample_padding=downsample_padding,
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ )
+ elif down_block_type == "CrossAttnDownBlock3D":
+ if cross_attention_dim is None:
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
+ return CrossAttnDownBlock3D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ transformer_layers_per_block=transformer_layers_per_block,
+ temb_channels=temb_channels,
+ add_downsample=add_downsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ downsample_padding=downsample_padding,
+ cross_attention_dim=cross_attention_dim,
+ num_attention_heads=num_attention_heads,
+ dual_cross_attention=dual_cross_attention,
+ use_linear_projection=use_linear_projection,
+ only_cross_attention=only_cross_attention,
+ upcast_attention=upcast_attention,
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ )
+ raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(
+ up_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ prev_output_channel,
+ temb_channels,
+ add_upsample,
+ resnet_eps,
+ resnet_act_fn,
+ transformer_layers_per_block=1,
+ num_attention_heads=None,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ resnet_time_scale_shift="default",
+ resnet_skip_time_act=False,
+ resnet_out_scale_factor=1.0,
+ cross_attention_norm=None,
+ attention_head_dim=None,
+ upsample_type=None,
+):
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+ if up_block_type == "UpBlock3D":
+ return UpBlock3D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ prev_output_channel=prev_output_channel,
+ temb_channels=temb_channels,
+ add_upsample=add_upsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ )
+ elif up_block_type == "CrossAttnUpBlock3D":
+ if cross_attention_dim is None:
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
+ return CrossAttnUpBlock3D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ transformer_layers_per_block=transformer_layers_per_block,
+ out_channels=out_channels,
+ prev_output_channel=prev_output_channel,
+ temb_channels=temb_channels,
+ add_upsample=add_upsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ cross_attention_dim=cross_attention_dim,
+ num_attention_heads=num_attention_heads,
+ dual_cross_attention=dual_cross_attention,
+ use_linear_projection=use_linear_projection,
+ only_cross_attention=only_cross_attention,
+ upcast_attention=upcast_attention,
+ resnet_time_scale_shift=resnet_time_scale_shift,
+ )
+ raise ValueError(f"{up_block_type} does not exist.")
+
+
+class UNetMidBlock3DCrossAttn(paddle.nn.Layer):
+ def __init__(
+ self,
+ in_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ transformer_layers_per_block: int = 1,
+ resnet_eps: float = 1e-06,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ num_attention_heads=1,
+ output_scale_factor=1.0,
+ cross_attention_dim=1280,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ upcast_attention=False,
+ ):
+ super().__init__()
+ self.has_cross_attention = True
+ self.num_attention_heads = num_attention_heads
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+ resnets = [
+ ResnetBlock3D(
+ in_channels=in_channels,
+ out_channels=in_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ ]
+ attentions = []
+ for _ in range(num_layers):
+ if dual_cross_attention:
+ raise NotImplementedError
+ attentions.append(
+ Transformer3DModel(
+ num_attention_heads,
+ in_channels // num_attention_heads,
+ in_channels=in_channels,
+ num_layers=transformer_layers_per_block,
+ cross_attention_dim=cross_attention_dim,
+ norm_num_groups=resnet_groups,
+ use_linear_projection=use_linear_projection,
+ upcast_attention=upcast_attention,
+ )
+ )
+ resnets.append(
+ ResnetBlock3D(
+ in_channels=in_channels,
+ out_channels=in_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ self.attentions = paddle.nn.LayerList(sublayers=attentions)
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
+
+ def forward(
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ enable_temporal_attentions: bool = True,
+ ):
+ hidden_states = self.resnets[0](hidden_states, temb)
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
+ hidden_states = resnet(hidden_states, temb)
+ return hidden_states
+
+ def temporal_parameters(self) -> list:
+ return []
+
+
+class CrossAttnDownBlock3D(paddle.nn.Layer):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ transformer_layers_per_block: int = 1,
+ resnet_eps: float = 1e-06,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ num_attention_heads=1,
+ cross_attention_dim=1280,
+ output_scale_factor=1.0,
+ downsample_padding=1,
+ add_downsample=True,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ ):
+ super().__init__()
+ resnets = []
+ attentions = []
+ temporal_attentions = []
+ self.has_cross_attention = True
+ self.num_attention_heads = num_attention_heads
+ for i in range(num_layers):
+ in_channels = in_channels if i == 0 else out_channels
+ resnets.append(
+ ResnetBlock3D(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ if dual_cross_attention:
+ raise NotImplementedError
+ attentions.append(
+ Transformer3DModel(
+ num_attention_heads,
+ out_channels // num_attention_heads,
+ in_channels=out_channels,
+ num_layers=transformer_layers_per_block,
+ cross_attention_dim=cross_attention_dim,
+ norm_num_groups=resnet_groups,
+ use_linear_projection=use_linear_projection,
+ only_cross_attention=only_cross_attention,
+ upcast_attention=upcast_attention,
+ )
+ )
+ temporal_attentions.append(
+ TransformerTemporal(
+ num_attention_heads=8,
+ attention_head_dim=out_channels // 8,
+ in_channels=out_channels,
+ cross_attention_dim=None,
+ )
+ )
+ self.attentions = paddle.nn.LayerList(sublayers=attentions)
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
+ if add_downsample:
+ self.downsamplers = paddle.nn.LayerList(
+ sublayers=[
+ Downsample3D(
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+ )
+ ]
+ )
+ else:
+ self.downsamplers = None
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states,
+ temb=None,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ cross_attention_kwargs=None,
+ enable_temporal_attentions: bool = True,
+ ):
+ output_states = ()
+ for resnet, attn, temporal_attention in zip(self.resnets, self.attentions, self.temporal_attentions):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+
+ def create_custom_forward(module, return_dict=None):
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
+ hidden_states = recompute(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ use_reentrant=False,
+ )[0]
+ if enable_temporal_attentions and temporal_attention is not None:
+ hidden_states = recompute(
+ create_custom_forward(temporal_attention),
+ hidden_states,
+ encoder_hidden_states,
+ use_reentrant=False,
+ )
+ else:
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
+ if temporal_attention and enable_temporal_attentions:
+ hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
+ output_states += (hidden_states,)
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+ output_states += (hidden_states,)
+ return hidden_states, output_states
+
+ def temporal_parameters(self) -> list:
+ output = []
+ for block in self.temporal_attentions:
+ if block:
+ output.extend(block.parameters())
+ return output
+
+
+class DownBlock3D(paddle.nn.Layer):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-06,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor=1.0,
+ add_downsample=True,
+ downsample_padding=1,
+ ):
+ super().__init__()
+ resnets = []
+ temporal_attentions = []
+ for i in range(num_layers):
+ in_channels = in_channels if i == 0 else out_channels
+ resnets.append(
+ ResnetBlock3D(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ temporal_attentions.append(
+ TransformerTemporal(
+ num_attention_heads=8,
+ attention_head_dim=out_channels // 8,
+ in_channels=out_channels,
+ cross_attention_dim=None,
+ )
+ )
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
+ if add_downsample:
+ self.downsamplers = paddle.nn.LayerList(
+ sublayers=[
+ Downsample3D(
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+ )
+ ]
+ )
+ else:
+ self.downsamplers = None
+ self.gradient_checkpointing = False
+
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None, enable_temporal_attentions: bool = True):
+ output_states = ()
+ for resnet, temporal_attention in zip(self.resnets, self.temporal_attentions):
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
+ if enable_temporal_attentions and temporal_attention is not None:
+ hidden_states = recompute(
+ create_custom_forward(temporal_attention),
+ hidden_states,
+ encoder_hidden_states,
+ use_reentrant=False,
+ )
+ else:
+ hidden_states = resnet(hidden_states, temb)
+ if enable_temporal_attentions and temporal_attention:
+ hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
+ output_states += (hidden_states,)
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+ output_states += (hidden_states,)
+ return hidden_states, output_states
+
+ def temporal_parameters(self) -> list:
+ output = []
+ for block in self.temporal_attentions:
+ if block:
+ output.extend(block.parameters())
+ return output
+
+
+class CrossAttnUpBlock3D(paddle.nn.Layer):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ prev_output_channel: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ transformer_layers_per_block: int = 1,
+ resnet_eps: float = 1e-06,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ num_attention_heads=1,
+ cross_attention_dim=1280,
+ output_scale_factor=1.0,
+ add_upsample=True,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ ):
+ super().__init__()
+ resnets = []
+ attentions = []
+ temporal_attentions = []
+ self.has_cross_attention = True
+ self.num_attention_heads = num_attention_heads
+ for i in range(num_layers):
+ res_skip_channels = in_channels if i == num_layers - 1 else out_channels
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
+ resnets.append(
+ ResnetBlock3D(
+ in_channels=resnet_in_channels + res_skip_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ if dual_cross_attention:
+ raise NotImplementedError
+ attentions.append(
+ Transformer3DModel(
+ num_attention_heads,
+ out_channels // num_attention_heads,
+ in_channels=out_channels,
+ num_layers=transformer_layers_per_block,
+ cross_attention_dim=cross_attention_dim,
+ norm_num_groups=resnet_groups,
+ use_linear_projection=use_linear_projection,
+ only_cross_attention=only_cross_attention,
+ upcast_attention=upcast_attention,
+ )
+ )
+ temporal_attentions.append(
+ TransformerTemporal(
+ num_attention_heads=8,
+ attention_head_dim=out_channels // 8,
+ in_channels=out_channels,
+ cross_attention_dim=None,
+ )
+ )
+ self.attentions = paddle.nn.LayerList(sublayers=attentions)
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
+ if add_upsample:
+ self.upsamplers = paddle.nn.LayerList(
+ sublayers=[Upsample3D(out_channels, use_conv=True, out_channels=out_channels)]
+ )
+ else:
+ self.upsamplers = None
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states,
+ res_hidden_states_tuple,
+ temb=None,
+ encoder_hidden_states=None,
+ upsample_size=None,
+ cross_attention_kwargs=None,
+ attention_mask=None,
+ enable_temporal_attentions: bool = True,
+ ):
+ for resnet, attn, temporal_attention in zip(self.resnets, self.attentions, self.temporal_attentions):
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+
+ def create_custom_forward(module, return_dict=None):
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
+ hidden_states = recompute(
+ create_custom_forward(attn, return_dict=False),
+ hidden_states,
+ encoder_hidden_states,
+ use_reentrant=False,
+ )[0]
+ if enable_temporal_attentions and temporal_attention is not None:
+ hidden_states = recompute(
+ create_custom_forward(temporal_attention),
+ hidden_states,
+ encoder_hidden_states,
+ use_reentrant=False,
+ )
+ else:
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
+ if enable_temporal_attentions and temporal_attention:
+ hidden_states = temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states, upsample_size)
+ return hidden_states
+
+ def temporal_parameters(self) -> list:
+ output = []
+ for block in self.temporal_attentions:
+ if block:
+ output.extend(block.parameters())
+ return output
+
+
+class UpBlock3D(paddle.nn.Layer):
+ def __init__(
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-06,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor=1.0,
+ add_upsample=True,
+ ):
+ super().__init__()
+ resnets = []
+ temporal_attentions = []
+ for i in range(num_layers):
+ res_skip_channels = in_channels if i == num_layers - 1 else out_channels
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
+ resnets.append(
+ ResnetBlock3D(
+ in_channels=resnet_in_channels + res_skip_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ temporal_attentions.append(
+ TransformerTemporal(
+ num_attention_heads=8,
+ attention_head_dim=out_channels // 8,
+ in_channels=out_channels,
+ cross_attention_dim=None,
+ )
+ )
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
+ self.temporal_attentions = paddle.nn.LayerList(sublayers=temporal_attentions)
+ if add_upsample:
+ self.upsamplers = paddle.nn.LayerList(
+ sublayers=[Upsample3D(out_channels, use_conv=True, out_channels=out_channels)]
+ )
+ else:
+ self.upsamplers = None
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states,
+ res_hidden_states_tuple,
+ temb=None,
+ upsample_size=None,
+ encoder_hidden_states=None,
+ enable_temporal_attentions: bool = True,
+ ):
+ for resnet, temporal_attention in zip(self.resnets, self.temporal_attentions):
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)
+ if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb, use_reentrant=False)
+ if enable_temporal_attentions and temporal_attention is not None:
+ hidden_states = recompute(
+ create_custom_forward(temporal_attention),
+ hidden_states,
+ encoder_hidden_states,
+ use_reentrant=False,
+ )
+ else:
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = (
+ temporal_attention(hidden_states, encoder_hidden_states=encoder_hidden_states)
+ if enable_temporal_attentions and temporal_attention is not None
+ else hidden_states
+ )
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states, upsample_size)
+ return hidden_states
+
+ def temporal_parameters(self) -> list:
+ output = []
+ for block in self.temporal_attentions:
+ if block:
+ output.extend(block.parameters())
+ return output
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2cc31a03a59f05a9bc2b53fb21829b2dbd83cbe
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/__init__.py
@@ -0,0 +1,162 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+import paddle
+import paddle_aux
+
+import ppdiffusers
+
+from .loss_weights import *
+from .noise_conditions import *
+from .samplers import *
+from .scalers import *
+from .schedulers import *
+from .targets import *
+
+
+class GDF:
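+    """
+    Bundles the pieces of a diffusion process (noise schedule, input scaler, prediction target,
+    noise conditioning and loss weighting) behind `diffuse` for training and `undiffuse`/`sample`
+    for generation. `sample` is a generator: it yields `(x0_prediction, x, model_prediction)` at
+    every step and accepts updated settings (e.g. `cfg`, `sampler`) via `generator.send(...)`.
+    """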
+ def __init__(self, schedule, input_scaler, target, noise_cond, loss_weight, offset_noise=0):
+ self.schedule = schedule
+ self.input_scaler = input_scaler
+ self.target = target
+ self.noise_cond = noise_cond
+ self.loss_weight = loss_weight
+ self.offset_noise = offset_noise
+
+ def setup_limits(self, stretch_max=True, stretch_min=True, shift=1):
+ stretched_limits = self.input_scaler.setup_limits(
+ self.schedule, self.input_scaler, stretch_max, stretch_min, shift
+ )
+ return stretched_limits
+
+ def diffuse(self, x0, epsilon=None, t=None, shift=1, loss_shift=1, offset=None):
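+        """
+        Draw a noised training sample.
+
+        Returns `(noised, epsilon, target, logSNR, noise_cond, loss_weight)`, where
+        `noised = a * x0 + b * epsilon` with `a, b` given by the input scaler at the sampled `logSNR`.
+        """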
+ if epsilon is None:
+ epsilon = paddle.randn(shape=x0.shape, dtype=x0.dtype)
+
+ if self.offset_noise > 0:
+ if offset is None:
+ offset = paddle.randn(
+ shape=[x0.shape[0], x0.shape[1]] + [1] * (len(x0.shape) - 2),
+ )
+ epsilon = epsilon + offset * self.offset_noise
+ logSNR = self.schedule(x0.shape[0] if t is None else t, shift=shift)
+ a, b = self.input_scaler(logSNR)
+ if len(a.shape) == 1:
+ a, b = a.reshape([-1, *([1] * (len(x0.shape) - 1))]), b.reshape([-1, *([1] * (len(x0.shape) - 1))])
+ target = self.target(x0, epsilon, logSNR, a, b)
+ return (
+ x0 * a + epsilon * b,
+ epsilon,
+ target,
+ logSNR,
+ self.noise_cond(logSNR),
+ self.loss_weight(logSNR, shift=loss_shift),
+ )
+
+ def undiffuse(self, x, logSNR, pred):
+ a, b = self.input_scaler(logSNR)
+ if len(a.shape) == 1:
+ a, b = a.reshape([-1, *([1] * (len(x.shape) - 1))]), b.reshape([-1, *([1] * (len(x.shape) - 1))])
+ return self.target.x0(x, pred, logSNR, a, b), self.target.epsilon(x, pred, logSNR, a, b)
+
+ def sample(
+ self,
+ model,
+ model_inputs,
+ shape,
+ unconditional_inputs=None,
+ sampler=None,
+ schedule=None,
+ t_start=1.0,
+ t_end=0.0,
+ timesteps=20,
+ x_init=None,
+ cfg=3.0,
+ cfg_t_stop=None,
+ cfg_t_start=None,
+ cfg_rho=0.7,
+ sampler_params=None,
+ shift=1,
+ device="cpu",
+ ):
+ sampler_params = {} if sampler_params is None else sampler_params
+ if sampler is None:
+ sampler = DDPMSampler(self) # noqa
+ r_range = paddle.linspace(start=t_start, stop=t_end, num=timesteps + 1)
+ schedule = self.schedule if schedule is None else schedule
+ logSNR_range = (
+ schedule(r_range, shift=shift)[:, None]
+ .expand(shape=[-1, shape[0] if x_init is None else x_init.shape[0]])
+ .to(device)
+ )
+ x = sampler.init_x(shape).to(device) if x_init is None else x_init.clone()
+ if cfg is not None:
+ if unconditional_inputs is None:
+ unconditional_inputs = {k: paddle.zeros_like(x=v) for k, v in model_inputs.items()}
+ model_inputs = {
+ k: (
+ paddle.concat(x=[v, v_u], axis=0)
+ if isinstance(v, paddle.Tensor)
+ else [
+ (
+ paddle.concat(x=[vi, vi_u], axis=0)
+ if isinstance(vi, paddle.Tensor) and isinstance(vi_u, paddle.Tensor)
+ else None
+ )
+ for vi, vi_u in zip(v, v_u)
+ ]
+ if isinstance(v, list)
+ else {vk: paddle.concat(x=[v[vk], v_u.get(vk, paddle.zeros_like(x=v[vk]))], axis=0) for vk in v}
+ if isinstance(v, dict)
+ else None
+ )
+ for (k, v), (k_u, v_u) in zip(model_inputs.items(), unconditional_inputs.items())
+ }
+ for i in range(0, timesteps):
+ noise_cond = self.noise_cond(logSNR_range[i])
+ if (
+ cfg is not None
+ and (cfg_t_stop is None or r_range[i].item() >= cfg_t_stop)
+ and (cfg_t_start is None or r_range[i].item() <= cfg_t_start)
+ ):
+ cfg_val = cfg
+ if isinstance(cfg_val, (list, tuple)):
+ assert len(cfg_val) == 2, "cfg must be a float or a list/tuple of length 2"
+ cfg_val = cfg_val[0] * r_range[i].item() + cfg_val[1] * (1 - r_range[i].item())
+
+ pred, pred_unconditional = model(
+ paddle.concat(x=[x, x], axis=0), noise_cond.repeat(2), **model_inputs
+ ).chunk(chunks=2)
+
+ pred_cfg = paddle.lerp(pred_unconditional, pred, paddle.to_tensor(cfg_val, dtype=paddle.float32))
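+                # cfg_rho > 0 blends the guided prediction with a copy rescaled to the conditional
+                # branch's standard deviation, taming over-saturation at high guidance scales.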
+ if cfg_rho > 0:
+ std_pos, std_cfg = pred.std(), pred_cfg.std()
+ pred = cfg_rho * (pred_cfg * std_pos / (std_cfg + 1e-9)) + pred_cfg * (1 - cfg_rho)
+ else:
+ pred = pred_cfg
+ else:
+ pred = model(x, noise_cond, **model_inputs)
+
+ x0, epsilon = self.undiffuse(x, logSNR_range[i], pred)
+ x = sampler(x, x0, epsilon, logSNR_range[i], logSNR_range[i + 1], **sampler_params)
+ altered_vars = yield x0, x, pred
+ if altered_vars is not None:
+ cfg = altered_vars.get("cfg", cfg)
+ cfg_rho = altered_vars.get("cfg_rho", cfg_rho)
+ sampler = altered_vars.get("sampler", sampler)
+ model_inputs = altered_vars.get("model_inputs", model_inputs)
+ x = altered_vars.get("x", x)
+ x_init = altered_vars.get("x_init", x_init)
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/loss_weights.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/loss_weights.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2fefb2dd19a63300881b315e085661da7ca16a2
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/loss_weights.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle_aux # noqa
+
+
+class BaseLossWeight:
+ def weight(self, logSNR):
+ raise NotImplementedError("this method needs to be overridden")
+
+ def __call__(self, logSNR, *args, shift=1, clamp_range=None, **kwargs):
+ clamp_range = [-1000000000.0, 1000000000.0] if clamp_range is None else clamp_range
+ if shift != 1:
+ logSNR = logSNR.clone() + 2 * np.log(shift)
+ return self.weight(logSNR, *args, **kwargs).clip(*clamp_range)
+
+
+class ComposedLossWeight(BaseLossWeight):
+ def __init__(self, div, mul):
+ self.mul = [mul] if isinstance(mul, BaseLossWeight) else mul
+ self.div = [div] if isinstance(div, BaseLossWeight) else div
+
+ def weight(self, logSNR):
+ prod, div = 1, 1
+ for m in self.mul:
+ prod *= m.weight(logSNR)
+ for d in self.div:
+ div *= d.weight(logSNR)
+ return prod / div
+
+
+class ConstantLossWeight(BaseLossWeight):
+ def __init__(self, v=1):
+ self.v = v
+
+ def weight(self, logSNR):
+ return paddle.ones_like(x=logSNR) * self.v
+
+
+class SNRLossWeight(BaseLossWeight):
+ def weight(self, logSNR):
+ return logSNR.exp()
+
+
+class P2LossWeight(BaseLossWeight):
+ def __init__(self, k=1.0, gamma=1.0, s=1.0):
+ self.k, self.gamma, self.s = k, gamma, s
+
+ def weight(self, logSNR):
+ return (self.k + (logSNR * self.s).exp()) ** -self.gamma
+
+
+class SNRPlusOneLossWeight(BaseLossWeight):
+ def weight(self, logSNR):
+ return logSNR.exp() + 1
+
+
+class MinSNRLossWeight(BaseLossWeight):
+ def __init__(self, max_snr=5):
+ self.max_snr = max_snr
+
+ def weight(self, logSNR):
+ return logSNR.exp().clip(max=self.max_snr)
+
+
+class MinSNRPlusOneLossWeight(BaseLossWeight):
+ def __init__(self, max_snr=5):
+ self.max_snr = max_snr
+
+ def weight(self, logSNR):
+ return (logSNR.exp() + 1).clip(max=self.max_snr)
+
+
+class TruncatedSNRLossWeight(BaseLossWeight):
+ def __init__(self, min_snr=1):
+ self.min_snr = min_snr
+
+ def weight(self, logSNR):
+ return logSNR.exp().clip(min=self.min_snr)
+
+
+class SechLossWeight(BaseLossWeight):
+ def __init__(self, div=2):
+ self.div = div
+
+ def weight(self, logSNR):
+ return 1 / (logSNR / self.div).cosh()
+
+
+class DebiasedLossWeight(BaseLossWeight):
+ def weight(self, logSNR):
+ return 1 / logSNR.exp().sqrt()
+
+
+class SigmoidLossWeight(BaseLossWeight):
+ def __init__(self, s=1):
+ self.s = s
+
+ def weight(self, logSNR):
+ return (logSNR * self.s).sigmoid()
+
+
+class AdaptiveLossWeight(BaseLossWeight):
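+    """
+    Tracks an exponential moving average of the training loss per logSNR bucket (`update_buckets`)
+    and weights each sample by the inverse of its bucket's average, clipped to `weight_range`.
+    """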
+ def __init__(self, logsnr_range=[-10, 10], buckets=300, weight_range=[1e-07, 10000000.0]):
+ self.bucket_ranges = paddle.linspace(start=logsnr_range[0], stop=logsnr_range[1], num=buckets - 1)
+        self.bucket_losses = paddle.ones(shape=[buckets])
+ self.weight_range = weight_range
+
+ def weight(self, logSNR):
+ indices = paddle.searchsorted(sorted_sequence=self.bucket_ranges.to(logSNR.place), values=logSNR)
+        return (1 / self.bucket_losses.to(logSNR.place)[indices]).clip(*self.weight_range)
+
+ def update_buckets(self, logSNR, loss, beta=0.99):
+ indices = paddle.searchsorted(sorted_sequence=self.bucket_ranges.to(logSNR.place), values=logSNR).cpu()
+ self.bucket_losses[indices] = self.bucket_losses[indices] * beta + loss.detach().cpu() * (1 - beta)
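+
+
+if __name__ == "__main__":
+    # Minimal usage sketch, added for illustration only (not part of the upstream
+    # module): every weighting is called on a tensor of per-sample logSNR values and
+    # optionally clamped.
+    logSNR = paddle.linspace(-6.0, 6.0, 5)
+    print(ConstantLossWeight()(logSNR))
+    print(MinSNRLossWeight(max_snr=5)(logSNR))
+    print(P2LossWeight()(logSNR, clamp_range=[0.0, 100.0]))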
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/scalers.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/scalers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ea70592b8882b8261a52a8e6d2717fb7c28c3cb
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/scalers.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+
+class BaseScaler:
+ def __init__(self):
+ self.stretched_limits = None
+
+ def setup_limits(self, schedule, input_scaler, stretch_max=True, stretch_min=True, shift=1):
+ min_logSNR = schedule(paddle.ones(shape=[1]), shift=shift)
+ max_logSNR = schedule(paddle.zeros(shape=[1]), shift=shift)
+ min_a, max_b = [v.item() for v in input_scaler(min_logSNR)] if stretch_max else [0, 1]
+ max_a, min_b = [v.item() for v in input_scaler(max_logSNR)] if stretch_min else [1, 0]
+ self.stretched_limits = [min_a, max_a, min_b, max_b]
+ return self.stretched_limits
+
+ def stretch_limits(self, a, b):
+ min_a, max_a, min_b, max_b = self.stretched_limits
+ return (a - min_a) / (max_a - min_a), (b - min_b) / (max_b - min_b)
+
+ def scalers(self, logSNR):
+ raise NotImplementedError("this method needs to be overridden")
+
+ def __call__(self, logSNR):
+ a, b = self.scalers(logSNR)
+ if self.stretched_limits is not None:
+ a, b = self.stretch_limits(a, b)
+ return a, b
+
+
+class VPScaler(BaseScaler):
+ def scalers(self, logSNR):
+ a_squared = logSNR.sigmoid()
+ a = a_squared.sqrt()
+ b = (1 - a_squared).sqrt()
+ return a, b
+
+
+class LERPScaler(BaseScaler):
+ def scalers(self, logSNR):
+ _a = logSNR.exp() - 1
+ _a[_a == 0] = 0.001
+ a = 1 + (2 - (2**2 + 4 * _a) ** 0.5) / (2 * _a)
+ b = 1 - a
+ return a, b
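+
+
+if __name__ == "__main__":
+    # Minimal usage sketch, added for illustration only (not part of the upstream
+    # module): the variance-preserving scaler returns coefficients with
+    # a**2 + b**2 == 1 for every logSNR.
+    logSNR = paddle.linspace(-6.0, 6.0, 5)
+    a, b = VPScaler()(logSNR)
+    print(a**2 + b**2)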
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/targets.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/targets.py
new file mode 100644
index 0000000000000000000000000000000000000000..51fb2e2e4601cbff4910892b861f06b2040d6e2d
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/targets.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class EpsilonTarget:
+ def __call__(self, x0, epsilon, logSNR, a, b):
+ return epsilon
+
+ def x0(self, noised, pred, logSNR, a, b):
+ return (noised - pred * b) / a
+
+ def epsilon(self, noised, pred, logSNR, a, b):
+ return pred
+
+
+class X0Target:
+ def __call__(self, x0, epsilon, logSNR, a, b):
+ return x0
+
+ def x0(self, noised, pred, logSNR, a, b):
+ return pred
+
+ def epsilon(self, noised, pred, logSNR, a, b):
+ return (noised - pred * a) / b
+
+
+class VTarget:
+ def __call__(self, x0, epsilon, logSNR, a, b):
+ return a * epsilon - b * x0
+
+ def x0(self, noised, pred, logSNR, a, b):
+ squared_sum = a**2 + b**2
+ return a / squared_sum * noised - b / squared_sum * pred
+
+ def epsilon(self, noised, pred, logSNR, a, b):
+ squared_sum = a**2 + b**2
+ return b / squared_sum * noised + a / squared_sum * pred
+
+
+class RectifiedFlowsTarget:
+ def __call__(self, x0, epsilon, logSNR, a, b):
+ return epsilon - x0
+
+ def x0(self, noised, pred, logSNR, a, b):
+ return noised - pred * b
+
+ def epsilon(self, noised, pred, logSNR, a, b):
+ return noised + pred * a
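+
+
+if __name__ == "__main__":
+    # Minimal usage sketch, added for illustration only (not part of the upstream
+    # module), using plain floats: with the noising convention
+    # noised = a * x0 + b * epsilon, each target recovers x0 and epsilon from its own
+    # prediction type.
+    a, b, x0, eps = 0.8, 0.6, 1.5, -0.3
+    noised = a * x0 + b * eps
+    v_pred = VTarget()(x0, eps, None, a, b)
+    print(VTarget().x0(noised, v_pred, None, a, b))  # ~1.5
+    print(VTarget().epsilon(noised, v_pred, None, a, b))  # ~-0.3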
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..063951a2f34e2da6d2ac9dd82221183876e22354
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .effnet import EfficientNetEncoder
+from .previewer import Previewer
+from .stage_c import AttnBlock, FeedForwardBlock, ResBlock, StageC, TimestepBlock
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/common.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..93724d128cab9e8b7d34438c1ae1f0bc467cc963
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/common.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+
+
+def load(path="../x.npy"):
+ return paddle.to_tensor(np.load(path))
+
+
+def diff(a, b):
+ return (a - b).abs().mean()
+
+
+class Linear(nn.Linear):
+ def reset_parameters(self):
+ return None
+
+
+class Conv2d(nn.Conv2D):
+ def reset_parameters(self):
+ return None
+
+
+class Attention2D(nn.Layer):
+ def __init__(self, c, nhead, dropout=0.0):
+ super().__init__()
+ self.attn = nn.MultiHeadAttention(c, nhead, dropout=dropout)
+
+ def forward(self, x, kv, self_attn=False):
+ orig_shape = x.shape
+ x = x.reshape([x.shape[0], x.shape[1], -1]).transpose([0, 2, 1])
+ if self_attn:
+ kv = paddle.concat([x, kv], axis=1)
+ x = self.attn(x, kv, kv)
+ x = x.transpose([0, 2, 1]).reshape(orig_shape)
+ return x
+
+
+class LayerNorm2d(nn.LayerNorm):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def forward(self, x):
+ return super().forward(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
+
+
+class GlobalResponseNorm(nn.Layer):
+ def __init__(self, dim):
+ super(GlobalResponseNorm, self).__init__()
+ self.gamma = self.create_parameter(
+ shape=[1, 1, 1, dim], default_initializer=paddle.nn.initializer.Constant(value=0.0)
+ )
+ self.beta = self.create_parameter(
+ shape=[1, 1, 1, dim], default_initializer=paddle.nn.initializer.Constant(value=0.0)
+ )
+ self.gamma.stop_gradient = False
+ self.beta.stop_gradient = False
+
+ def forward(self, x):
+ Gx = paddle.norm(x, p=2, axis=(1, 2), keepdim=True)
+ Nx = Gx / (paddle.mean(Gx, axis=-1, keepdim=True) + 1e-6)
+ x = self.gamma * (x * Nx) + self.beta + x
+ return x
+
+
+class ResBlock(nn.Layer):
+ def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0):
+ super().__init__()
+ self.depthwise = Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c)
+ self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06)
+ self.channelwise = nn.Sequential(
+ Linear(c + c_skip, c * 4),
+ nn.GELU(),
+ GlobalResponseNorm(c * 4),
+ nn.Dropout(p=dropout),
+ Linear(c * 4, c),
+ )
+
+ def forward(self, x, x_skip=None):
+ x_res = x
+ x = self.depthwise(x)
+ x = self.norm(x)
+ if x_skip is not None:
+ x = paddle.concat(x=[x, x_skip], axis=1)
+
+ x = self.channelwise(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
+ return x + x_res
+
+
+class AttnBlock(nn.Layer):
+ def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0):
+ super().__init__()
+ self.self_attn = self_attn
+ self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06)
+ self.attention = Attention2D(c, nhead, dropout)
+ self.kv_mapper = nn.Sequential(nn.Silu(), Linear(c_cond, c))
+
+ def forward(self, x, kv):
+ kv = self.kv_mapper(kv)
+ x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn)
+ return x
+
+
+class FeedForwardBlock(nn.Layer):
+ def __init__(self, c, dropout=0.0):
+ super().__init__()
+ self.norm = LayerNorm2d(c, weight_attr=False, bias_attr=False, epsilon=1e-06)
+ self.channelwise = nn.Sequential(
+ Linear(c, c * 4),
+ nn.GELU(),
+ GlobalResponseNorm(c * 4),
+ nn.Dropout(p=dropout),
+ Linear(c * 4, c),
+ )
+
+ def forward(self, x):
+ x = x + self.channelwise(self.norm(x).transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
+ return x
+
+
+class TimestepBlock(nn.Layer):
+ def __init__(self, c, c_timestep, conds=["sca"], trainable=True):
+ super(TimestepBlock, self).__init__()
+ self.mapper = nn.Linear(c_timestep, c * 2, bias_attr=trainable)
+ self.conds = conds
+ for cname in conds:
+ setattr(self, f"mapper_{cname}", nn.Linear(c_timestep, c * 2, bias_attr=trainable))
+
+ def forward(self, x, t):
+ t = paddle.split(t, num_or_sections=len(self.conds) + 1, axis=1)
+ a_b = self.mapper(t[0])
+ a, b = a_b[:, : a_b.shape[1] // 2, None, None], a_b[:, a_b.shape[1] // 2 :, None, None]
+ for i, c in enumerate(self.conds):
+ ac_bc = getattr(self, f"mapper_{c}")(t[i + 1])
+ ac, bc = ac_bc[:, : ac_bc.shape[1] // 2, None, None], ac_bc[:, ac_bc.shape[1] // 2 :, None, None]
+ a, b = a + ac, b + bc
+ return x * (1 + a) + b
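+
+
+if __name__ == "__main__":
+    # Minimal usage sketch, added for illustration only (not part of the upstream
+    # module): TimestepBlock splits the conditioning vector into len(conds) + 1 chunks
+    # and applies the per-channel affine modulation x * (1 + a) + b.
+    block = TimestepBlock(c=8, c_timestep=4, conds=["sca"])
+    x = paddle.randn([1, 8, 4, 4])
+    t = paddle.randn([1, 8])  # one chunk for the timestep, one for "sca"
+    print(block(x, t).shape)  # [1, 8, 4, 4]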
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/efficientnet_v2_s.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/efficientnet_v2_s.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9497b6373f4b8f289fbadc9b318ff4bd14a1741
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/efficientnet_v2_s.py
@@ -0,0 +1,561 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import math
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
+
+import paddle
+import paddle.nn as nn
+from paddle import Tensor
+from paddle.nn import (
+ AdaptiveAvgPool2D,
+ BatchNorm,
+ BatchNorm2D,
+ Conv2D,
+ Dropout,
+ GroupNorm,
+ Layer,
+ Linear,
+ ReLU,
+ Sequential,
+ Sigmoid,
+ Silu,
+)
+from paddle.nn.initializer import Constant, KaimingNormal, Uniform
+from paddle.utils.download import get_weights_path_from_url
+
+__all__ = ["EfficientNet", "EfficientNet_V2_S_Weights", "efficientnet_v2_s"]
+
+
+class SqueezeExcitation(paddle.nn.Layer):
+ """
+ This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).
+ Parameters ``activation`` and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3.
+
+ Args:
+ input_channels (int): Number of channels in the input feature maps
+ squeeze_channels (int): Number of squeeze channels
+ activation (Callable[[Tensor], Tensor], optional): ``delta`` activation. Default: ReLU
+ scale_activation (Callable[[Tensor], Tensor], optional): ``sigma`` activation. Default: Sigmoid
+ """
+
+ def __init__(
+ self,
+ input_channels: int,
+ squeeze_channels: int,
+ activation: Callable[[Tensor], Tensor] = ReLU(),
+ scale_activation: Callable[[Tensor], Tensor] = Sigmoid(),
+ ) -> None:
+ super(SqueezeExcitation, self).__init__()
+ self.avgpool = AdaptiveAvgPool2D(1)
+ self.fc1 = Conv2D(in_channels=input_channels, out_channels=squeeze_channels, kernel_size=1)
+ self.fc2 = Conv2D(in_channels=squeeze_channels, out_channels=input_channels, kernel_size=1)
+ self.activation = activation
+ self.scale_activation = scale_activation
+
+ def forward(self, input: paddle.Tensor) -> paddle.Tensor:
+ scale = self.avgpool(input)
+ scale = self.fc1(scale)
+ scale = self.activation(scale)
+ scale = self.fc2(scale)
+ scale = self.scale_activation(scale)
+ return scale * input
+
+
+def stochastic_depth(input, p, mode, training=True):
+ """
+    Implements the Stochastic Depth from `"Deep Networks with Stochastic Depth"
+    <https://arxiv.org/abs/1603.09382>`_ used for randomly dropping residual
+    branches of residual architectures.
+
+ Args:
+ input (paddle.Tensor): The input tensor or arbitrary dimensions with the first one
+ being its batch i.e. a batch with ``N`` rows.
+ p (float): probability of the input to be zeroed.
+ mode (str): ``"batch"`` or ``"row"``.
+ ``"batch"`` randomly zeroes the entire input, ``"row"`` zeroes
+ randomly selected rows from the batch.
+ training (bool): apply stochastic depth if is ``True``. Default: ``True``
+
+ Returns:
+ paddle.Tensor: The randomly zeroed tensor.
+ """
+ if p < 0.0 or p > 1.0:
+ raise ValueError(f"drop probability has to be between 0 and 1, but got {p}")
+ if mode not in ["batch", "row"]:
+ raise ValueError(f"mode has to be either 'batch' or 'row', but got {mode}")
+ if not training or p == 0.0:
+ return input
+
+ survival_rate = 1.0 - p
+ if mode == "row":
+ size = [input.shape[0]] + [1] * (input.ndim - 1)
+ else:
+ size = [1] * input.ndim
+ noise = paddle.empty(size, dtype=input.dtype)
+ survival_rate = paddle.to_tensor(survival_rate, dtype=input.dtype)
+ paddle.assign(paddle.bernoulli(paddle.broadcast_to(survival_rate, noise.shape)), noise)
+ if survival_rate > 0.0:
+ noise /= survival_rate
+ return input * noise
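+
+
+# Example (illustrative): stochastic_depth(x, p=0.25, mode="row") zeroes, on average, a
+# quarter of the samples' residual branches during training and rescales the surviving
+# rows by 1 / 0.75 so the expected value is preserved; with training=False it is an
+# identity.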
+
+
+class StochasticDepth(Layer):
+ """
+ See :func:`stochastic_depth`.
+ """
+
+ def __init__(self, p: float, mode: str) -> None:
+ super(StochasticDepth, self).__init__()
+ self.p = p
+ self.mode = mode
+
+ def forward(self, input):
+ return stochastic_depth(input, self.p, self.mode, self.training)
+
+ def __repr__(self):
+ s = f"{self.__class__.__name__}(p={self.p}, mode={self.mode})"
+ return s
+
+
+def _make_ntuple(value, n):
+ """Helper function to create a tuple of size n with the given value."""
+ if isinstance(value, int):
+ return (value,) * n
+ return value
+
+
+class ConvNormActivation(Sequential):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Sequence[int]] = 3,
+ stride: Union[int, Sequence[int]] = 1,
+ padding: Optional[Union[int, Sequence[int], str]] = None,
+ groups: int = 1,
+ norm_layer: Optional[Callable[..., paddle.nn.Layer]] = BatchNorm,
+ activation_layer: Optional[Callable[..., paddle.nn.Layer]] = ReLU,
+ dilation: Union[int, Sequence[int]] = 1,
+ inplace: Optional[bool] = True,
+ bias: Optional[bool] = None,
+ conv_layer: Callable[..., Conv2D] = Conv2D,
+ ) -> None:
+ if padding is None:
+ padding = (kernel_size - 1) // 2 * dilation
+ else:
+            padding = _make_ntuple(padding, 2 if isinstance(kernel_size, int) else len(kernel_size))
+
+ layers = [
+ conv_layer(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride,
+ padding,
+ dilation=dilation,
+ groups=groups,
+ bias_attr=False if bias is None else bias,
+ )
+ ]
+
+ if norm_layer is not None:
+ norm_layer_instance = norm_layer(out_channels, use_global_stats=True)
+ layers.append(norm_layer_instance)
+
+ if activation_layer is not None:
+ layers.append(activation_layer)
+
+ super(ConvNormActivation, self).__init__(*layers)
+ self.out_channels = out_channels
+
+
+class Conv2DNormActivation(ConvNormActivation):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]] = 3,
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Optional[Union[int, Tuple[int, int], str]] = None,
+ groups: int = 1,
+ norm_layer: Optional[Callable[..., paddle.nn.Layer]] = BatchNorm,
+ activation_layer: Optional[Callable[..., paddle.nn.Layer]] = ReLU,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ inplace: Optional[bool] = True,
+ bias: Optional[bool] = None,
+ ) -> None:
+ super().__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride,
+ padding,
+ groups,
+ norm_layer,
+ activation_layer,
+ dilation,
+ inplace,
+ bias,
+ Conv2D,
+ )
+
+
+class EfficientNet_V2_S_Weights:
+ IMAGENET1K_V1 = "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth"
+
+ def __init__(self, url: str, transforms: Callable[..., Any], meta: Dict[str, Any]) -> None:
+ self.url = url
+ self.transforms = transforms
+ self.meta = meta
+
+ def state_dict(self, progress: bool = True, check_hash: bool = False) -> Dict[str, Any]:
+        path = get_weights_path_from_url(self.url)
+ return paddle.load(path)
+
+ @classmethod
+ def verify(cls, weights):
+ if weights is None:
+ return None
+ if not isinstance(weights, EfficientNet_V2_S_Weights):
+ raise ValueError(f"weights must be an instance of EfficientNet_V2_S_Weights, but got {type(weights)}")
+ return weights
+
+
+@dataclass
+class _MBConvConfig:
+ expand_ratio: float
+ kernel: int
+ stride: int
+ input_channels: int
+ out_channels: int
+ num_layers: int
+ block: Callable[..., paddle.nn.Layer]
+
+ @staticmethod
+ def adjust_channels(channels: int, width_mult: float, min_value: Optional[int] = None) -> int:
+ return _make_divisible(channels * width_mult, 8, min_value)
+
+
+class MBConvConfig(_MBConvConfig):
+ def __init__(
+ self,
+ expand_ratio: float,
+ kernel: int,
+ stride: int,
+ input_channels: int,
+ out_channels: int,
+ num_layers: int,
+ width_mult: float = 1.0,
+ depth_mult: float = 1.0,
+ block: Optional[Callable[..., paddle.nn.Layer]] = None,
+ ) -> None:
+ input_channels = self.adjust_channels(input_channels, width_mult)
+ out_channels = self.adjust_channels(out_channels, width_mult)
+ num_layers = self.adjust_depth(num_layers, depth_mult)
+ if block is None:
+ block = MBConv
+ super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block)
+
+ @staticmethod
+ def adjust_depth(num_layers: int, depth_mult: float):
+ return int(math.ceil(num_layers * depth_mult))
+
+
+class FusedMBConvConfig(_MBConvConfig):
+ def __init__(
+ self,
+ expand_ratio: float,
+ kernel: int,
+ stride: int,
+ input_channels: int,
+ out_channels: int,
+ num_layers: int,
+ block: Optional[Callable[..., paddle.nn.Layer]] = None,
+ ) -> None:
+ if block is None:
+ block = FusedMBConv
+ super().__init__(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, block)
+
+
+class MBConv(Layer):
+ def __init__(
+ self,
+ cnf,
+ stochastic_depth_prob: float,
+ norm_layer: Callable[..., Layer],
+ se_layer: Callable[..., Layer] = SqueezeExcitation,
+ ) -> None:
+ super(MBConv, self).__init__()
+
+ if not (1 <= cnf.stride <= 2):
+ raise ValueError("illegal stride value")
+
+ self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
+
+ layers = []
+ activation_layer = nn.Silu()
+
+ # expand
+ expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
+ if expanded_channels != cnf.input_channels:
+ layers.append(
+ Conv2DNormActivation(
+ cnf.input_channels,
+ expanded_channels,
+ kernel_size=1,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer,
+ )
+ )
+
+ # depthwise
+ layers.append(
+ Conv2DNormActivation(
+ expanded_channels,
+ expanded_channels,
+ kernel_size=cnf.kernel,
+ stride=cnf.stride,
+ groups=expanded_channels,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer,
+ )
+ )
+
+ # squeeze and excitation
+ squeeze_channels = max(1, cnf.input_channels // 4)
+ layers.append(se_layer(expanded_channels, squeeze_channels, activation=nn.Silu()))
+
+ # project
+ layers.append(
+ Conv2DNormActivation(
+ expanded_channels, cnf.out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=None
+ )
+ )
+
+ self.block = Sequential(*layers)
+ self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+ self.out_channels = cnf.out_channels
+
+ def forward(self, input) -> paddle.Tensor:
+ result = self.block(input)
+ if self.use_res_connect:
+ result = self.stochastic_depth(result)
+ result += input
+ return result
+
+
+class FusedMBConv(Layer):
+ def __init__(
+ self,
+ cnf: "FusedMBConvConfig",
+ stochastic_depth_prob: float,
+ norm_layer: Callable[..., Layer],
+ ) -> None:
+ super(FusedMBConv, self).__init__()
+
+ if not (1 <= cnf.stride <= 2):
+ raise ValueError("illegal stride value")
+
+ self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels
+
+ layers: List[Layer] = []
+ activation_layer = nn.Silu()
+
+ expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
+ if expanded_channels != cnf.input_channels:
+ # fused expand and project
+ layers.append(
+ Conv2DNormActivation(
+ cnf.input_channels,
+ expanded_channels,
+ kernel_size=cnf.kernel,
+ stride=cnf.stride,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer,
+ )
+ )
+ # project
+ layers.append(
+ Conv2DNormActivation(
+ expanded_channels,
+ cnf.out_channels,
+ kernel_size=1,
+ norm_layer=norm_layer,
+ activation_layer=None,
+ )
+ )
+ else:
+ layers.append(
+ Conv2DNormActivation(
+ cnf.input_channels,
+ cnf.out_channels,
+ kernel_size=cnf.kernel,
+ stride=cnf.stride,
+ norm_layer=norm_layer,
+ activation_layer=activation_layer,
+ )
+ )
+
+ self.block = Sequential(*layers)
+ self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+ self.out_channels = cnf.out_channels
+
+ def forward(self, input: Tensor) -> Tensor:
+ result = self.block(input)
+ if self.use_res_connect:
+ result = self.stochastic_depth(result)
+ result += input
+ return result
+
+
+class EfficientNet(Layer):
+ def __init__(
+ self,
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
+ dropout: float,
+ stochastic_depth_prob: float = 0.2,
+ num_classes: int = 1000,
+ norm_layer: Optional[Callable[..., paddle.nn.Layer]] = None,
+ last_channel: Optional[int] = None,
+ ) -> None:
+ super().__init__()
+ if not inverted_residual_setting:
+ raise ValueError("The inverted_residual_setting should not be empty")
+ elif not (
+ isinstance(inverted_residual_setting, Sequence)
+ and all([isinstance(s, _MBConvConfig) for s in inverted_residual_setting])
+ ):
+ raise TypeError("The inverted_residual_setting should be List[MBConvConfig]")
+ if norm_layer is None:
+ norm_layer = BatchNorm2D
+ layers: List[paddle.nn.Layer] = []
+ firstconv_output_channels = inverted_residual_setting[0].input_channels
+ layers.append(
+ Conv2DNormActivation(
+ 3, firstconv_output_channels, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=Silu()
+ )
+ )
+ total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting)
+ stage_block_id = 0
+ for cnf in inverted_residual_setting:
+ stage: List[paddle.nn.Layer] = []
+ for _ in range(cnf.num_layers):
+ block_cnf = copy.copy(cnf)
+ if stage:
+ block_cnf.input_channels = block_cnf.out_channels
+ block_cnf.stride = 1
+ sd_prob = stochastic_depth_prob * float(stage_block_id) / total_stage_blocks
+ stage.append(block_cnf.block(block_cnf, sd_prob, norm_layer))
+ stage_block_id += 1
+ layers.append(Sequential(*stage))
+ lastconv_input_channels = inverted_residual_setting[-1].out_channels
+ lastconv_output_channels = last_channel if last_channel is not None else 4 * lastconv_input_channels
+ layers.append(
+ Conv2DNormActivation(
+ lastconv_input_channels,
+ lastconv_output_channels,
+ kernel_size=1,
+ norm_layer=norm_layer,
+ activation_layer=Silu(),
+ )
+ )
+ self.features = Sequential(*layers)
+ self.avgpool = AdaptiveAvgPool2D(output_size=1)
+ self.classifier = Sequential(
+ Dropout(p=dropout), Linear(in_features=lastconv_output_channels, out_features=num_classes)
+ )
+
+ for m in self.sublayers():
+ if isinstance(m, Conv2D):
+ KaimingNormal()(m.weight)
+ if m.bias is not None:
+ Constant(value=0.0)(m.bias)
+ elif isinstance(m, (BatchNorm2D, GroupNorm)):
+ Constant(value=1.0)(m.weight)
+ Constant(value=0.0)(m.bias)
+ elif isinstance(m, Linear):
+ init_range = 1.0 / math.sqrt(m.weight.shape[1])
+ Uniform(low=-init_range, high=init_range)(m.weight)
+ Constant(value=0.0)(m.bias)
+
+ def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+ x = self.features(x)
+ x = self.avgpool(x)
+ x = paddle.flatten(x=x, start_axis=1)
+ x = self.classifier(x)
+ return x
+
+
+def _make_divisible(value: float, divisor: int, min_value: Optional[int] = None) -> int:
+ if min_value is None:
+ min_value = divisor
+ new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
+ if new_value < 0.9 * value:
+ new_value += divisor
+ return new_value
+
+
+def _efficientnet(
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
+ dropout: float,
+ last_channel: Optional[int],
+ weights: Optional[EfficientNet_V2_S_Weights],
+ progress: bool,
+ **kwargs: Any
+) -> EfficientNet:
+ if weights is not None:
+ kwargs["num_classes"] = len(weights.meta["categories"])
+ model = EfficientNet(inverted_residual_setting, dropout, last_channel=last_channel, **kwargs)
+ if weights is not None:
+ model.set_state_dict(weights.state_dict(progress=progress, check_hash=True))
+ return model
+
+
+def _efficientnet_conf(
+ arch: str, **kwargs: Any
+) -> Tuple[Sequence[Union[MBConvConfig, FusedMBConvConfig]], Optional[int]]:
+ inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]]
+ if arch.startswith("efficientnet_v2_s"):
+ inverted_residual_setting = [
+ FusedMBConvConfig(1, 3, 1, 24, 24, 2),
+ FusedMBConvConfig(4, 3, 2, 24, 48, 4),
+ FusedMBConvConfig(4, 3, 2, 48, 64, 4),
+ MBConvConfig(4, 3, 2, 64, 128, 6),
+ MBConvConfig(6, 3, 1, 128, 160, 9),
+ MBConvConfig(6, 3, 2, 160, 256, 15),
+ ]
+ last_channel = 1280
+ else:
+ raise ValueError(f"Unsupported model type {arch}")
+ return inverted_residual_setting, last_channel
+
+
+def efficientnet_v2_s(
+ *, weights: Optional[EfficientNet_V2_S_Weights] = None, progress: bool = True, **kwargs: Any
+) -> EfficientNet:
+ weights = EfficientNet_V2_S_Weights.verify(weights)
+ inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_s")
+ return _efficientnet(
+ inverted_residual_setting,
+ kwargs.pop("dropout", 0.2),
+ last_channel,
+ weights,
+ progress,
+ norm_layer=partial(BatchNorm2D, epsilon=0.001),
+ **kwargs,
+ )
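+
+
+# Note added for illustration (not in the upstream file): with the configuration above,
+# `efficientnet_v2_s().features` maps an [N, 3, H, W] input to [N, 1280, H / 32, W / 32];
+# the Stable Cascade encoder below only consumes `.features`, so the classifier head and
+# the optional pretrained-weight path are unused there.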
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/effnet.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/effnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..84bc3fb0f907f802a807e51102ecd6bbba7ea338
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/effnet.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .efficientnet_v2_s import efficientnet_v2_s
+
+
+class BatchNorm2D(nn.Layer):
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True):
+ super(BatchNorm2D, self).__init__()
+ self.num_features = num_features
+ self.eps = eps
+ self.momentum = momentum
+ self.affine = affine
+ self.track_running_stats = track_running_stats
+
+ if self.affine:
+ self.weight = self.create_parameter(
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=1.0)
+ )
+ self.bias = self.create_parameter(
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=0.0)
+ )
+ else:
+ self.weight = None
+ self.bias = None
+
+ if self.track_running_stats:
+ self._mean = self.create_parameter(
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=0.0), is_bias=False
+ )
+ self._variance = self.create_parameter(
+ shape=[num_features], default_initializer=paddle.nn.initializer.Constant(value=1.0), is_bias=False
+ )
+ self._mean.stop_gradient = True
+ self._variance.stop_gradient = True
+ else:
+ self._mean = None
+ self._variance = None
+
+ def forward(self, input):
+ mean = self._mean
+ variance = self._variance
+
+ output = (input - paddle.unsqueeze(mean, axis=[0, 2, 3])) / paddle.unsqueeze(
+ paddle.sqrt(variance + self.eps), axis=[0, 2, 3]
+ )
+ if self.affine:
+ output = output * paddle.unsqueeze(self.weight, axis=[0, 2, 3]) + paddle.unsqueeze(
+ self.bias, axis=[0, 2, 3]
+ )
+ return output
+
+
+class EfficientNetEncoder(nn.Layer):
+ def __init__(self, c_latent=16):
+ super().__init__()
+ self.backbone = efficientnet_v2_s().features
+ self.backbone.eval()
+ self.mapper = nn.Sequential(
+ nn.Conv2D(1280, c_latent, kernel_size=1, bias_attr=False),
+ BatchNorm2D(c_latent, affine=False),
+ )
+ self.mapper.eval()
+
+ def forward(self, x):
+ x = self.backbone(x)
+ x = self.mapper(x)
+ return x
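+
+
+# Note added for illustration (not in the upstream file): given the EfficientNetV2-S
+# backbone configuration, the encoder turns an [N, 3, H, W] image into an
+# [N, c_latent, H / 32, W / 32] latent, e.g. a 768x768 crop becomes a 16x24x24 latent
+# with the default c_latent=16.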
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/previewer.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/previewer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d26ef68dd319d993bf3bc51881441fb657170a62
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/previewer.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+
+class Previewer(paddle.nn.Layer):
+ def __init__(self, c_in=16, c_hidden=512, c_out=3):
+ super().__init__()
+ self.blocks = paddle.nn.Sequential(
+ paddle.nn.Conv2D(in_channels=c_in, out_channels=c_hidden, kernel_size=1),
+ paddle.nn.GELU(),
+ paddle.nn.BatchNorm2D(num_features=c_hidden),
+ paddle.nn.Conv2D(in_channels=c_hidden, out_channels=c_hidden, kernel_size=3, padding=1),
+ paddle.nn.GELU(),
+ paddle.nn.BatchNorm2D(num_features=c_hidden),
+ paddle.nn.Conv2DTranspose(
+ in_channels=c_hidden,
+ out_channels=c_hidden // 2,
+ kernel_size=2,
+ stride=2,
+ ),
+ paddle.nn.GELU(),
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 2),
+ paddle.nn.Conv2D(
+ in_channels=c_hidden // 2,
+ out_channels=c_hidden // 2,
+ kernel_size=3,
+ padding=1,
+ ),
+ paddle.nn.GELU(),
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 2),
+ paddle.nn.Conv2DTranspose(
+ in_channels=c_hidden // 2,
+ out_channels=c_hidden // 4,
+ kernel_size=2,
+ stride=2,
+ ),
+ paddle.nn.GELU(),
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
+ paddle.nn.Conv2D(
+ in_channels=c_hidden // 4,
+ out_channels=c_hidden // 4,
+ kernel_size=3,
+ padding=1,
+ ),
+ paddle.nn.GELU(),
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
+ paddle.nn.Conv2DTranspose(
+ in_channels=c_hidden // 4,
+ out_channels=c_hidden // 4,
+ kernel_size=2,
+ stride=2,
+ ),
+ paddle.nn.GELU(),
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
+ paddle.nn.Conv2D(
+ in_channels=c_hidden // 4,
+ out_channels=c_hidden // 4,
+ kernel_size=3,
+ padding=1,
+ ),
+ paddle.nn.GELU(),
+ paddle.nn.BatchNorm2D(num_features=c_hidden // 4),
+ paddle.nn.Conv2D(in_channels=c_hidden // 4, out_channels=c_out, kernel_size=1),
+ )
+
+ def forward(self, x):
+ return self.blocks(x)
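+
+
+# Note added for illustration (not in the upstream file): the three stride-2 transposed
+# convolutions upsample by a factor of 8 overall, so an [N, 16, 24, 24] latent is
+# previewed as an [N, 3, 192, 192] RGB image.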
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_a.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_a.py
new file mode 100644
index 0000000000000000000000000000000000000000..24861c58f4ddf14f4ac88af18d6d8d59f6f6edc6
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_a.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from torchtools.nn import VectorQuantize
+
+
+class ResBlock(paddle.nn.Layer):
+ def __init__(self, c, c_hidden):
+ super().__init__()
+ self.norm1 = paddle.nn.LayerNorm(normalized_shape=c, weight_attr=False, bias_attr=False, epsilon=1e-06)
+ self.depthwise = paddle.nn.Sequential(
+ paddle.nn.Pad2D(padding=1, mode="replicate"),
+ paddle.nn.Conv2D(in_channels=c, out_channels=c, kernel_size=3, groups=c),
+ )
+ self.norm2 = paddle.nn.LayerNorm(normalized_shape=c, weight_attr=False, bias_attr=False, epsilon=1e-06)
+ self.channelwise = paddle.nn.Sequential(
+ paddle.nn.Linear(in_features=c, out_features=c_hidden),
+ paddle.nn.GELU(),
+ paddle.nn.Linear(in_features=c_hidden, out_features=c),
+ )
+        gammas = paddle.create_parameter(
+            shape=[6],
+            dtype=paddle.get_default_dtype(),
+            default_initializer=paddle.nn.initializer.Constant(value=0.0),
+        )
+        gammas.stop_gradient = False
+        self.gammas = gammas
+
+ def _basic_init(module):
+ if isinstance(module, paddle.nn.Linear) or isinstance(module, paddle.nn.Conv2D):
+ init_XavierUniform = paddle.nn.initializer.XavierUniform()
+ init_XavierUniform(module.weight)
+ if module.bias is not None:
+ init_Constant = paddle.nn.initializer.Constant(value=0)
+ init_Constant(module.bias)
+
+ self.apply(_basic_init)
+
+ def _norm(self, x, norm):
+ return norm(x.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
+
+ def forward(self, x):
+ mods = self.gammas
+ x_temp = self._norm(x, self.norm1) * (1 + mods[0]) + mods[1]
+ x = x + self.depthwise(x_temp) * mods[2]
+ x_temp = self._norm(x, self.norm2) * (1 + mods[3]) + mods[4]
+ x = x + self.channelwise(x_temp.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2]) * mods[5]
+ return x
+
+
+class StageA(paddle.nn.Layer):
+ def __init__(
+ self,
+ levels=2,
+ bottleneck_blocks=12,
+ c_hidden=384,
+ c_latent=4,
+ codebook_size=8192,
+ scale_factor=0.43,
+ ):
+ super().__init__()
+ self.c_latent = c_latent
+ self.scale_factor = scale_factor
+ c_levels = [(c_hidden // 2**i) for i in reversed(range(levels))]
+ self.in_block = paddle.nn.Sequential(
+ paddle.nn.PixelUnshuffle(downscale_factor=2),
+ paddle.nn.Conv2D(in_channels=3 * 4, out_channels=c_levels[0], kernel_size=1),
+ )
+ down_blocks = []
+ for i in range(levels):
+ if i > 0:
+ down_blocks.append(
+ paddle.nn.Conv2D(
+ in_channels=c_levels[i - 1],
+ out_channels=c_levels[i],
+ kernel_size=4,
+ stride=2,
+ padding=1,
+ )
+ )
+ block = ResBlock(c_levels[i], c_levels[i] * 4)
+ down_blocks.append(block)
+ down_blocks.append(
+ paddle.nn.Sequential(
+ paddle.nn.Conv2D(
+ in_channels=c_levels[-1],
+ out_channels=c_latent,
+ kernel_size=1,
+ bias_attr=False,
+ ),
+ paddle.nn.BatchNorm2D(num_features=c_latent),
+ )
+ )
+ self.down_blocks = paddle.nn.Sequential(*down_blocks)
+ self.codebook_size = codebook_size
+ self.vquantizer = VectorQuantize(c_latent, k=codebook_size)
+ up_blocks = [
+ paddle.nn.Sequential(paddle.nn.Conv2D(in_channels=c_latent, out_channels=c_levels[-1], kernel_size=1))
+ ]
+ for i in range(levels):
+ for j in range(bottleneck_blocks if i == 0 else 1):
+ block = ResBlock(c_levels[levels - 1 - i], c_levels[levels - 1 - i] * 4)
+ up_blocks.append(block)
+ if i < levels - 1:
+ up_blocks.append(
+ paddle.nn.Conv2DTranspose(
+ in_channels=c_levels[levels - 1 - i],
+ out_channels=c_levels[levels - 2 - i],
+ kernel_size=4,
+ stride=2,
+ padding=1,
+ )
+ )
+ self.up_blocks = paddle.nn.Sequential(*up_blocks)
+ self.out_block = paddle.nn.Sequential(
+ paddle.nn.Conv2D(in_channels=c_levels[0], out_channels=3 * 4, kernel_size=1),
+ paddle.nn.PixelShuffle(upscale_factor=2),
+ )
+
+ def encode(self, x, quantize=False):
+ x = self.in_block(x)
+ x = self.down_blocks(x)
+ if quantize:
+ qe, (vq_loss, commit_loss), indices = self.vquantizer.forward(x, dim=1)
+ return (
+ qe / self.scale_factor,
+ x / self.scale_factor,
+ indices,
+ vq_loss + commit_loss * 0.25,
+ )
+ else:
+ return x / self.scale_factor, None, None, None
+
+ def decode(self, x):
+ x = x * self.scale_factor
+ x = self.up_blocks(x)
+ x = self.out_block(x)
+ return x
+
+ def forward(self, x, quantize=False):
+ qe, x, _, vq_loss = self.encode(x, quantize)
+ x = self.decode(qe)
+ return x, vq_loss
+
+
+class Discriminator(paddle.nn.Layer):
+ def __init__(self, c_in=3, c_cond=0, c_hidden=512, depth=6):
+ super().__init__()
+ d = max(depth - 3, 3)
+ layers = [
+ paddle.nn.utils.spectral_norm(
+ layer=paddle.nn.Conv2D(
+ in_channels=c_in,
+ out_channels=c_hidden // 2**d,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ )
+ ),
+ paddle.nn.LeakyReLU(negative_slope=0.2),
+ ]
+ for i in range(depth - 1):
+ c_in = c_hidden // 2 ** max(d - i, 0)
+ c_out = c_hidden // 2 ** max(d - 1 - i, 0)
+ layers.append(
+ paddle.nn.utils.spectral_norm(
+ layer=paddle.nn.Conv2D(
+ in_channels=c_in,
+ out_channels=c_out,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ )
+ )
+ )
+ layers.append(paddle.nn.InstanceNorm2D(num_features=c_out, momentum=1 - 0.1))
+ layers.append(paddle.nn.LeakyReLU(negative_slope=0.2))
+ self.encoder = paddle.nn.Sequential(*layers)
+ self.shuffle = paddle.nn.Conv2D(
+ in_channels=c_hidden + c_cond if c_cond > 0 else c_hidden,
+ out_channels=1,
+ kernel_size=1,
+ )
+ self.logits = paddle.nn.Sigmoid()
+
+ def forward(self, x, cond=None):
+ x = self.encoder(x)
+ if cond is not None:
+ cond = cond.reshape([cond.shape[0], cond.shape[1], 1, 1]).expand(shape=[-1, -1, x.shape[-2], x.shape[-1]])
+ x = paddle.concat(x=[x, cond], axis=1)
+ x = self.shuffle(x)
+ x = self.logits(x)
+ return x
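+
+
+# Note added for illustration (not in the upstream file): with the default levels=2,
+# StageA's encoder applies PixelUnshuffle(2) plus one stride-2 convolution (a 4x spatial
+# downsample), so an [N, 3, H, W] image becomes an [N, 4, H / 4, W / 4] latent scaled by
+# 1 / scale_factor; decode() reverses the path through the up blocks and PixelShuffle(2).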
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_b.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_b.py
new file mode 100644
index 0000000000000000000000000000000000000000..34a9fd7abc8b43658437d367b56ef064dab746fc
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_b.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import numpy as np
+import paddle
+import paddle_aux # noqa
+
+from .common import AttnBlock, FeedForwardBlock, LayerNorm2d, ResBlock, TimestepBlock
+
+
+class StageB(paddle.nn.Layer):
+ def __init__(
+ self,
+ c_in=4,
+ c_out=4,
+ c_r=64,
+ patch_size=2,
+ c_cond=1280,
+ c_hidden=[320, 640, 1280, 1280],
+ nhead=[-1, -1, 20, 20],
+ blocks=[[2, 6, 28, 6], [6, 28, 6, 2]],
+ block_repeat=[[1, 1, 1, 1], [3, 3, 2, 2]],
+ level_config=["CT", "CT", "CTA", "CTA"],
+ c_clip=1280,
+ c_clip_seq=4,
+ c_effnet=16,
+ c_pixels=3,
+ kernel_size=3,
+ dropout=[0, 0, 0.1, 0.1],
+ self_attn=True,
+ t_conds=["sca"],
+ ):
+ super().__init__()
+ self.c_r = c_r
+ self.t_conds = t_conds
+ self.c_clip_seq = c_clip_seq
+ if not isinstance(dropout, list):
+ dropout = [dropout] * len(c_hidden)
+ if not isinstance(self_attn, list):
+ self_attn = [self_attn] * len(c_hidden)
+ self.effnet_mapper = paddle.nn.Sequential(
+ paddle.nn.Conv2D(in_channels=c_effnet, out_channels=c_hidden[0] * 4, kernel_size=1),
+ paddle.nn.GELU(),
+ paddle.nn.Conv2D(in_channels=c_hidden[0] * 4, out_channels=c_hidden[0], kernel_size=1),
+ LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-06),
+ )
+ self.pixels_mapper = paddle.nn.Sequential(
+ paddle.nn.Conv2D(in_channels=c_pixels, out_channels=c_hidden[0] * 4, kernel_size=1),
+ paddle.nn.GELU(),
+ paddle.nn.Conv2D(in_channels=c_hidden[0] * 4, out_channels=c_hidden[0], kernel_size=1),
+ LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-06),
+ )
+ self.clip_mapper = paddle.nn.Linear(in_features=c_clip, out_features=c_cond * c_clip_seq)
+ self.clip_norm = paddle.nn.LayerNorm(
+ normalized_shape=c_cond, weight_attr=False, bias_attr=False, epsilon=1e-06
+ )
+ self.embedding = paddle.nn.Sequential(
+ paddle.nn.PixelUnshuffle(downscale_factor=patch_size),
+ paddle.nn.Conv2D(
+ in_channels=c_in * patch_size**2,
+ out_channels=c_hidden[0],
+ kernel_size=1,
+ ),
+ LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-06),
+ )
+
+ def get_block(block_type, c_hidden, nhead, c_skip=0, dropout=0, self_attn=True):
+ if block_type == "C":
+ return ResBlock(c_hidden, c_skip, kernel_size=kernel_size, dropout=dropout)
+ elif block_type == "A":
+ return AttnBlock(c_hidden, c_cond, nhead, self_attn=self_attn, dropout=dropout)
+ elif block_type == "F":
+ return FeedForwardBlock(c_hidden, dropout=dropout)
+ elif block_type == "T":
+ return TimestepBlock(c_hidden, c_r, conds=t_conds)
+ else:
+ raise Exception(f"Block type {block_type} not supported")
+
+ self.down_blocks = paddle.nn.LayerList()
+ self.down_downscalers = paddle.nn.LayerList()
+ self.down_repeat_mappers = paddle.nn.LayerList()
+ for i in range(len(c_hidden)):
+ if i > 0:
+ self.down_downscalers.append(
+ paddle.nn.Sequential(
+ LayerNorm2d(
+ c_hidden[i - 1],
+ weight_attr=False,
+ bias_attr=False,
+ epsilon=1e-06,
+ ),
+ paddle.nn.Conv2D(
+ in_channels=c_hidden[i - 1],
+ out_channels=c_hidden[i],
+ kernel_size=2,
+ stride=2,
+ ),
+ )
+ )
+ else:
+ self.down_downscalers.append(paddle.nn.Identity())
+ down_block = paddle.nn.LayerList()
+ for _ in range(blocks[0][i]):
+ for block_type in level_config[i]:
+ block = get_block(
+ block_type,
+ c_hidden[i],
+ nhead[i],
+ dropout=dropout[i],
+ self_attn=self_attn[i],
+ )
+ down_block.append(block)
+ self.down_blocks.append(down_block)
+ if block_repeat is not None:
+ block_repeat_mappers = paddle.nn.LayerList()
+ for _ in range(block_repeat[0][i] - 1):
+ block_repeat_mappers.append(
+ paddle.nn.Conv2D(
+ in_channels=c_hidden[i],
+ out_channels=c_hidden[i],
+ kernel_size=1,
+ )
+ )
+ self.down_repeat_mappers.append(block_repeat_mappers)
+ self.up_blocks = paddle.nn.LayerList()
+ self.up_upscalers = paddle.nn.LayerList()
+ self.up_repeat_mappers = paddle.nn.LayerList()
+ for i in reversed(range(len(c_hidden))):
+ if i > 0:
+ self.up_upscalers.append(
+ paddle.nn.Sequential(
+ LayerNorm2d(
+ c_hidden[i],
+ weight_attr=False,
+ bias_attr=False,
+ epsilon=1e-06,
+ ),
+ paddle.nn.Conv2DTranspose(
+ in_channels=c_hidden[i],
+ out_channels=c_hidden[i - 1],
+ kernel_size=2,
+ stride=2,
+ ),
+ )
+ )
+ else:
+ self.up_upscalers.append(paddle.nn.Identity())
+ up_block = paddle.nn.LayerList()
+ for j in range(blocks[1][::-1][i]):
+ for k, block_type in enumerate(level_config[i]):
+ c_skip = c_hidden[i] if i < len(c_hidden) - 1 and j == k == 0 else 0
+ block = get_block(
+ block_type,
+ c_hidden[i],
+ nhead[i],
+ c_skip=c_skip,
+ dropout=dropout[i],
+ self_attn=self_attn[i],
+ )
+ up_block.append(block)
+ self.up_blocks.append(up_block)
+ if block_repeat is not None:
+ block_repeat_mappers = paddle.nn.LayerList()
+ for _ in range(block_repeat[1][::-1][i] - 1):
+ block_repeat_mappers.append(
+ paddle.nn.Conv2D(
+ in_channels=c_hidden[i],
+ out_channels=c_hidden[i],
+ kernel_size=1,
+ )
+ )
+ self.up_repeat_mappers.append(block_repeat_mappers)
+ self.clf = paddle.nn.Sequential(
+ LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-06),
+ paddle.nn.Conv2D(
+ in_channels=c_hidden[0],
+ out_channels=c_out * patch_size**2,
+ kernel_size=1,
+ ),
+ paddle.nn.PixelShuffle(upscale_factor=patch_size),
+ )
+ self.apply(self._init_weights)
+ init_Normal = paddle.nn.initializer.Normal(std=0.02)
+ init_Normal(self.clip_mapper.weight)
+ init_Normal = paddle.nn.initializer.Normal(std=0.02)
+ init_Normal(self.effnet_mapper[0].weight)
+ init_Normal = paddle.nn.initializer.Normal(std=0.02)
+ init_Normal(self.effnet_mapper[2].weight)
+ init_Normal = paddle.nn.initializer.Normal(std=0.02)
+ init_Normal(self.pixels_mapper[0].weight)
+ init_Normal = paddle.nn.initializer.Normal(std=0.02)
+ init_Normal(self.pixels_mapper[2].weight)
+ paddle.nn.initializer.XavierUniform()(self.embedding[1].weight)
+ init_Constant = paddle.nn.initializer.Constant(value=0)
+ init_Constant(self.clf[1].weight)
+ for level_list in (self.down_blocks, self.up_blocks):
+ for level_block in level_list:
+ for block in level_block:
+ if isinstance(block, ResBlock) or isinstance(block, FeedForwardBlock):
+                        block.channelwise[-1].weight.set_value(
+                            block.channelwise[-1].weight * float(np.sqrt(1 / sum(blocks[0])))
+                        )
+ elif isinstance(block, TimestepBlock):
+ for layer in block.sublayers():
+ if isinstance(layer, paddle.nn.Linear):
+ init_Constant = paddle.nn.initializer.Constant(value=0)
+ init_Constant(layer.weight)
+
+ def _init_weights(self, m):
+ if isinstance(m, (paddle.nn.Conv2D, paddle.nn.Linear)):
+ init_XavierUniform = paddle.nn.initializer.XavierUniform()
+ init_XavierUniform(m.weight)
+ if m.bias is not None:
+ init_Constant = paddle.nn.initializer.Constant(value=0)
+ init_Constant(m.bias)
+
+ def gen_r_embedding(self, r, max_positions=10000):
+ r = r * max_positions
+ half_dim = self.c_r // 2
+ emb = math.log(max_positions) / (half_dim - 1)
+ emb = paddle.arange(end=half_dim).astype(dtype="float32").mul(-emb).exp()
+ emb = r[:, None] * emb[None, :]
+ emb = paddle.concat(x=[emb.sin(), emb.cos()], axis=1)
+ if self.c_r % 2 == 1:
+ emb = paddle.nn.functional.pad(emb, [0, 1], mode="constant")
+ return emb
+
+ def gen_c_embeddings(self, clip):
+ if len(clip.shape) == 2:
+ clip = clip.unsqueeze(axis=1)
+ clip = self.clip_mapper(clip).reshape([clip.shape[0], clip.shape[1] * self.c_clip_seq, -1])
+
+ clip = self.clip_norm(clip)
+ return clip
+
+ def _down_encode(self, x, r_embed, clip):
+ level_outputs = []
+ block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
+ for down_block, downscaler, repmap in block_group:
+ x = downscaler(x)
+ for i in range(len(repmap) + 1):
+ for block in down_block:
+ if (
+ isinstance(block, ResBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, ResBlock)
+ ):
+ x = block(x)
+ elif (
+ isinstance(block, AttnBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, AttnBlock)
+ ):
+ x = block(x, clip)
+ elif (
+ isinstance(block, TimestepBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, TimestepBlock)
+ ):
+ x = block(x, r_embed)
+ else:
+ x = block(x)
+ if i < len(repmap):
+ x = repmap[i](x)
+ level_outputs.insert(0, x)
+ return level_outputs
+
+ def _up_decode(self, level_outputs, r_embed, clip):
+ x = level_outputs[0]
+ block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
+ for i, (up_block, upscaler, repmap) in enumerate(block_group):
+ for j in range(len(repmap) + 1):
+ for k, block in enumerate(up_block):
+ if (
+ isinstance(block, ResBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, ResBlock)
+ ):
+ skip = level_outputs[i] if k == 0 and i > 0 else None
+ if skip is not None and (x.shape[-1] != skip.shape[-1] or x.shape[-2] != skip.shape[-2]):
+ x = paddle.nn.functional.interpolate(
+ x=x.astype(dtype="float32"),
+ size=skip.shape[-2:],
+ mode="bilinear",
+ align_corners=True,
+ )
+ x = block(x, skip)
+ elif (
+ isinstance(block, AttnBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, AttnBlock)
+ ):
+ x = block(x, clip)
+ elif (
+ isinstance(block, TimestepBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, TimestepBlock)
+ ):
+ x = block(x, r_embed)
+ else:
+ x = block(x)
+ if j < len(repmap):
+ x = repmap[j](x)
+ x = upscaler(x)
+ return x
+
+ def forward(self, x, r, effnet, clip, pixels=None, **kwargs):
+ if pixels is None:
+ pixels = paddle.zeros(shape=[x.shape[0], 3, 8, 8], dtype=x.dtype)
+ r_embed = self.gen_r_embedding(r)
+ for c in self.t_conds:
+ t_cond = kwargs.get(c, paddle.zeros_like(x=r))
+ r_embed = paddle.concat(x=[r_embed, self.gen_r_embedding(t_cond)], axis=1)
+ clip = self.gen_c_embeddings(clip)
+ x = self.embedding(x)
+ x = x + self.effnet_mapper(
+ paddle.nn.functional.interpolate(
+ x=effnet.astype(dtype="float32"),
+ size=x.shape[-2:],
+ mode="bilinear",
+ align_corners=True,
+ )
+ )
+ x = x + paddle.nn.functional.interpolate(
+ x=self.pixels_mapper(pixels).astype(dtype="float32"),
+ size=x.shape[-2:],
+ mode="bilinear",
+ align_corners=True,
+ )
+ level_outputs = self._down_encode(x, r_embed, clip)
+ x = self._up_decode(level_outputs, r_embed, clip)
+ return self.clf(x)
+
+ def update_weights_ema(self, src_model, beta=0.999):
+ for self_params, src_params in zip(self.parameters(), src_model.parameters()):
+ self_params.data = self_params.data * beta + src_params.data.clone() * (1 - beta)
+ for self_buffers, src_buffers in zip(self.buffers(), src_model.buffers()):
+ self_buffers.data = self_buffers.data * beta + src_buffers.data.clone() * (1 - beta)
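+
+
+# Note added for illustration (not in the upstream file): StageB.forward takes x (the
+# [N, 4, H, W] latent being denoised), r (the per-sample timestep fed to the sinusoidal
+# embedding), effnet (the [N, 16, h, w] EfficientNet latent, bilinearly resized to x's
+# resolution), clip (pooled CLIP embeddings expanded to c_clip_seq tokens each) and an
+# optional low-resolution pixels tensor; any condition listed in t_conds ("sca" by
+# default) falls back to zeros when not provided via kwargs.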
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_c.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_c.py
new file mode 100644
index 0000000000000000000000000000000000000000..c868be56c475de877c6cc02b44c03a47e81db102
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/modules/stage_c.py
@@ -0,0 +1,368 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle_aux # noqa
+
+from .common import AttnBlock, FeedForwardBlock, LayerNorm2d, ResBlock, TimestepBlock
+
+
+def load(path="../x.npy"):
+ return paddle.to_tensor(np.load(path))
+
+
+def diff(a, b):
+ return (a - b).abs().mean()
+
+
+class UpDownBlock2d(nn.Layer):
+ def __init__(self, c_in, c_out, mode, enabled=True):
+ super().__init__()
+ assert mode in ["up", "down"]
+ interpolation = (
+ nn.Upsample(
+ scale_factor=2 if mode == "up" else 0.5,
+ mode="bilinear",
+ align_corners=True,
+ )
+ if enabled
+ else nn.Identity()
+ )
+ mapping = nn.Conv2D(in_channels=c_in, out_channels=c_out, kernel_size=1)
+ self.blocks = nn.LayerList(sublayers=[interpolation, mapping] if mode == "up" else [mapping, interpolation])
+
+ def forward(self, x):
+ for block in self.blocks:
+ x = block(x.astype(paddle.float32))
+ return x
+
+
+class StageC(nn.Layer):
+ def __init__(
+ self,
+ c_in=16,
+ c_out=16,
+ c_r=64,
+ patch_size=1,
+ c_cond=2048,
+ c_hidden=[2048, 2048],
+ nhead=[32, 32],
+ blocks=[[8, 24], [24, 8]],
+ block_repeat=[[1, 1], [1, 1]],
+ level_config=["CTA", "CTA"],
+ c_clip_text=1280,
+ c_clip_text_pooled=1280,
+ c_clip_img=768,
+ c_clip_seq=4,
+ kernel_size=3,
+ dropout=[0.1, 0.1],
+ # dropout=[0, 0],
+ self_attn=True,
+ t_conds=["sca", "crp"],
+ switch_level=[False],
+ ):
+ super().__init__()
+ self.c_r = c_r
+ self.t_conds = t_conds
+ self.c_clip_seq = c_clip_seq
+ if not isinstance(dropout, list):
+ dropout = [dropout] * len(c_hidden)
+ if not isinstance(self_attn, list):
+ self_attn = [self_attn] * len(c_hidden)
+ # CONDITIONING
+ self.clip_txt_mapper = nn.Linear(c_clip_text, c_cond)
+ self.clip_txt_pooled_mapper = nn.Linear(c_clip_text_pooled, c_cond * c_clip_seq)
+ self.clip_img_mapper = nn.Linear(c_clip_img, c_cond * c_clip_seq)
+ self.clip_norm = nn.LayerNorm(c_cond, weight_attr=False, bias_attr=False, epsilon=1e-6)
+
+ self.embedding = nn.Sequential(
+ nn.PixelUnshuffle(patch_size),
+ nn.Conv2D(c_in * (patch_size**2), c_hidden[0], kernel_size=1),
+ LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-6),
+ )
+
+ def get_block(block_type, c_hidden, nhead, c_skip=0, dropout=0, self_attn=True):
+ if block_type == "C":
+ return ResBlock(c_hidden, c_skip, kernel_size=kernel_size, dropout=dropout)
+ elif block_type == "A":
+ return AttnBlock(c_hidden, c_cond, nhead, self_attn=self_attn, dropout=dropout)
+ elif block_type == "F":
+ return FeedForwardBlock(c_hidden, dropout=dropout)
+ elif block_type == "T":
+ return TimestepBlock(c_hidden, c_r, conds=t_conds)
+ else:
+ raise Exception(f"Block type {block_type} not supported")
+
+ self.down_blocks = nn.LayerList()
+ self.down_downscalers = nn.LayerList()
+ self.down_repeat_mappers = nn.LayerList()
+ for i in range(len(c_hidden)):
+ if i > 0:
+ self.down_downscalers.append(
+ nn.Sequential(
+ LayerNorm2d(
+ c_hidden[i - 1],
+ weight_attr=False,
+ bias_attr=False,
+ epsilon=1e-06,
+ ),
+ UpDownBlock2d(
+ c_hidden[i - 1],
+ c_hidden[i],
+ mode="down",
+ enabled=switch_level[i - 1],
+ ),
+ )
+ )
+ else:
+ self.down_downscalers.append(nn.Identity())
+ down_block = nn.LayerList()
+ for _ in range(blocks[0][i]):
+ for block_type in level_config[i]:
+ block = get_block(
+ block_type,
+ c_hidden[i],
+ nhead[i],
+ dropout=dropout[i],
+ self_attn=self_attn[i],
+ )
+ down_block.append(block)
+ self.down_blocks.append(down_block)
+ if block_repeat is not None:
+ block_repeat_mappers = nn.LayerList()
+ for _ in range(block_repeat[0][i] - 1):
+ block_repeat_mappers.append(nn.Conv2D(c_hidden[i], c_hidden[i], kernel_size=1))
+ self.down_repeat_mappers.append(block_repeat_mappers)
+ self.up_blocks = nn.LayerList()
+ self.up_upscalers = nn.LayerList()
+ self.up_repeat_mappers = nn.LayerList()
+ for i in reversed(range(len(c_hidden))):
+ if i > 0:
+ self.up_upscalers.append(
+ nn.Sequential(
+ LayerNorm2d(c_hidden[i], weight_attr=False, bias_attr=False, epsilon=1e-6),
+ UpDownBlock2d(
+ c_hidden[i],
+ c_hidden[i - 1],
+ mode="up",
+ enabled=switch_level[i - 1],
+ ),
+ )
+ )
+ else:
+ self.up_upscalers.append(nn.Identity())
+ up_block = nn.LayerList()
+ for j in range(blocks[1][::-1][i]):
+ for k, block_type in enumerate(level_config[i]):
+ c_skip = c_hidden[i] if i < len(c_hidden) - 1 and j == k == 0 else 0
+ block = get_block(
+ block_type,
+ c_hidden[i],
+ nhead[i],
+ c_skip=c_skip,
+ dropout=dropout[i],
+ self_attn=self_attn[i],
+ )
+ up_block.append(block)
+ self.up_blocks.append(up_block)
+ if block_repeat is not None:
+ block_repeat_mappers = nn.LayerList()
+ for _ in range(block_repeat[1][::-1][i] - 1):
+ block_repeat_mappers.append(nn.Conv2D(c_hidden[i], c_hidden[i], kernel_size=1))
+ self.up_repeat_mappers.append(block_repeat_mappers)
+ self.clf = nn.Sequential(
+ LayerNorm2d(c_hidden[0], weight_attr=False, bias_attr=False, epsilon=1e-06),
+ nn.Conv2D(c_hidden[0], c_out * (patch_size**2), kernel_size=1),
+ nn.PixelShuffle(upscale_factor=patch_size),
+ )
+ self.apply(self._init_weights)
+        init_Normal = nn.initializer.Normal(std=0.02)
+        init_Normal(self.clip_txt_mapper.weight)
+        init_Normal(self.clip_txt_pooled_mapper.weight)
+        init_Normal(self.clip_img_mapper.weight)
+ init_Xavier = nn.initializer.XavierUniform()
+ self.embedding[1].weight = self.create_parameter(
+ shape=self.embedding[1].weight.shape, default_initializer=init_Xavier
+ )
+ init_Constant = nn.initializer.Constant(value=0)
+ init_Constant(self.clf[1].weight)
+
+ for level_list in (self.down_blocks, self.up_blocks):
+ for level_block in level_list:
+ for block in level_block:
+ if isinstance(block, ResBlock) or isinstance(block, FeedForwardBlock):
+                    # Scale the final channelwise weight in place; a bare `multiply` call returns a
+                    # new tensor and would leave the parameter unchanged.
+                    block.channelwise[-1].weight.set_value(
+                        block.channelwise[-1].weight * float(np.sqrt(1 / sum(blocks[0])))
+                    )
+ elif isinstance(block, TimestepBlock):
+ for layer in block.sublayers():
+ if isinstance(layer, nn.Linear):
+ init_Constant = nn.initializer.Constant(value=0)
+ init_Constant(layer.weight)
+
+ def _init_weights(self, m):
+ if isinstance(m, (nn.Conv2D, nn.Linear)):
+ init_XavierUniform = nn.initializer.XavierUniform()
+ init_XavierUniform(m.weight)
+ if m.bias is not None:
+ init_Constant = nn.initializer.Constant(value=0)
+ init_Constant(m.bias)
+
+ def gen_r_embedding(self, r, max_positions=10000):
+ r = r * max_positions
+ half_dim = self.c_r // 2
+ emb = math.log(max_positions) / (half_dim - 1)
+ emb = paddle.arange(end=half_dim).astype(dtype="float32").mul(-emb).exp()
+ emb = r[:, None] * emb[None, :]
+ emb = paddle.concat(x=[emb.sin(), emb.cos()], axis=1)
+ if self.c_r % 2 == 1:
+ emb = nn.functional.pad(emb, [0, 1], mode="constant")
+ return emb
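+
+    # Note (illustrative): with the default c_r=64 and a batch of timestep ratios `r` of shape [B],
+    # gen_r_embedding returns a [B, 64] tensor whose first 32 columns are sines and last 32 are
+    # cosines of `r` at geometrically spaced frequencies (standard sinusoidal embedding).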
+
+ def gen_c_embeddings(self, clip_txt, clip_txt_pooled, clip_img):
+ clip_txt = self.clip_txt_mapper(clip_txt)
+ if len(clip_txt_pooled.shape) == 2:
+            clip_txt_pooled = clip_txt_pooled.unsqueeze(axis=1)
+ if len(clip_img.shape) == 2:
+ clip_img = paddle.unsqueeze(clip_img, axis=1)
+
+ clip_txt_pool = self.clip_txt_pooled_mapper(clip_txt_pooled).reshape(
+ [clip_txt_pooled.shape[0], clip_txt_pooled.shape[1] * self.c_clip_seq, -1]
+ )
+
+ clip_img = self.clip_img_mapper(clip_img).reshape([clip_img.shape[0], clip_img.shape[1] * self.c_clip_seq, -1])
+
+ clip = paddle.concat(x=[clip_txt, clip_txt_pool, clip_img], axis=1)
+ clip = self.clip_norm(clip)
+
+ return clip
+
+ def _down_encode(self, x, r_embed, clip, cnet=None):
+ level_outputs = []
+ block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
+ for down_block, downscaler, repmap in block_group:
+ x = downscaler(x)
+ for i in range(len(repmap) + 1):
+ for block in down_block:
+ if (
+ isinstance(block, ResBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, ResBlock)
+ ):
+ if cnet is not None:
+ next_cnet = cnet()
+ if next_cnet is not None:
+ x = x + nn.functional.interpolate(
+ next_cnet,
+ size=x.shape[-2:],
+ mode="bilinear",
+ align_corners=True,
+ )
+ x = block(x)
+
+ elif (
+ isinstance(block, AttnBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, AttnBlock)
+ ):
+ x = block(x, clip)
+
+ elif (
+ isinstance(block, TimestepBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, TimestepBlock)
+ ):
+ x = block(x, r_embed)
+ else:
+ x = block(x)
+
+ if i < len(repmap):
+ x = repmap[i](x)
+ level_outputs.insert(0, x)
+ return level_outputs
+
+ def _up_decode(self, level_outputs, r_embed, clip, cnet=None):
+ x = level_outputs[0]
+ block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
+        for i, (up_block, upscaler, repmap) in enumerate(block_group):
+            for j in range(len(repmap) + 1):
+                for k, block in enumerate(up_block):
+
+ if (
+ isinstance(block, ResBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, ResBlock)
+ ):
+ skip = level_outputs[i] if k == 0 and i > 0 else None
+ if skip is not None and (x.shape[-1] != skip.shape[-1] or x.shape[-2] != skip.shape[-2]):
+ x = nn.functional.interpolate(
+ x=x.astype(paddle.float32),
+ size=skip.shape[-2:],
+ mode="bilinear",
+ align_corners=True,
+ )
+ x = block(x, skip)
+ elif (
+ isinstance(block, AttnBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, AttnBlock)
+ ):
+ x = block(x, clip)
+ elif (
+ isinstance(block, TimestepBlock)
+ or hasattr(block, "_fsdp_wrapped_module")
+ and isinstance(block._fsdp_wrapped_module, TimestepBlock)
+ ):
+ x = block(x, r_embed)
+ else:
+ x = block(x)
+
+ if j < len(repmap):
+ x = repmap[j](x)
+
+ x = upscaler(x)
+
+ return x
+
+ def forward(self, x, r, clip_text, clip_text_pooled, clip_img, cnet=None, **kwargs):
+
+ r_embed = self.gen_r_embedding(r)
+ for c in self.t_conds:
+ t_cond = kwargs.get(c, paddle.zeros_like(r))
+ r_embed = paddle.concat(x=[r_embed, self.gen_r_embedding(t_cond)], axis=1)
+ clip = self.gen_c_embeddings(clip_text, clip_text_pooled, clip_img)
+
+ x = self.embedding(x)
+ level_outputs = self._down_encode(x, r_embed, clip, cnet)
+ x = self._up_decode(level_outputs, r_embed, clip, cnet)
+ x = self.clf(x)
+ # x.register_hook(lambda grad: print("@@@ before-clf-x @@@", grad.shape, grad.abs().mean()))
+
+ return x
+
+    def update_weights_ema(self, src_model, beta=0.999):
+        # Exponential moving average update of parameters and buffers from `src_model`.
+        with paddle.no_grad():
+            for self_params, src_params in zip(self.parameters(), src_model.parameters()):
+                self_params.set_value(self_params * beta + src_params * (1 - beta))
+            for self_buffers, src_buffers in zip(self.buffers(), src_model.buffers()):
+                self_buffers.set_value(self_buffers * beta + src_buffers * (1 - beta))
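+
+
+# Minimal usage sketch for StageC (illustrative only; the batch size, spatial size, and random
+# inputs below are assumptions, not part of this module):
+# >>> model = StageC()
+# >>> x = paddle.randn([1, 16, 24, 24])                # latent input (c_in=16)
+# >>> r = paddle.rand([1])                             # timestep ratio in [0, 1]
+# >>> clip_text = paddle.randn([1, 77, 1280])          # per-token text embeddings
+# >>> clip_text_pooled = paddle.randn([1, 1, 1280])    # pooled text embedding
+# >>> clip_img = paddle.randn([1, 1, 768])             # CLIP image embedding
+# >>> out = model(x, r, clip_text, clip_text_pooled, clip_img)  # same shape as x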
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b68e33e511a5ed3eee62e9397a9abd6c05d54086
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/__init__.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL
+from PIL import Image
+
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
+
+try:
+ if not (is_paddlenlp_available() and is_paddle_available()):
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ from ...utils.dummy_paddle_and_paddlenlp_objects import ShapEPipeline
+else:
+ from .blip_image_processing import BlipImageProcessor
+ from .modeling_blip2 import Blip2QFormerModel
+ from .modeling_ctx_clip import ContextCLIPTextModel
+ from .pipeline_blip_diffusion import BlipDiffusionPipeline
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_blip2.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_blip2.py
new file mode 100644
index 0000000000000000000000000000000000000000..484577c2d8ec3c86d85cc0afb335db649d88fa14
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_blip2.py
@@ -0,0 +1,659 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+import paddle
+from paddle import nn
+from paddlenlp.transformers.activations import QuickGELUActivation as QuickGELU
+from paddlenlp.transformers.blip_2.configuration import Blip2Config, Blip2VisionConfig
+from paddlenlp.transformers.blip_2.modeling import (
+ Blip2Encoder,
+ Blip2QFormerAttention,
+ Blip2QFormerIntermediate,
+ Blip2QFormerOutput,
+)
+from paddlenlp.transformers.model_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ BaseModelOutputWithPooling,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+)
+from paddlenlp.transformers.model_utils import apply_chunking_to_forward
+
+from ppdiffusers.transformers import BertTokenizer, PretrainedModel
+
+from ...utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class Blip2PretrainedModel(PretrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = Blip2Config
+ base_model_prefix = "blip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [
+ r"position_ids",
+ r"language_model.encoder.embed_tokens.weight",
+ r"language_model.decoder.embed_tokens.weight",
+ ]
+ _no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"]
+ _keep_in_fp32_modules = ["wo"]
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        factor = self.config.initializer_range
+        init_normal = nn.initializer.Normal(mean=0.0, std=factor)
+        init_zeros = nn.initializer.Constant(value=0.0)
+        init_ones = nn.initializer.Constant(value=1.0)
+        if isinstance(module, (nn.Conv2D, nn.Embedding, nn.Linear)):
+            init_normal(module.weight)
+            if hasattr(module, "padding_idx") and module.padding_idx is not None:
+                with paddle.no_grad():
+                    module.weight[module.padding_idx] = 0.0
+            if hasattr(module, "bias") and module.bias is not None:
+                init_zeros(module.bias)
+        if isinstance(module, Blip2VisionEmbeddings):
+            if hasattr(self.config, "vision_config"):
+                factor = self.config.vision_config.initializer_range
+            trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor)
+            trunc_normal_(module.position_embedding)
+            trunc_normal_(module.class_embedding)
+        elif isinstance(module, nn.LayerNorm):
+            init_zeros(module.bias)
+            init_ones(module.weight)
+        elif isinstance(module, nn.Linear) and module.bias is not None:
+            init_zeros(module.bias)
+
+
+# There is an implementation of Blip2 in `transformers`: https://github.com/huggingface/transformers/blob/main/src/transformers/models/blip_2/modeling_blip_2.py.
+# But it doesn't support getting multimodal embeddings, so this module can be
+# replaced once a future `transformers` version supports that.
+class Blip2TextEmbeddings(nn.Layer):
+ """Construct the embeddings from word and position embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(
+ config.vocab_size, config.hidden_size
+        )  # NOTE: padding_idx is not passed to nn.Embedding here; it is assigned manually below
+ self.word_embeddings.padding_idx = config.pad_token_id
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.register_buffer(
+ "position_ids", paddle.arange(config.max_position_embeddings, dtype=paddle.int64).expand((1, -1))
+ )
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+
+ self.config = config
+
+ def forward(
+ self,
+ input_ids=None,
+ position_ids=None,
+ query_embeds=None,
+ past_key_values_length=0,
+ ):
+ if input_ids is not None:
+ seq_length = input_ids.shape[1]
+ else:
+ seq_length = 0
+
+ if position_ids is None:
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
+
+ if input_ids is not None:
+ embeddings = self.word_embeddings(input_ids)
+ if self.position_embedding_type == "absolute":
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings = embeddings + position_embeddings
+
+ if query_embeds is not None:
+ batch_size = embeddings.shape[0]
+ # repeat the query embeddings for batch size
+ query_embeds = query_embeds.tile([batch_size, 1, 1])
+ embeddings = paddle.concat((query_embeds, embeddings), axis=1)
+ else:
+ embeddings = query_embeds
+ embeddings = embeddings.cast(query_embeds.dtype)
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+# Copy-pasted from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->Blip2
+class Blip2VisionEmbeddings(nn.Layer):
+ def __init__(self, config: Blip2VisionConfig):
+ super().__init__()
+ self.config = config
+ self.embed_dim = config.hidden_size
+ self.image_size = config.image_size
+ self.patch_size = config.patch_size
+
+        # Learnable class token, created via Layer.create_parameter with a random-normal start.
+        self.class_embedding = self.create_parameter(
+            shape=[1, 1, self.embed_dim],
+            default_initializer=nn.initializer.Assign(paddle.randn([1, 1, self.embed_dim])),
+        )
+
+ self.patch_embedding = nn.Conv2D(
+ in_channels=3,
+ out_channels=self.embed_dim,
+ kernel_size=self.patch_size,
+ stride=self.patch_size,
+ bias_attr=False,
+ )
+
+ self.num_patches = (self.image_size // self.patch_size) ** 2
+ self.num_positions = self.num_patches + 1
+
+        self.position_embedding = self.create_parameter(
+            shape=[1, self.num_positions, self.embed_dim],
+            default_initializer=nn.initializer.Assign(paddle.randn([1, self.num_positions, self.embed_dim])),
+        )
+
+ def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor:
+ batch_size = pixel_values.shape[0]
+ target_dtype = self.patch_embedding.weight.dtype
+ patch_embeds = self.patch_embedding(pixel_values.cast(dtype=target_dtype)) # shape = [*, width, grid, grid]
+ patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1])
+
+ class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype)
+ embeddings = paddle.concat([class_embeds, patch_embeds], axis=1)
+ embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype)
+ return embeddings
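+
+    # Shape note (illustrative): pixel_values of shape [B, 3, image_size, image_size] is split into
+    # (image_size // patch_size) ** 2 patches, so the returned embeddings have shape
+    # [B, num_patches + 1, hidden_size], including the prepended class token.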
+
+
+# The Qformer encoder, which takes the visual embeddings, and the text input, to get multimodal embeddings
+class Blip2QFormerEncoder(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.LayerList(
+ [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+ )
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ query_length=0,
+ ):
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = () if output_attentions else None
+
+ next_decoder_cache = () if use_cache else None
+
+ if getattr(self.config, "gradient_checkpointing", False) and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ for i in range(self.config.num_hidden_layers):
+ layer_module = self.layer[i]
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ past_key_value = past_key_values[i] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and not hidden_states.stop_gradient:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ attention_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ query_length,
+ )
+
+ else:
+ layer_outputs = layer_module(
+ hidden_states,
+ attention_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ query_length,
+ )
+
+ hidden_states = layer_outputs[0]
+ if use_cache:
+ next_decoder_cache += (layer_outputs[-1],)
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+ if layer_module.has_cross_attention:
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ next_decoder_cache,
+ all_hidden_states,
+ all_self_attentions,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_decoder_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+# The layers making up the Qformer encoder
+class Blip2QFormerLayer(nn.Layer):
+ def __init__(self, config, layer_idx):
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = Blip2QFormerAttention(config)
+
+ self.layer_idx = layer_idx
+
+ if layer_idx % config.cross_attention_frequency == 0:
+ self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True)
+ self.has_cross_attention = True
+ else:
+ self.has_cross_attention = False
+
+ self.intermediate = Blip2QFormerIntermediate(config)
+ self.intermediate_query = Blip2QFormerIntermediate(config)
+ self.output_query = Blip2QFormerOutput(config)
+ self.output = Blip2QFormerOutput(config)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ query_length=0,
+ ):
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ output_attentions=output_attentions,
+ past_key_value=self_attn_past_key_value,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:-1]
+
+ present_key_value = self_attention_outputs[-1]
+
+ if query_length > 0:
+ query_attention_output = attention_output[:, :query_length, :]
+
+ if self.has_cross_attention:
+ if encoder_hidden_states is None:
+ raise ValueError("encoder_hidden_states must be given for cross-attention layers")
+ cross_attention_outputs = self.crossattention(
+ query_attention_output,
+ attention_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+ query_attention_output = cross_attention_outputs[0]
+ # add cross attentions if we output attention weights
+ outputs = outputs + cross_attention_outputs[1:-1]
+
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk_query,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ query_attention_output,
+ )
+
+ if attention_output.shape[1] > query_length:
+ layer_output_text = apply_chunking_to_forward(
+ self.feed_forward_chunk,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ attention_output[:, query_length:, :],
+ )
+ layer_output = paddle.concat([layer_output, layer_output_text], axis=1)
+ else:
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ attention_output,
+ )
+ outputs = (layer_output,) + outputs
+
+ outputs = outputs + (present_key_value,)
+
+ return outputs
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output
+
+ def feed_forward_chunk_query(self, attention_output):
+ intermediate_output = self.intermediate_query(attention_output)
+ layer_output = self.output_query(intermediate_output, attention_output)
+ return layer_output
+
+
+# ProjLayer used to project the multimodal Blip2 embeddings to be used in the text encoder
+class ProjLayer(nn.Layer):
+ def __init__(self, in_dim, out_dim, hidden_dim, drop_p=0.1, eps=1e-12):
+ super().__init__()
+
+        # Norm -> Dense1 -> Act -> Dense2 -> Drop -> Residual (pre-norm residual MLP)
+ self.dense1 = nn.Linear(in_dim, hidden_dim)
+ self.act_fn = QuickGELU()
+ self.dense2 = nn.Linear(hidden_dim, out_dim)
+ self.dropout = nn.Dropout(drop_p)
+
+ self.LayerNorm = nn.LayerNorm(out_dim, epsilon=eps)
+
+ def forward(self, x):
+ x_in = x
+
+ x = self.LayerNorm(x)
+ x = self.dropout(self.dense2(self.act_fn(self.dense1(x)))) + x_in
+
+ return x
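+
+    # Note: the residual connection adds the input back, so this layer is shape-preserving and
+    # assumes in_dim == out_dim (as configured in Blip2QFormerModel, where both equal the Q-Former
+    # hidden size).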
+
+
+# Copy-pasted from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->Blip2, BLIP->BLIP_2
+class Blip2VisionModel(Blip2PretrainedModel):
+ main_input_name = "pixel_values"
+ config_class = Blip2VisionConfig
+
+ def __init__(self, config: Blip2VisionConfig):
+ super().__init__(config)
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = Blip2VisionEmbeddings(config)
+ self.pre_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps)
+ self.encoder = Blip2Encoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps)
+
+ self.post_init()
+
+ def forward(
+ self,
+ pixel_values: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ hidden_states = self.embeddings(pixel_values)
+ hidden_states = self.pre_layernorm(hidden_states)
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.post_layernorm(last_hidden_state)
+
+ pooled_output = last_hidden_state[:, 0, :]
+ pooled_output = self.post_layernorm(pooled_output)
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+ def get_input_embeddings(self):
+ return self.embeddings
+
+
+# Qformer model, used to get multimodal embeddings from the text and image inputs
+class Blip2QFormerModel(Blip2PretrainedModel):
+ """
+ Querying Transformer (Q-Former), used in BLIP-2.
+ """
+
+ def __init__(self, config: Blip2Config):
+ super().__init__(config)
+ self.config = config
+ self.embeddings = Blip2TextEmbeddings(config.qformer_config)
+ self.visual_encoder = Blip2VisionModel(config.vision_config)
+        self.query_tokens = self.create_parameter(
+            shape=[1, config.num_query_tokens, config.qformer_config.hidden_size],
+            default_initializer=nn.initializer.Constant(value=0.0),
+        )
+ if not hasattr(config, "tokenizer") or config.tokenizer is None:
+ self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="right")
+ else:
+ self.tokenizer = BertTokenizer.from_pretrained(config.tokenizer, truncation_side="right")
+ self.tokenizer.add_special_tokens({"bos_token": "[DEC]"})
+ self.proj_layer = ProjLayer(
+ in_dim=config.qformer_config.hidden_size,
+ out_dim=config.qformer_config.hidden_size,
+ hidden_dim=config.qformer_config.hidden_size * 4,
+ drop_p=0.1,
+ eps=1e-12,
+ )
+
+ self.encoder = Blip2QFormerEncoder(config.qformer_config)
+
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def get_extended_attention_mask(
+ self,
+ attention_mask: paddle.Tensor,
+ input_shape: Tuple[int],
+ has_query: bool = False,
+ ) -> paddle.Tensor:
+ """
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
+
+ Arguments:
+ attention_mask (`paddle.Tensor`):
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
+ input_shape (`Tuple[int]`):
+ The shape of the input to the model.
+
+ Returns:
+            `paddle.Tensor`: The extended attention mask with the same dtype as `attention_mask.dtype`.
+ """
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ if attention_mask.dim() == 3:
+ extended_attention_mask = attention_mask[:, None, :, :]
+ elif attention_mask.dim() == 2:
+ # Provided a padding mask of dimensions [batch_size, seq_length]
+ # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ extended_attention_mask = attention_mask[:, None, None, :]
+ else:
+ raise ValueError(
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
+ input_shape, attention_mask.shape
+ )
+ )
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ extended_attention_mask = extended_attention_mask.cast(dtype=self.dtype) # fp16 compatibility
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+ return extended_attention_mask
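+
+    # Worked example (illustrative): a padding mask paddle.to_tensor([[1, 1, 0]]) is expanded to
+    # shape [1, 1, 1, 3] and mapped to [[[[0.0, 0.0, -10000.0]]]]: zeros for visible tokens and a
+    # large negative bias for masked ones, added to the raw attention scores before softmax.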
+
+ def forward(
+ self,
+ text_input=None,
+ image_input=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of:
+ shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
+ value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
+ used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
+ value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape
+ `(batch_size, sequence_length)`.
+ use_cache (`bool`, `optional`):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ """
+
+ text = self.tokenizer(text_input, return_tensors="pd", padding=True, return_attention_mask=True)
+ input_ids = text.input_ids
+ batch_size = input_ids.shape[0]
+ query_atts = paddle.ones((batch_size, self.query_tokens.shape[1]), dtype=paddle.int64)
+ attention_mask = paddle.concat([query_atts, text.attention_mask], axis=1)
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # past_key_values_length
+ past_key_values_length = (
+ past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0
+ )
+
+ query_length = self.query_tokens.shape[1]
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ query_embeds=self.query_tokens,
+ past_key_values_length=past_key_values_length,
+ )
+
+ # embedding_output = self.layernorm(query_embeds)
+ # embedding_output = self.dropout(embedding_output)
+
+ input_shape = embedding_output.shape[:-1]
+ batch_size, seq_length = input_shape
+
+ image_embeds_frozen = self.visual_encoder(image_input).last_hidden_state
+ # image_embeds_frozen = paddle.ones_like(image_embeds_frozen)
+ encoder_hidden_states = image_embeds_frozen
+
+ if attention_mask is None:
+ attention_mask = paddle.ones(
+ ((batch_size, seq_length + past_key_values_length)),
+ )
+
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if encoder_hidden_states is not None:
+ if isinstance(encoder_hidden_states, list):
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape
+ else:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+ if isinstance(encoder_attention_mask, list):
+ encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
+ elif encoder_attention_mask is None:
+ encoder_attention_mask = paddle.ones(encoder_hidden_shape)
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+ else:
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+ else:
+ encoder_extended_attention_mask = None
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ query_length=query_length,
+ )
+ sequence_output = encoder_outputs[0]
+ pooled_output = sequence_output[:, 0, :]
+
+ if not return_dict:
+ return self.proj_layer(sequence_output[:, :query_length, :])
+
+ return BaseModelOutputWithPoolingAndCrossAttentions(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ past_key_values=encoder_outputs.past_key_values,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ cross_attentions=encoder_outputs.cross_attentions,
+ )
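+
+
+# Minimal usage sketch (illustrative; the checkpoint path, image size, and inputs are assumptions —
+# in practice this model is loaded as the `qformer` component of BlipDiffusionPipeline):
+# >>> qformer = Blip2QFormerModel.from_pretrained("some/blip-diffusion-qformer")  # hypothetical path
+# >>> pixel_values = paddle.randn([1, 3, 224, 224])    # preprocessed reference image
+# >>> ctx = qformer(text_input=["a dog"], image_input=pixel_values, return_dict=False)
+# >>> ctx.shape                                         # [1, num_query_tokens, qformer hidden size]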
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_ctx_clip.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_ctx_clip.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78442c52e0b777e6a601f0b205cf9ccb5c75991
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/modeling_ctx_clip.py
@@ -0,0 +1,248 @@
+# Copyright 2023 Salesforce.com, inc.
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+import paddle
+from paddle import nn
+from paddlenlp.transformers.model_outputs import BaseModelOutputWithPooling
+
+from ppdiffusers.transformers import CLIPPretrainedModel
+from ppdiffusers.transformers.clip.configuration import CLIPTextConfig
+from ppdiffusers.transformers.clip.modeling import CLIPEncoder
+
+
+def _expand_mask(mask: paddle.Tensor, dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.shape
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).cast(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return paddle.masked_fill(inverted_mask, inverted_mask.cast(paddle.bool), paddle.finfo(dtype).min)
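+
+
+# Worked example (illustrative): a padding mask [[1, 0]] expands to shape [1, 1, 2, 2]; the column
+# of the masked token is filled with `paddle.finfo(dtype).min` and all other entries are 0.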
+
+
+# This is a modified version of the CLIPTextModel from transformers.models.clip.modeling_clip
+# Which allows for an extra input of "context embeddings", which are the query embeddings used in Qformer
+# They pass through the clip model, along with the text embeddings, and interact with them using self attention
+class ContextCLIPTextModel(CLIPPretrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = ContextCLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ ctx_embeddings: paddle.Tensor = None,
+ ctx_begin_pos: list = None,
+ input_ids: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ return self.text_model(
+ ctx_embeddings=ctx_embeddings,
+ ctx_begin_pos=ctx_begin_pos,
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class ContextCLIPTextTransformer(nn.Layer):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = ContextCLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim)
+ self.eos_token_id = config.eos_token_id
+
+ def forward(
+ self,
+ ctx_embeddings: paddle.Tensor,
+ ctx_begin_pos: list,
+ input_ids: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify either input_ids")
+
+ input_shape = input_ids.shape
+ input_ids = input_ids.reshape([-1, input_shape[-1]])
+
+ hidden_states = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ ctx_embeddings=ctx_embeddings,
+ ctx_begin_pos=ctx_begin_pos,
+ )
+
+ bsz, seq_len = input_shape
+ if ctx_embeddings is not None:
+ seq_len += ctx_embeddings.shape[1]
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = self._build_causal_attention_mask(
+ bsz,
+ seq_len,
+ hidden_states.dtype,
+ )
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ if self.eos_token_id == 2:
+            # The `eos_token_id` was incorrect before PR #24773: let's keep what has been done here.
+ # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added
+ # ------------------------------------------------------------
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to paddle.int32 for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state.gather_nd(
+ paddle.stack(
+ [paddle.arange(last_hidden_state.shape[0], dtype="int32"), input_ids.argmax(-1, dtype="int32")],
+ axis=-1,
+ )
+ )
+ else:
+ # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
+ # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
+ pooled_output = last_hidden_state.gather_nd(
+ paddle.stack(
+ [
+ paddle.arange(last_hidden_state.shape[0], dtype="int32"),
+ (input_ids == paddle.to_tensor([self.eos_token_id]))
+ .cast("int32")
+ .argmax(axis=-1, dtype="int32"),
+ ],
+ axis=-1,
+ )
+ )
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
+        # Additive causal mask: large negative values above the diagonal, zeros elsewhere.
+        mask = paddle.triu(
+            paddle.ones((bsz, 1, seq_len, seq_len), dtype=dtype) * paddle.finfo(dtype).min,
+            diagonal=1,
+        )
+        return mask
+
+
+class ContextCLIPTextEmbeddings(nn.Layer):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ embed_dim = config.hidden_size
+
+ self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
+ self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
+
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.register_buffer(
+ "position_ids", paddle.arange(config.max_position_embeddings, dtype=paddle.int64).expand((1, -1))
+ )
+
+ def forward(
+ self,
+ ctx_embeddings: paddle.Tensor,
+ ctx_begin_pos: list,
+ input_ids: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ inputs_embeds: Optional[paddle.Tensor] = None,
+ ) -> paddle.Tensor:
+ if ctx_embeddings is None:
+ ctx_len = 0
+ else:
+ ctx_len = ctx_embeddings.shape[1]
+
+ seq_length = (input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]) + ctx_len
+
+ if position_ids is None:
+ position_ids = self.position_ids[:, :seq_length].cast(paddle.int64)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.token_embedding(input_ids)
+
+ # for each input embeddings, add the ctx embeddings at the correct position
+ input_embeds_ctx = []
+ bsz = inputs_embeds.shape[0]
+
+ if ctx_embeddings is not None:
+ for i in range(bsz):
+ cbp = ctx_begin_pos[i]
+
+ prefix = inputs_embeds[i, :cbp]
+ # remove the special token embedding
+ suffix = inputs_embeds[i, cbp:]
+
+ input_embeds_ctx.append(paddle.concat([prefix, ctx_embeddings[i], suffix], axis=0))
+
+ inputs_embeds = paddle.stack(input_embeds_ctx, axis=0)
+
+ position_embeddings = self.position_embedding(position_ids)
+ embeddings = inputs_embeds + position_embeddings
+
+ return embeddings
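+
+
+# Note (illustrative): the context embeddings are spliced into the token-embedding sequence at each
+# sample's `ctx_begin_pos`, so the returned sequence length is the input_ids length plus the number
+# of context embeddings; the prompt therefore has to be short enough for this total to stay within
+# `max_position_embeddings`.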
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/consistency_models/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/consistency_models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0de04fa38c3109ea181a0c289564b10ae9e49a92
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/consistency_models/__init__.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import PPDIFFUSERS_SLOW_IMPORT, _LazyModule
+
+_import_structure = {
+ "pipeline_consistency_models": ["ConsistencyModelPipeline"],
+}
+
+if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
+ from .pipeline_consistency_models import ConsistencyModelPipeline
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(
+ __name__,
+ globals()["__file__"],
+ _import_structure,
+ module_spec=__spec__,
+ )
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f22c95d57a1680ad45f763c5bd76591c8d56d2c
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
@@ -0,0 +1,1308 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import paddle
+import PIL.Image
+
+from ppdiffusers.transformers import (
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+)
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import (
+ FromSingleFileMixin,
+ IPAdapterMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
+from ...models.attention_processor import (
+ AttnProcessor2_5,
+ LoRAAttnProcessor2_5,
+ LoRAXFormersAttnProcessor,
+ XFormersAttnProcessor,
+)
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+ USE_PEFT_BACKEND,
+ deprecate,
+ is_pp_invisible_watermark_available,
+ logging,
+ replace_example_docstring,
+)
+from ...utils.paddle_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+
+if is_pp_invisible_watermark_available():
+ from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
+
+from .multicontrolnet import MultiControlNetModel
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+ Examples:
+ ```py
+ >>> # !pip install opencv-python paddlenlp ppdiffusers
+ >>> from ppdiffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
+ >>> from ppdiffusers.utils import load_image
+ >>> import numpy as np
+ >>> import paddle
+
+ >>> import cv2
+ >>> from PIL import Image
+
+ >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+ >>> negative_prompt = "low quality, bad quality, sketches"
+
+ >>> # download an image
+ >>> image = load_image(
+ ... "https://hf-mirror.com/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+ ... )
+
+ >>> # initialize the models and pipeline
+ >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization
+ >>> controlnet = ControlNetModel.from_pretrained(
+ ... "diffusers/controlnet-canny-sdxl-1.0", paddle_dtype=paddle.float16
+ ... )
+ >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", paddle_dtype=paddle.float16)
+ >>> pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+ ... "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, paddle_dtype=paddle.float16
+ ... )
+
+ >>> # get canny image
+ >>> image = np.array(image)
+ >>> image = cv2.Canny(image, 100, 200)
+ >>> image = image[:, :, None]
+ >>> image = np.concatenate([image, image, image], axis=2)
+ >>> canny_image = Image.fromarray(image)
+
+ >>> # generate image
+ >>> image = pipe(
+ ... prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image
+ ... ).images[0]
+ ```
+"""
+
+
+class StableDiffusionXLControlNetPipeline(
+ DiffusionPipeline,
+ TextualInversionLoaderMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ IPAdapterMixin,
+ FromSingleFileMixin,
+):
+ r"""
+ Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+ The pipeline also inherits the following loading methods:
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+ - [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+ - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+ text_encoder ([`~transformers.CLIPTextModel`]):
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+ text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]):
+ Second frozen text-encoder
+ ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
+ tokenizer ([`~transformers.CLIPTokenizer`]):
+ A `CLIPTokenizer` to tokenize text.
+ tokenizer_2 ([`~transformers.CLIPTokenizer`]):
+ A `CLIPTokenizer` to tokenize text.
+ unet ([`UNet2DConditionModel`]):
+ A `UNet2DConditionModel` to denoise the encoded image latents.
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+ Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+ ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+ additional conditioning.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+ Whether the negative prompt embeddings should always be set to 0. Also see the config of
+ `stabilityai/stable-diffusion-xl-base-1-0`.
+ add_watermarker (`bool`, *optional*):
+ Whether to use the [pp_invisible_watermark](https://github.com/junnyu/pp-invisible-watermark/) library to
+ watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no
+ watermarker is used.
+ """
+
+ # leave controlnet out on purpose because it iterates with unet
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
+ _optional_components = [
+ "tokenizer",
+ "tokenizer_2",
+ "text_encoder",
+ "text_encoder_2",
+ "feature_extractor",
+ "image_encoder",
+ ]
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ text_encoder_2: CLIPTextModelWithProjection,
+ tokenizer: CLIPTokenizer,
+ tokenizer_2: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
+ scheduler: KarrasDiffusionSchedulers,
+ force_zeros_for_empty_prompt: bool = True,
+ add_watermarker: Optional[bool] = None,
+ feature_extractor: CLIPImageProcessor = None,
+ image_encoder: CLIPVisionModelWithProjection = None,
+ ):
+ super().__init__()
+
+ if isinstance(controlnet, (list, tuple)):
+ controlnet = MultiControlNetModel(controlnet)
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ text_encoder_2=text_encoder_2,
+ tokenizer=tokenizer,
+ tokenizer_2=tokenizer_2,
+ unet=unet,
+ controlnet=controlnet,
+ scheduler=scheduler,
+ feature_extractor=feature_extractor,
+ image_encoder=image_encoder,
+ )
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
+ self.control_image_processor = VaeImageProcessor(
+ vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
+ )
+ add_watermarker = add_watermarker if add_watermarker is not None else is_pp_invisible_watermark_available()
+
+ if add_watermarker:
+ self.watermark = StableDiffusionXLWatermarker()
+ else:
+ self.watermark = None
+
+ self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
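+
+    # Note (illustrative): for the SDXL VAE, `block_out_channels` has 4 entries, so
+    # `vae_scale_factor` is 2 ** 3 = 8 and a 1024x1024 image corresponds to a 128x128 latent.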
+
+ # Copied from ppdiffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
+ def encode_prompt(
+ self,
+ prompt: str,
+ prompt_2: Optional[str] = None,
+ num_images_per_prompt: int = 1,
+ do_classifier_free_guidance: bool = True,
+ negative_prompt: Optional[str] = None,
+ negative_prompt_2: Optional[str] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ pooled_prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None,
+ lora_scale: Optional[float] = None,
+ clip_skip: Optional[int] = None,
+ ):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ prompt to be encoded
+ prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+ used in both text-encoders
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+ prompt_embeds (`paddle.Tensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ pooled_prompt_embeds (`paddle.Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+ input argument.
+ lora_scale (`float`, *optional*):
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+ clip_skip (`int`, *optional*):
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+ the output of the pre-final layer will be used for computing the prompt embeddings.
+ """
+ # set lora scale so that monkey patched LoRA
+ # function of text encoder can correctly access it
+ if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+ self._lora_scale = lora_scale
+
+ # dynamically adjust the LoRA scale
+ if self.text_encoder is not None:
+ if not USE_PEFT_BACKEND:
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+
+ if self.text_encoder_2 is not None:
+ if not USE_PEFT_BACKEND:
+ adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+
+ prompt = [prompt] if isinstance(prompt, str) else prompt
+
+ if prompt is not None:
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ # Define tokenizers and text encoders
+ tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
+ text_encoders = (
+ [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
+ )
+
+ if prompt_embeds is None:
+ prompt_2 = prompt_2 or prompt
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+ # textual inversion: process multi-vector tokens if necessary
+ prompt_embeds_list = []
+ prompts = [prompt, prompt_2]
+ for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
+ if isinstance(self, TextualInversionLoaderMixin):
+ prompt = self.maybe_convert_prompt(prompt, tokenizer)
+
+ text_inputs = tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="pd",
+ )
+
+ text_input_ids = text_inputs.input_ids
+ untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
+ text_input_ids, untruncated_ids
+ ):
+ removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {tokenizer.model_max_length} tokens: {removed_text}"
+ )
+
+ prompt_embeds = text_encoder(text_input_ids, output_hidden_states=True)
+
+ # We are ALWAYS interested only in the pooled output of the final text encoder
+ pooled_prompt_embeds = prompt_embeds[0]
+ if clip_skip is None:
+ prompt_embeds = prompt_embeds.hidden_states[-2]
+ else:
+ # "2" because SDXL always indexes from the penultimate layer.
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
+
+ prompt_embeds_list.append(prompt_embeds)
+
+ prompt_embeds = paddle.concat(prompt_embeds_list, axis=-1)
+
+ # get unconditional embeddings for classifier free guidance
+ zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
+ if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
+ negative_prompt_embeds = paddle.zeros_like(prompt_embeds)
+ negative_pooled_prompt_embeds = paddle.zeros_like(pooled_prompt_embeds)
+ elif do_classifier_free_guidance and negative_prompt_embeds is None:
+ negative_prompt = negative_prompt or ""
+ negative_prompt_2 = negative_prompt_2 or negative_prompt
+
+ # normalize str to list
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+ negative_prompt_2 = (
+ batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
+ )
+
+ uncond_tokens: List[str]
+ if prompt is not None and type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = [negative_prompt, negative_prompt_2]
+
+ negative_prompt_embeds_list = []
+ for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
+ if isinstance(self, TextualInversionLoaderMixin):
+ negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
+
+ max_length = prompt_embeds.shape[1]
+ uncond_input = tokenizer(
+ negative_prompt,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pd",
+ )
+
+ negative_prompt_embeds = text_encoder(
+ uncond_input.input_ids,
+ output_hidden_states=True,
+ )
+ # We are ALWAYS interested only in the pooled output of the final text encoder
+ negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+ negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
+
+ negative_prompt_embeds_list.append(negative_prompt_embeds)
+
+ negative_prompt_embeds = paddle.concat(negative_prompt_embeds_list, axis=-1)
+
+ if self.text_encoder_2 is not None:
+ prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder_2.dtype)
+ else:
+ prompt_embeds = prompt_embeds.cast(dtype=self.unet.dtype)
+
+ bs_embed, seq_len, _ = prompt_embeds.shape
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
+
+ if do_classifier_free_guidance:
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = negative_prompt_embeds.shape[1]
+
+ if self.text_encoder_2 is not None:
+ negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder_2.dtype)
+ else:
+ negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.unet.dtype)
+
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
+
+ pooled_prompt_embeds = pooled_prompt_embeds.tile([1, num_images_per_prompt]).reshape(
+ [bs_embed * num_images_per_prompt, -1]
+ )
+ if do_classifier_free_guidance:
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.tile([1, num_images_per_prompt]).reshape(
+ [bs_embed * num_images_per_prompt, -1]
+ )
+
+ return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
+
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+ def encode_image(self, image, num_images_per_prompt):
+ dtype = next(self.image_encoder.named_parameters())[1].dtype
+
+ if not isinstance(image, paddle.Tensor):
+ image = self.feature_extractor(image, return_tensors="pd").pixel_values
+
+ image = image.cast(dtype=dtype)
+ image_embeds = self.image_encoder(image).image_embeds
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, axis=0)
+
+ uncond_image_embeds = paddle.zeros_like(image_embeds)
+ return image_embeds, uncond_image_embeds
+
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+ def prepare_extra_step_kwargs(self, generator, eta):
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ return extra_step_kwargs
+
+ def check_inputs(
+ self,
+ prompt,
+ prompt_2,
+ image,
+ callback_steps,
+ negative_prompt=None,
+ negative_prompt_2=None,
+ prompt_embeds=None,
+ negative_prompt_embeds=None,
+ pooled_prompt_embeds=None,
+ negative_pooled_prompt_embeds=None,
+ controlnet_conditioning_scale=1.0,
+ control_guidance_start=0.0,
+ control_guidance_end=1.0,
+ callback_on_step_end_tensor_inputs=None,
+ ):
+ if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if callback_on_step_end_tensor_inputs is not None and not all(
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+ ):
+ raise ValueError(
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+ )
+
+ if prompt is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt_2 is not None and prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+ " only forward one of the two."
+ )
+ elif prompt is None and prompt_embeds is None:
+ raise ValueError(
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+ )
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+ if negative_prompt is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
+ raise ValueError(
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ )
+
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
+ raise ValueError(
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+ f" {negative_prompt_embeds.shape}."
+ )
+
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+ )
+
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+ raise ValueError(
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+ )
+
+ # `prompt` needs more sophisticated handling when there are multiple
+ # conditionings.
+ if isinstance(self.controlnet, MultiControlNetModel):
+ if isinstance(prompt, list):
+ logger.warning(
+ f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
+ " prompts. The conditionings will be fixed across the prompts."
+ )
+
+ # Check `image`
+ if isinstance(self.controlnet, ControlNetModel):
+ self.check_image(image, prompt, prompt_embeds)
+ elif isinstance(self.controlnet, MultiControlNetModel):
+ if not isinstance(image, list):
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+ # When `image` is a nested list:
+ # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+ elif any(isinstance(i, list) for i in image):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif len(image) != len(self.controlnet.nets):
+ raise ValueError(
+ f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+ )
+
+ for image_ in image:
+ self.check_image(image_, prompt, prompt_embeds)
+ else:
+ assert False
+
+ # Check `controlnet_conditioning_scale`
+ if isinstance(self.controlnet, ControlNetModel):
+ if not isinstance(controlnet_conditioning_scale, float):
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+ elif isinstance(self.controlnet, MultiControlNetModel):
+ if isinstance(controlnet_conditioning_scale, list):
+ if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+ raise ValueError("A single batch of multiple conditionings are supported at the moment.")
+ elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+ self.controlnet.nets
+ ):
+ raise ValueError(
+ "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+ " the same length as the number of controlnets"
+ )
+ else:
+ assert False
+
+ if not isinstance(control_guidance_start, (tuple, list)):
+ control_guidance_start = [control_guidance_start]
+
+ if not isinstance(control_guidance_end, (tuple, list)):
+ control_guidance_end = [control_guidance_end]
+
+ if len(control_guidance_start) != len(control_guidance_end):
+ raise ValueError(
+ f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+ )
+
+ if isinstance(self.controlnet, MultiControlNetModel):
+ if len(control_guidance_start) != len(self.controlnet.nets):
+ raise ValueError(
+ f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+ )
+
+ for start, end in zip(control_guidance_start, control_guidance_end):
+ if start >= end:
+ raise ValueError(
+ f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
+ )
+ if start < 0.0:
+ raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
+ if end > 1.0:
+ raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
+
+ # Copied from ppdiffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
+ def check_image(self, image, prompt, prompt_embeds):
+ image_is_pil = isinstance(image, PIL.Image.Image)
+ image_is_tensor = isinstance(image, paddle.Tensor)
+ image_is_np = isinstance(image, np.ndarray)
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor)
+ image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)
+
+ if (
+ not image_is_pil
+ and not image_is_tensor
+ and not image_is_np
+ and not image_is_pil_list
+ and not image_is_tensor_list
+ and not image_is_np_list
+ ):
+ raise TypeError(
+ f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}"
+ )
+
+ if image_is_pil:
+ image_batch_size = 1
+ else:
+ image_batch_size = len(image)
+
+ if prompt is not None and isinstance(prompt, str):
+ prompt_batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ prompt_batch_size = len(prompt)
+ elif prompt_embeds is not None:
+ prompt_batch_size = prompt_embeds.shape[0]
+
+ if image_batch_size != 1 and image_batch_size != prompt_batch_size:
+ raise ValueError(
+ f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
+ )
+
+ # Copied from ppdiffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
+ def prepare_image(
+ self,
+ image,
+ width,
+ height,
+ batch_size,
+ num_images_per_prompt,
+ dtype,
+ do_classifier_free_guidance=False,
+ guess_mode=False,
+ ):
+ image = self.control_image_processor.preprocess(image, height=height, width=width).cast(dtype=paddle.float32)
+ image_batch_size = image.shape[0]
+
+ if image_batch_size == 1:
+ repeat_by = batch_size
+ else:
+ # image batch size is the same as prompt batch size
+ repeat_by = num_images_per_prompt
+
+ image = image.repeat_interleave(repeat_by, axis=0)
+
+ image = image.cast(dtype=dtype)
+
+ if do_classifier_free_guidance and not guess_mode:
+ image = paddle.concat([image] * 2)
+
+ return image
+
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ if isinstance(generator, list) and len(generator) != batch_size:
+ raise ValueError(
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ )
+
+ if latents is None:
+ latents = randn_tensor(shape, generator=generator, dtype=dtype)
+ else:
+ latents = latents.cast(dtype)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+ return latents
+
+ # Copied from ppdiffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
+ def _get_add_time_ids(
+ self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
+ ):
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
+
+ passed_add_embed_dim = (
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
+ )
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+
+ if expected_add_embed_dim != passed_add_embed_dim:
+ raise ValueError(
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+ )
+
+ add_time_ids = paddle.to_tensor([add_time_ids], dtype=dtype)
+ return add_time_ids
+
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+ def upcast_vae(self):
+ dtype = self.vae.dtype
+ self.vae.to(dtype=paddle.float32)
+ use_paddle_2_5_or_ppxformers = isinstance(
+ self.vae.decoder.mid_block.attentions[0].processor,
+ (
+ AttnProcessor2_5,
+ XFormersAttnProcessor,
+ LoRAXFormersAttnProcessor,
+ LoRAAttnProcessor2_5,
+ ),
+ )
+ # if xformers or torch_2_0 is used attention block does not need
+ # to be in float32 which can save lots of memory
+ if use_paddle_2_5_or_ppxformers:
+ self.vae.post_quant_conv.to(dtype=dtype)
+ self.vae.decoder.conv_in.to(dtype=dtype)
+ self.vae.decoder.mid_block.to(dtype=dtype)
+
+ # Copied from ppdiffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=paddle.float32):
+ """
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+ Args:
+ w (`paddle.Tensor`):
+ guidance scale values at which to generate the embedding vectors
+ embedding_dim (`int`, *optional*, defaults to 512):
+ dimension of the embeddings to generate
+ dtype:
+ data type of the generated embeddings
+
+ Returns:
+ `paddle.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`
+ """
+ assert len(w.shape) == 1
+ w = w * 1000.0
+
+ half_dim = embedding_dim // 2
+ emb = paddle.log(paddle.to_tensor(10000.0)) / (half_dim - 1)
+ emb = paddle.exp(paddle.arange(half_dim, dtype=dtype) * -emb)
+ emb = w.cast(dtype=dtype)[:, None] * emb[None, :]
+ emb = paddle.concat([paddle.sin(emb), paddle.cos(emb)], axis=1)
+ if embedding_dim % 2 == 1:
+ # zero-pad the last column so the embedding reaches the requested (odd) dimension
+ emb = paddle.concat([emb, paddle.zeros([emb.shape[0], 1], dtype=emb.dtype)], axis=-1)
+ assert emb.shape == [w.shape[0], embedding_dim]
+ return emb
+
+ @property
+ def guidance_scale(self):
+ return self._guidance_scale
+
+ @property
+ def clip_skip(self):
+ return self._clip_skip
+
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ @property
+ def do_classifier_free_guidance(self):
+ return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
+
+ @property
+ def cross_attention_kwargs(self):
+ return self._cross_attention_kwargs
+
+ @property
+ def num_timesteps(self):
+ return self._num_timesteps
+
+ @paddle.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt: Union[str, List[str]] = None,
+ prompt_2: Optional[Union[str, List[str]]] = None,
+ image: PipelineImageInput = None,
+ height: Optional[int] = None,
+ width: Optional[int] = None,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 5.0,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ latents: Optional[paddle.Tensor] = None,
+ prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
+ pooled_prompt_embeds: Optional[paddle.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None,
+ ip_adapter_image: Optional[PipelineImageInput] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+ guess_mode: bool = False,
+ control_guidance_start: Union[float, List[float]] = 0.0,
+ control_guidance_end: Union[float, List[float]] = 1.0,
+ original_size: Tuple[int, int] = None,
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
+ target_size: Tuple[int, int] = None,
+ negative_original_size: Optional[Tuple[int, int]] = None,
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+ negative_target_size: Optional[Tuple[int, int]] = None,
+ clip_skip: Optional[int] = None,
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+ **kwargs,
+ ):
+ r"""
+ The call function to the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+ prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+ used in both text-encoders.
+ image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
+ `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
+ The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+ specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
+ accepted as an image. The dimensions of the output image default to `image`'s dimensions. If height
+ and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
+ `init`, images must be passed as a list such that each element of the list can be correctly batched for
+ input to a single ControlNet.
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+ The height in pixels of the generated image. Anything below 512 pixels won't work well for
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+ and checkpoints that are not specifically fine-tuned on low resolutions.
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+ The width in pixels of the generated image. Anything below 512 pixels won't work well for
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+ and checkpoints that are not specifically fine-tuned on low resolutions.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 5.0):
+ A higher guidance scale value encourages the model to generate images closely linked to the text
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2`
+ and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+ generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
+ A [`paddle.Generator`] to make generation deterministic.
+
+ latents (`paddle.Tensor`, *optional*):
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor is generated by sampling using the supplied random `generator`.
+ prompt_embeds (`paddle.Tensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+ provided, text embeddings are generated from the `prompt` input argument.
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+ pooled_prompt_embeds (`paddle.Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+ not provided, pooled text embeddings are generated from `prompt` input argument.
+ negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt
+ weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input
+ argument.
+ ip_adapter_image (`PipelineImageInput`, *optional*):
+ Optional image input to work with IP Adapters.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+ The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+ to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+ the corresponding scale as a list.
+ guess_mode (`bool`, *optional*, defaults to `False`):
+ The ControlNet encoder tries to recognize the content of the input image even if you remove all
+ prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+ The percentage of total steps at which the ControlNet starts applying.
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+ The percentage of total steps at which the ControlNet stops applying.
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+ explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+ micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+ To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
+ micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+ To negatively condition the generation process based on a target image resolution. It should be the same
+ as the `target_size` in most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ clip_skip (`int`, *optional*):
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+ the output of the pre-final layer will be used for computing the prompt embeddings.
+ callback_on_step_end (`Callable`, *optional*):
+ A function that is called at the end of each denoising step during inference. The function is called
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+ `callback_on_step_end_tensor_inputs`.
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+ will be passed as the `callback_kwargs` argument. You will only be able to include variables listed in the
+ `._callback_tensor_inputs` attribute of your pipeline class.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+ otherwise a `tuple` is returned containing the output images.
+ """
+
+ callback = kwargs.pop("callback", None)
+ callback_steps = kwargs.pop("callback_steps", None)
+
+ if callback is not None:
+ deprecate(
+ "callback",
+ "1.0.0",
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+ )
+ if callback_steps is not None:
+ deprecate(
+ "callback_steps",
+ "1.0.0",
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+ )
+
+ controlnet = self.controlnet
+
+ # align format for control guidance
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+ mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+ control_guidance_start, control_guidance_end = (
+ mult * [control_guidance_start],
+ mult * [control_guidance_end],
+ )
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ prompt_2,
+ image,
+ callback_steps,
+ negative_prompt,
+ negative_prompt_2,
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ controlnet_conditioning_scale,
+ control_guidance_start,
+ control_guidance_end,
+ callback_on_step_end_tensor_inputs,
+ )
+
+ self._guidance_scale = guidance_scale
+ self._clip_skip = clip_skip
+ self._cross_attention_kwargs = cross_attention_kwargs
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+ global_pool_conditions = (
+ controlnet.config.global_pool_conditions
+ if isinstance(controlnet, ControlNetModel)
+ else controlnet.nets[0].config.global_pool_conditions
+ )
+ guess_mode = guess_mode or global_pool_conditions
+
+ # 3.1 Encode input prompt
+ text_encoder_lora_scale = (
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+ )
+ (
+ prompt_embeds,
+ negative_prompt_embeds,
+ pooled_prompt_embeds,
+ negative_pooled_prompt_embeds,
+ ) = self.encode_prompt(
+ prompt,
+ prompt_2,
+ num_images_per_prompt,
+ self.do_classifier_free_guidance,
+ negative_prompt,
+ negative_prompt_2,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ pooled_prompt_embeds=pooled_prompt_embeds,
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+ lora_scale=text_encoder_lora_scale,
+ clip_skip=self.clip_skip,
+ )
+
+ # 3.2 Encode ip_adapter_image
+ if ip_adapter_image is not None:
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, num_images_per_prompt)
+ if self.do_classifier_free_guidance:
+ image_embeds = paddle.concat([negative_image_embeds, image_embeds])
+
+ # 4. Prepare image
+ if isinstance(controlnet, ControlNetModel):
+ image = self.prepare_image(
+ image=image,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
+ guess_mode=guess_mode,
+ )
+ height, width = image.shape[-2:]
+ elif isinstance(controlnet, MultiControlNetModel):
+ images = []
+
+ for image_ in image:
+ image_ = self.prepare_image(
+ image=image_,
+ width=width,
+ height=height,
+ batch_size=batch_size * num_images_per_prompt,
+ num_images_per_prompt=num_images_per_prompt,
+ dtype=controlnet.dtype,
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
+ guess_mode=guess_mode,
+ )
+
+ images.append(image_)
+
+ image = images
+ height, width = image[0].shape[-2:]
+ else:
+ assert False
+
+ # 5. Prepare timesteps
+ self.scheduler.set_timesteps(num_inference_steps)
+ timesteps = self.scheduler.timesteps
+ self._num_timesteps = len(timesteps)
+
+ # 6. Prepare latent variables
+ num_channels_latents = self.unet.config.in_channels
+ latents = self.prepare_latents(
+ batch_size * num_images_per_prompt,
+ num_channels_latents,
+ height,
+ width,
+ prompt_embeds.dtype,
+ generator,
+ latents,
+ )
+
+ # 6.5 Optionally get Guidance Scale Embedding
+ timestep_cond = None
+ if self.unet.config.time_cond_proj_dim is not None:
+ guidance_scale_tensor = paddle.to_tensor([self.guidance_scale - 1]).tile(
+ [
+ batch_size * num_images_per_prompt,
+ ]
+ )
+ timestep_cond = self.get_guidance_scale_embedding(
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+ ).cast(dtype=latents.dtype)
+
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+ # 7.1 Create tensor stating which controlnets to keep
+ controlnet_keep = []
+ for i in range(len(timesteps)):
+ keeps = [
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+ for s, e in zip(control_guidance_start, control_guidance_end)
+ ]
+ controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
+
+ # 7.2 Prepare added time ids & embeddings
+ if isinstance(image, list):
+ original_size = original_size or tuple(image[0].shape[-2:])
+ else:
+ original_size = original_size or tuple(image.shape[-2:])
+ target_size = target_size or (height, width)
+
+ add_text_embeds = pooled_prompt_embeds
+ if self.text_encoder_2 is None:
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+ else:
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+
+ add_time_ids = self._get_add_time_ids(
+ original_size,
+ crops_coords_top_left,
+ target_size,
+ dtype=prompt_embeds.dtype,
+ text_encoder_projection_dim=text_encoder_projection_dim,
+ )
+
+ if negative_original_size is not None and negative_target_size is not None:
+ negative_add_time_ids = self._get_add_time_ids(
+ negative_original_size,
+ negative_crops_coords_top_left,
+ negative_target_size,
+ dtype=prompt_embeds.dtype,
+ text_encoder_projection_dim=text_encoder_projection_dim,
+ )
+ else:
+ negative_add_time_ids = add_time_ids
+
+ if self.do_classifier_free_guidance:
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds], axis=0)
+ add_text_embeds = paddle.concat([negative_pooled_prompt_embeds, add_text_embeds], axis=0)
+ add_time_ids = paddle.concat([negative_add_time_ids, add_time_ids], axis=0)
+
+ add_time_ids = add_time_ids.tile([batch_size * num_images_per_prompt, 1])
+
+ # 8. Denoising loop
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = paddle.concat([latents] * 2) if self.do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
+ # controlnet(s) inference
+ if guess_mode and self.do_classifier_free_guidance:
+ # Infer ControlNet only for the conditional batch.
+ control_model_input = latents
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
+ controlnet_added_cond_kwargs = {
+ "text_embeds": add_text_embeds.chunk(2)[1],
+ "time_ids": add_time_ids.chunk(2)[1],
+ }
+ else:
+ control_model_input = latent_model_input
+ controlnet_prompt_embeds = prompt_embeds
+ controlnet_added_cond_kwargs = added_cond_kwargs
+
+ if isinstance(controlnet_keep[i], list):
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+ else:
+ controlnet_cond_scale = controlnet_conditioning_scale
+ if isinstance(controlnet_cond_scale, list):
+ controlnet_cond_scale = controlnet_cond_scale[0]
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
+ control_model_input,
+ t,
+ encoder_hidden_states=controlnet_prompt_embeds,
+ controlnet_cond=image,
+ conditioning_scale=cond_scale,
+ guess_mode=guess_mode,
+ added_cond_kwargs=controlnet_added_cond_kwargs,
+ return_dict=False,
+ )
+
+ if guess_mode and self.do_classifier_free_guidance:
+ # ControlNet was inferred only for the conditional batch.
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
+ # add 0 to the unconditional batch to keep it unchanged.
+ down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples]
+ mid_block_res_sample = paddle.concat(
+ [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]
+ )
+
+ if ip_adapter_image is not None:
+ added_cond_kwargs["image_embeds"] = image_embeds
+
+ # predict the noise residual
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ timestep_cond=timestep_cond,
+ cross_attention_kwargs=self.cross_attention_kwargs,
+ down_block_additional_residuals=down_block_res_samples,
+ mid_block_additional_residual=mid_block_res_sample,
+ added_cond_kwargs=added_cond_kwargs,
+ return_dict=False,
+ )[0]
+
+ # perform guidance
+ if self.do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+ if callback_on_step_end is not None:
+ callback_kwargs = {}
+ for k in callback_on_step_end_tensor_inputs:
+ callback_kwargs[k] = locals()[k]
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+ latents = callback_outputs.pop("latents", latents)
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()
+ if callback is not None and i % callback_steps == 0:
+ step_idx = i // getattr(self.scheduler, "order", 1)
+ callback(step_idx, t, latents)
+
+ # manually for max memory savings
+ # if self.vae.dtype in [paddle.float16, "float16"] and self.vae.config.force_upcast:
+ # self.upcast_vae()
+ # latents = latents.cast(dtype=next(iter(self.vae.post_quant_conv.named_parameters()))[1].dtype)
+
+ if not output_type == "latent":
+ # make sure the VAE is in float32 mode, as it overflows in float16
+ needs_upcasting = self.vae.dtype in [paddle.float16, "float16"] and self.vae.config.force_upcast
+
+ if needs_upcasting:
+ self.upcast_vae()
+ latents = latents.cast(dtype=next(iter(self.vae.post_quant_conv.named_parameters()))[1].dtype)
+
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+
+ # cast back to fp16 if needed
+ if needs_upcasting:
+ self.vae.to(dtype=paddle.float16)
+ else:
+ image = latents
+
+ if not output_type == "latent":
+ # apply watermark if available
+ if self.watermark is not None:
+ image = self.watermark.apply_watermark(image)
+
+ image = self.image_processor.postprocess(image, output_type=output_type)
+
+ if not return_dict:
+ return (image,)
+
+ return StableDiffusionXLPipelineOutput(images=image)
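For orientation, here is a minimal usage sketch of the SDXL ControlNet pipeline whose `__call__` is shown above. The pipeline class name, the model ids, and the conditioning-image URL are assumptions based on the publicly available SDXL ControlNet checkpoints, not something stated in this diff.

```python
import paddle

from ppdiffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
from ppdiffusers.utils import load_image

# Assumed model ids; any SDXL base model paired with an SDXL ControlNet should behave the same way.
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0", paddle_dtype=paddle.float16
)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, paddle_dtype=paddle.float16
)

# A pre-computed Canny edge map is used as the ControlNet conditioning image (assumed URL).
canny_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
)

prompt = "a colorful bird standing on a branch, highly detailed, 8k"
image = pipe(
    prompt,
    image=canny_image,
    num_inference_steps=30,
    controlnet_conditioning_scale=0.5,
).images[0]
image.save("sdxl_controlnet_canny_bird.png")
```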
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/dit/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/dit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..62eae6ad873171dc0f578593d9dabb88271c519f
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/dit/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import PPDIFFUSERS_SLOW_IMPORT, _LazyModule
+
+_import_structure = {"pipeline_dit": ["DiTPipeline"]}
+
+if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
+ from .pipeline_dit import DiTPipeline
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(
+ __name__,
+ globals()["__file__"],
+ _import_structure,
+ module_spec=__spec__,
+ )
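For context, here is a minimal sketch of how the lazily exported `DiTPipeline` is typically driven for class-conditional ImageNet generation. The checkpoint id and the `get_label_ids` helper follow the upstream diffusers API and are assumptions as far as this diff is concerned.

```python
import paddle

from ppdiffusers import DiTPipeline, DPMSolverMultistepScheduler

pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", paddle_dtype=paddle.float16)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

# DiT is class-conditional: map ImageNet class names to label ids first.
class_ids = pipe.get_label_ids(["golden retriever"])
image = pipe(class_labels=class_ids, num_inference_steps=25).images[0]
image.save("dit_golden_retriever.png")
```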
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ab67b337fc3f0f4ab5030f5ea3f81110734f191
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/__init__.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import (
+ PPDIFFUSERS_SLOW_IMPORT,
+ OptionalDependencyNotAvailable,
+ _LazyModule,
+ get_objects_from_module,
+ is_paddle_available,
+ is_paddlenlp_available,
+)
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+ if not (is_paddlenlp_available() and is_paddle_available()):
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ from ...utils import dummy_paddle_objects # noqa F403
+
+ _dummy_objects.update(get_objects_from_module(dummy_paddle_objects))
+else:
+ _import_structure["scheduling_karras_ve"] = ["KarrasVeScheduler"]
+ _import_structure["scheduling_sde_vp"] = ["ScoreSdeVpScheduler"]
+
+if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
+ try:
+ if not is_paddle_available():
+ raise OptionalDependencyNotAvailable()
+
+ except OptionalDependencyNotAvailable:
+ from ...utils.dummy_paddle_objects import * # noqa F403
+ else:
+ from .scheduling_karras_ve import KarrasVeScheduler
+ from .scheduling_sde_vp import ScoreSdeVpScheduler
+
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(
+ __name__,
+ globals()["__file__"],
+ _import_structure,
+ module_spec=__spec__,
+ )
+
+ for name, value in _dummy_objects.items():
+ setattr(sys.modules[__name__], name, value)
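A short sketch of what the lazy module above enables, assuming paddle and paddlenlp are installed: the deprecated schedulers remain importable from this submodule and behave like ordinary scheduler classes.

```python
from ppdiffusers.schedulers.deprecated import KarrasVeScheduler, ScoreSdeVpScheduler

# The first attribute access resolves the lazy import configured above.
karras = KarrasVeScheduler()
karras.set_timesteps(10)
print(karras.schedule)  # the sigma(t_i) noise schedule as a paddle tensor

sde_vp = ScoreSdeVpScheduler()
sde_vp.set_timesteps(10)
print(sde_vp.timesteps)  # continuous timesteps from 1 down to sampling_eps
```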
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_karras_ve.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_karras_ve.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d50991869b7224e45d57950da5f44dc369adeac
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_karras_ve.py
@@ -0,0 +1,243 @@
+# Copyright 2023 NVIDIA and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import paddle
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ...utils.paddle_utils import randn_tensor
+from ..scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class KarrasVeOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ derivative (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Derivative of predicted original image sample (x_0).
+ pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+ The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+ `pred_original_sample` can be used to preview progress or for guidance.
+ """
+
+ prev_sample: paddle.Tensor
+ derivative: paddle.Tensor
+ pred_original_sample: Optional[paddle.Tensor] = None
+
+
+class KarrasVeScheduler(SchedulerMixin, ConfigMixin):
+ """
+ A stochastic scheduler tailored to variance-expanding models.
+
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+ methods the library implements for all schedulers such as loading and saving.
+
+
+
+ For more details on the parameters, see [Appendix E](https://arxiv.org/abs/2206.00364). The grid search values used
+ to find the optimal `{s_noise, s_churn, s_min, s_max}` for a specific model are described in Table 5 of the paper.
+
+
+
+ Args:
+ sigma_min (`float`, defaults to 0.02):
+ The minimum noise magnitude.
+ sigma_max (`float`, defaults to 100):
+ The maximum noise magnitude.
+ s_noise (`float`, defaults to 1.007):
+ The amount of additional noise to counteract loss of detail during sampling. A reasonable range is [1.000,
+ 1.011].
+ s_churn (`float`, defaults to 80):
+ The parameter controlling the overall amount of stochasticity. A reasonable range is [0, 100].
+ s_min (`float`, defaults to 0.05):
+ The start value of the sigma range to add noise (enable stochasticity). A reasonable range is [0, 10].
+ s_max (`float`, defaults to 50):
+ The end value of the sigma range to add noise. A reasonable range is [0.2, 80].
+ """
+
+ order = 2
+
+ @register_to_config
+ def __init__(
+ self,
+ sigma_min: float = 0.02,
+ sigma_max: float = 100,
+ s_noise: float = 1.007,
+ s_churn: float = 80,
+ s_min: float = 0.05,
+ s_max: float = 50,
+ ):
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = sigma_max
+
+ # setable values
+ self.num_inference_steps: int = None
+ self.timesteps: paddle.Tensor = None
+ self.schedule: paddle.Tensor = None # sigma(t_i)
+
+ def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ sample (`paddle.Tensor`):
+ The input sample.
+ timestep (`int`, *optional*):
+ The current timestep in the diffusion chain.
+
+ Returns:
+ `paddle.Tensor`:
+ A scaled input sample.
+ """
+ return sample
+
+ def set_timesteps(self, num_inference_steps: int):
+ """
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+ Args:
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model.
+ """
+ self.num_inference_steps = num_inference_steps
+ timesteps = np.arange(0, self.num_inference_steps)[::-1].copy()
+ self.timesteps = paddle.to_tensor(timesteps)
+ schedule = [
+ (
+ self.config.sigma_max**2
+ * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1))
+ )
+ for i in self.timesteps
+ ]
+ self.schedule = paddle.to_tensor(schedule, dtype=paddle.float32)
+
+ def add_noise_to_input(
+ self, sample: paddle.Tensor, sigma: float, generator: Optional[paddle.Generator] = None
+ ) -> Tuple[paddle.Tensor, float]:
+ """
+ Explicit Langevin-like "churn" step of adding noise to the sample according to a `gamma_i ≥ 0` to reach a
+ higher noise level `sigma_hat = sigma_i + gamma_i*sigma_i`.
+
+ Args:
+ sample (`paddle.Tensor`):
+ The input sample.
+ sigma (`float`):
+ generator (`paddle.Generator`, *optional*):
+ A random number generator.
+ """
+ if self.config.s_min <= sigma <= self.config.s_max:
+ gamma = min(self.config.s_churn / self.num_inference_steps, 2**0.5 - 1)
+ else:
+ gamma = 0
+
+ # sample eps ~ N(0, S_noise^2 * I)
+ eps = self.config.s_noise * randn_tensor(sample.shape, generator=generator)
+ sigma_hat = sigma + gamma * sigma
+ sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps)
+
+ return sample_hat, sigma_hat
+
+ def step(
+ self,
+ model_output: paddle.Tensor,
+ sigma_hat: float,
+ sigma_prev: float,
+ sample_hat: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[KarrasVeOutput, Tuple]:
+ """
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`paddle.Tensor`):
+ The direct output from learned diffusion model.
+ sigma_hat (`float`):
+ sigma_prev (`float`):
+ sample_hat (`paddle.Tensor`):
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`.
+
+ Returns:
+ [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`:
+ If return_dict is `True`, [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] is returned,
+ otherwise a tuple is returned where the first element is the sample tensor.
+
+ """
+
+ pred_original_sample = sample_hat + sigma_hat * model_output
+ derivative = (sample_hat - pred_original_sample) / sigma_hat
+ sample_prev = sample_hat + (sigma_prev - sigma_hat) * derivative
+
+ if not return_dict:
+ return (sample_prev, derivative)
+
+ return KarrasVeOutput(
+ prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample
+ )
+
+ def step_correct(
+ self,
+ model_output: paddle.Tensor,
+ sigma_hat: float,
+ sigma_prev: float,
+ sample_hat: paddle.Tensor,
+ sample_prev: paddle.Tensor,
+ derivative: paddle.Tensor,
+ return_dict: bool = True,
+ ) -> Union[KarrasVeOutput, Tuple]:
+ """
+ Corrects the predicted sample based on the `model_output` of the network.
+
+ Args:
+ model_output (`paddle.Tensor`):
+ The direct output from learned diffusion model.
+ sigma_hat (`float`): TODO
+ sigma_prev (`float`): TODO
+ sample_hat (`paddle.Tensor`): TODO
+ sample_prev (`paddle.Tensor`): TODO
+ derivative (`paddle.Tensor`): TODO
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
+
+ Returns:
+ prev_sample (TODO): updated sample in the diffusion chain. derivative (TODO): TODO
+
+ """
+ pred_original_sample = sample_prev + sigma_prev * model_output
+ derivative_corr = (sample_prev - pred_original_sample) / sigma_prev
+ sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr)
+
+ if not return_dict:
+ return (sample_prev, derivative)
+
+ return KarrasVeOutput(
+ prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample
+ )
+
+ def add_noise(self, original_samples, noise, timesteps):
+ raise NotImplementedError()
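To make the interplay of `add_noise_to_input`, `step`, and `step_correct` concrete, below is a minimal sketch of the stochastic sampling loop from Karras et al. (2022) that this scheduler targets. The `dummy_denoiser` is a placeholder for a trained Karras-VE model returning an object with a `.sample` attribute; it is not part of the library.

```python
from types import SimpleNamespace

import paddle

from ppdiffusers.schedulers.deprecated import KarrasVeScheduler


def dummy_denoiser(x, sigma):
    # Placeholder for a trained Karras-VE UNet; here it simply predicts zeros.
    return SimpleNamespace(sample=paddle.zeros_like(x))


scheduler = KarrasVeScheduler()
scheduler.set_timesteps(num_inference_steps=20)

# start from pure noise at the maximum noise level
sample = paddle.randn([1, 3, 64, 64]) * scheduler.init_noise_sigma

for t in scheduler.timesteps:
    t = int(t)
    sigma = scheduler.schedule[t]
    sigma_prev = scheduler.schedule[t - 1] if t > 0 else 0.0

    # 1. stochastic "churn": raise the noise level from sigma to sigma_hat
    sample_hat, sigma_hat = scheduler.add_noise_to_input(sample, sigma)

    # 2. Euler step from sigma_hat down to sigma_prev
    model_output = (sigma_hat / 2) * dummy_denoiser((sample_hat + 1) / 2, sigma_hat / 2).sample
    step_output = scheduler.step(model_output, sigma_hat, sigma_prev, sample_hat)

    # 3. second-order correction on all but the final step
    if sigma_prev != 0:
        model_output = (sigma_prev / 2) * dummy_denoiser(
            (step_output.prev_sample + 1) / 2, sigma_prev / 2
        ).sample
        step_output = scheduler.step_correct(
            model_output,
            sigma_hat,
            sigma_prev,
            sample_hat,
            step_output.prev_sample,
            step_output.derivative,
        )

    sample = step_output.prev_sample
```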
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_sde_vp.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_sde_vp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c63036c9f4b894e8ca1f33701b9082e0606ff52e
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/deprecated/scheduling_sde_vp.py
@@ -0,0 +1,110 @@
+# Copyright 2023 Google Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch
+
+import math
+
+import paddle
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils.paddle_utils import randn_tensor
+from ..scheduling_utils import SchedulerMixin
+
+
+class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin):
+ """
+ `ScoreSdeVpScheduler` is a variance preserving stochastic differential equation (SDE) scheduler.
+
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+ methods the library implements for all schedulers such as loading and saving.
+
+ Args:
+ num_train_timesteps (`int`, defaults to 2000):
+ The number of diffusion steps to train the model.
+ beta_min (`float`, defaults to 0.1):
+ The initial beta value of the variance-preserving SDE.
+ beta_max (`float`, defaults to 20):
+ The final beta value of the variance-preserving SDE.
+ sampling_eps (`float`, defaults to 1e-3):
+ The end value of sampling where timesteps decrease progressively from 1 to epsilon.
+ """
+
+ order = 1
+
+ @register_to_config
+ def __init__(self, num_train_timesteps=2000, beta_min=0.1, beta_max=20, sampling_eps=1e-3):
+ self.sigmas = None
+ self.discrete_sigmas = None
+ self.timesteps = None
+
+ def set_timesteps(self, num_inference_steps):
+ """
+ Sets the continuous timesteps used for the diffusion chain (to be run before inference).
+
+ Args:
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model.
+ """
+ self.timesteps = paddle.linspace(1, self.config.sampling_eps, num_inference_steps)
+
+ def step_pred(self, score, x, t, generator=None):
+ """
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ score (`paddle.Tensor`):
+ The score (output of the learned diffusion model).
+ x (`paddle.Tensor`):
+ The current sample in the diffusion chain.
+ t (`paddle.Tensor`):
+ The current continuous timestep in the diffusion chain.
+ generator (`paddle.Generator`, *optional*):
+ A random number generator.
+ """
+ if self.timesteps is None:
+ raise ValueError(
+ "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ # TODO(Patrick) better comments + non-Paddle
+ # postprocess model score
+ log_mean_coeff = (
+ -0.25 * t**2 * (self.config.beta_max - self.config.beta_min) - 0.5 * t * self.config.beta_min
+ )
+ std = paddle.sqrt(1.0 - paddle.exp(2.0 * log_mean_coeff))
+ std = std.flatten()
+ while len(std.shape) < len(score.shape):
+ std = std.unsqueeze(-1)
+ score = -score / std
+
+ # compute the reverse-time SDE update (Euler-Maruyama step)
+ dt = -1.0 / len(self.timesteps)
+
+ beta_t = self.config.beta_min + t * (self.config.beta_max - self.config.beta_min)
+ beta_t = beta_t.flatten()
+ while len(beta_t.shape) < len(x.shape):
+ beta_t = beta_t.unsqueeze(-1)
+ drift = -0.5 * beta_t * x
+
+ diffusion = paddle.sqrt(beta_t)
+ drift = drift - diffusion**2 * score
+ x_mean = x + drift * dt
+
+ # add noise
+ noise = randn_tensor(x.shape, generator=generator, dtype=x.dtype)
+ x = x_mean + diffusion * math.sqrt(-dt) * noise
+
+ return x, x_mean
+
+ def __len__(self):
+ return self.config.num_train_timesteps
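A minimal sketch of how `ScoreSdeVpScheduler` is driven: `set_timesteps` fixes the continuous time grid and `step_pred` performs one Euler-Maruyama update of the reverse-time SDE per step. The `score_model` stand-in and the sample shape are hypothetical; only the scheduler calls come from the file above.

```python
import paddle

from ppdiffusers.schedulers.deprecated.scheduling_sde_vp import ScoreSdeVpScheduler


def score_model(x, t):
    # Hypothetical stand-in for a learned score network s_theta(x, t).
    return paddle.zeros_like(x)


scheduler = ScoreSdeVpScheduler()
scheduler.set_timesteps(num_inference_steps=100)

x = paddle.randn([4, 3, 32, 32])
for t in scheduler.timesteps:
    t_batch = paddle.full([x.shape[0]], float(t), dtype=x.dtype)  # one timestep value per sample
    score = score_model(x, t_batch)
    x, x_mean = scheduler.step_pred(score, x, t_batch)
# x_mean holds the final, noise-free estimate
```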
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecff93753b32dea4e0625006b6d457681611a8d6
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa
+
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ is_paddle_available,
+ is_scipy_available,
+)
+
+try:
+ if not is_paddle_available():
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ from ...utils.dummy_paddle_objects import * # noqa F403
+else:
+ from .preconfig_scheduling_euler_ancestral_discrete import (
+ PreconfigEulerAncestralDiscreteScheduler,
+ )
+try:
+ if not (is_paddle_available() and is_scipy_available()):
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ from ...utils.dummy_paddle_and_scipy_objects import * # noqa F403
+else:
+ from .preconfig_scheduling_lms_discrete import PreconfigLMSDiscreteScheduler
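A short sketch of what the guarded imports above mean in practice: the Paddle-only Euler ancestral variant always imports, while the LMS variant needs SciPy and otherwise resolves to a dummy placeholder object. The availability check below is only illustrative.

```python
from ppdiffusers.schedulers.preconfig import PreconfigEulerAncestralDiscreteScheduler
from ppdiffusers.utils import is_scipy_available

euler_a = PreconfigEulerAncestralDiscreteScheduler()  # requires only paddle

if is_scipy_available():
    from ppdiffusers.schedulers.preconfig import PreconfigLMSDiscreteScheduler

    lms = PreconfigLMSDiscreteScheduler()  # scipy.integrate is used to compute the LMS coefficients
else:
    print("scipy not installed: PreconfigLMSDiscreteScheduler is only a dummy placeholder")
```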
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..b45428de9e12c35acf25d98c53df0f773e1e5ed1
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py
@@ -0,0 +1,313 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import paddle
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput, logging, randn_tensor
+from ..scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+@dataclass
+# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerAncestralDiscrete
+class PreconfigEulerAncestralDiscreteSchedulerOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+ The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+ `pred_original_sample` can be used to preview progress or for guidance.
+ """
+
+ prev_sample: paddle.Tensor
+ pred_original_sample: Optional[paddle.Tensor] = None
+
+
+# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor:
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`paddle.Tensor`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return paddle.to_tensor(betas, dtype=paddle.float32)
+
+
+class PreconfigEulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
+ """
+ Ancestral sampling with Euler method steps. Based on the original k-diffusion implementation by Katherine Crowson:
+ https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+ [`~SchedulerMixin.from_pretrained`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear` or `scaled_linear`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ prediction_type (`str`, default `epsilon`, optional):
+ prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+ process), `sample` (directly predicting the noisy sample) or `v_prediction` (see section 2.4 of
+ https://imagen.research.google/video/paper.pdf)
+ preconfig (`bool`, defaults to `True`):
+ whether `set_timesteps` precomputes the per-step `sigma_up`/`sigma_down` values and the latent scales,
+ so that `scale_model_input` and `step` can look them up directly by `step_index`.
+
+ """
+
+ _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+ order = 1
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ preconfig: bool = True,
+ ):
+ if trained_betas is not None:
+ self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
+ elif beta_schedule == "linear":
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2
+ )
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
+
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+ self.sigmas = paddle.to_tensor(sigmas)
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = self.sigmas.max()
+
+ # setable values
+ self.num_inference_steps = None
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
+ self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
+ self.is_scale_input_called = False
+ self.preconfig = preconfig
+ self.step_index_offset = 0
+
+ def scale_model_input(
+ self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs
+ ) -> paddle.Tensor:
+ """
+ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+ Args:
+ sample (`paddle.Tensor`): input sample
+ timestep (`float` or `paddle.Tensor`): the current timestep in the diffusion chain
+
+ Returns:
+ `paddle.Tensor`: scaled input sample
+ """
+ self.is_scale_input_called = True
+ if kwargs.get("step_index") is not None:
+ step_index = kwargs["step_index"] + self.step_index_offset
+ else:
+ step_index = (self.timesteps == timestep).nonzero().item()
+
+ if not self.preconfig:
+ sigma = self.sigmas[step_index]
+ sample = sample / ((sigma**2 + 1) ** 0.5)
+ return sample
+ else:
+ if step_index > (len(self.latent_scales) - 1):
+ step_index = -1
+ return sample * self.latent_scales[step_index]
+
+ def set_timesteps(self, num_inference_steps: int):
+ """
+ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ """
+ self.num_inference_steps = num_inference_steps
+ self.step_index_offset = 0
+
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+ sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+ self.sigmas = paddle.to_tensor(sigmas)
+ self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
+ if self.preconfig:
+ self.sigma_up = []
+ self.sigma_down = []
+ for step_index_i in range(len(self.timesteps)):
+ sigma_from = self.sigmas[step_index_i]
+ sigma_to = self.sigmas[step_index_i + 1]
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
+ self.sigma_up.append(sigma_up)
+ self.sigma_down.append(sigma_down)
+ self.latent_scales = 1 / ((self.sigmas**2 + 1) ** 0.5)
+
+ def step(
+ self,
+ model_output: paddle.Tensor,
+ timestep: Union[float, paddle.Tensor],
+ sample: paddle.Tensor,
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
+ return_dict: bool = True,
+ **kwargs
+ ) -> Union[PreconfigEulerAncestralDiscreteSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`paddle.Tensor`): direct output from learned diffusion model.
+ timestep (`float`): current timestep in the diffusion chain.
+ sample (`paddle.Tensor`):
+ current instance of sample being created by diffusion process.
+ generator (`paddle.Generator`, optional): Random number generator.
+ return_dict (`bool`): option for returning tuple rather than PreconfigEulerAncestralDiscreteSchedulerOutput class
+ step_index (`int`, *optional*, passed via kwargs): index of the current step in the denoising loop; when given, the timestep lookup is skipped.
+ return_pred_original_sample (`bool`, *optional*, passed via kwargs): whether to compute and return `pred_original_sample`.
+
+ Returns:
+ [`~schedulers.scheduling_utils.PreconfigEulerAncestralDiscreteSchedulerOutput`] or `tuple`:
+ [`~schedulers.scheduling_utils.PreconfigEulerAncestralDiscreteSchedulerOutput`] if `return_dict` is True, otherwise
+ a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ if not self.is_scale_input_called:
+ logger.warning(
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+ "See `StableDiffusionPipeline` for a usage example."
+ )
+ if kwargs.get("return_pred_original_sample") is not None:
+ return_pred_original_sample = kwargs["return_pred_original_sample"]
+ else:
+ return_pred_original_sample = True
+ if kwargs.get("step_index") is not None:
+ step_index = kwargs["step_index"] + self.step_index_offset
+ else:
+ step_index = (self.timesteps == timestep).nonzero().item()
+ sigma = self.sigmas[step_index]
+ if self.config.prediction_type == "epsilon" and not return_pred_original_sample:
+ derivative = model_output
+ pred_original_sample = None
+ else:
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+ if self.config.prediction_type == "epsilon":
+ pred_original_sample = sample - sigma * model_output
+ elif self.config.prediction_type == "v_prediction":
+ # * c_out + input * c_skip
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
+ elif self.config.prediction_type == "sample":
+ raise NotImplementedError("prediction_type not implemented yet: sample")
+ else:
+ raise ValueError(
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+ )
+ derivative = (sample - pred_original_sample) / sigma
+ if not self.preconfig:
+ sigma_from = self.sigmas[step_index]
+ sigma_to = self.sigmas[step_index + 1]
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
+ else:
+ sigma_up = self.sigma_up[step_index]
+ sigma_down = self.sigma_down[step_index]
+ # 2. Convert to an ODE derivative
+ dt = sigma_down - sigma
+ prev_sample = sample + derivative * dt
+ noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator)
+ prev_sample = prev_sample + noise * sigma_up
+ if not return_dict:
+ if not return_pred_original_sample:
+ return (prev_sample,)
+ else:
+ return (prev_sample, pred_original_sample)
+
+ return PreconfigEulerAncestralDiscreteSchedulerOutput(
+ prev_sample=prev_sample, pred_original_sample=pred_original_sample
+ )
+
+ def add_noise(
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
+ # Fix 0D tensor
+ if paddle.is_tensor(timesteps) and timesteps.ndim == 0:
+ timesteps = timesteps.unsqueeze(0)
+ # Make sure sigmas and timesteps have the same dtype as original_samples
+ self.sigmas = self.sigmas.cast(original_samples.dtype)
+
+ schedule_timesteps = self.timesteps
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+ sigma = self.sigmas[step_indices].flatten()
+ while len(sigma.shape) < len(original_samples.shape):
+ sigma = sigma.unsqueeze(-1)
+
+ noisy_samples = original_samples + noise * sigma
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
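A minimal usage sketch of the preconfig fast path defined above: `set_timesteps` precomputes `sigma_up`/`sigma_down` and the latent scales once, and the loop passes an explicit `step_index` so that per-step timestep lookups are skipped. The `denoiser` stand-in and the latent shape are assumptions for illustration.

```python
import paddle

from ppdiffusers.schedulers.preconfig import PreconfigEulerAncestralDiscreteScheduler


def denoiser(x, t):
    # Hypothetical stand-in for a UNet noise prediction.
    return paddle.zeros_like(x)


scheduler = PreconfigEulerAncestralDiscreteScheduler(preconfig=True)
scheduler.set_timesteps(num_inference_steps=20)  # caches sigma_up, sigma_down and latent_scales

latents = paddle.randn([1, 4, 64, 64]) * scheduler.init_noise_sigma
for i, t in enumerate(scheduler.timesteps):
    # passing step_index avoids the `(self.timesteps == t).nonzero()` lookup on every call
    model_input = scheduler.scale_model_input(latents, t, step_index=i)
    noise_pred = denoiser(model_input, t)
    latents = scheduler.step(
        noise_pred, t, latents, step_index=i, return_pred_original_sample=False
    ).prev_sample
```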
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..450dcb635843e07edd7737d2230b9c6ab7502cd3
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py
@@ -0,0 +1,340 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import warnings
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import paddle
+from scipy import integrate
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ..scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
+
+
+@dataclass
+# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->LMSDiscrete
+class PreconfigLMSDiscreteSchedulerOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
+ The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+ `pred_original_sample` can be used to preview progress or for guidance.
+ """
+
+ prev_sample: paddle.Tensor
+ pred_original_sample: Optional[paddle.Tensor] = None
+
+
+# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`paddle.Tensor`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return paddle.to_tensor(betas, dtype=paddle.float32)
+
+
+class PreconfigLMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
+ """
+ Linear Multistep Scheduler for discrete beta schedules. Based on the original k-diffusion implementation by
+ Katherine Crowson:
+ https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+ [`~SchedulerMixin.from_pretrained`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear` or `scaled_linear`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ prediction_type (`str`, default `epsilon`, optional):
+ prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+ process), `sample` (directly predicting the noisy sample) or `v_prediction` (see section 2.4 of
+ https://imagen.research.google/video/paper.pdf)
+ preconfig (`bool`, defaults to `True`):
+ whether `set_timesteps` precomputes the linear multistep coefficients and the latent scales, so that
+ `step` only has to sum the cached derivatives.
+ """
+
+ _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+ order = 1
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
+ prediction_type: str = "epsilon",
+ preconfig=True,
+ ):
+ if trained_betas is not None:
+ self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
+ elif beta_schedule == "linear":
+ self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2
+ )
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = paddle.cumprod(self.alphas, 0)
+
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+ self.sigmas = paddle.to_tensor(sigmas)
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = self.sigmas.max()
+
+ # setable values
+ self.num_inference_steps = None
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
+ self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
+ self.derivatives = []
+ self.is_scale_input_called = False
+ self.preconfig = preconfig
+
+ def scale_model_input(
+ self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs
+ ) -> paddle.Tensor:
+ """
+ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm.
+
+ Args:
+ sample (`paddle.Tensor`): input sample
+ timestep (`float` or `paddle.Tensor`): the current timestep in the diffusion chain
+
+ Returns:
+ `paddle.Tensor`: scaled input sample
+ """
+ if kwargs.get("step_index") is not None:
+ step_index = kwargs["step_index"]
+ else:
+ step_index = (self.timesteps == timestep).nonzero().item()
+ self.is_scale_input_called = True
+ if not self.preconfig:
+ sigma = self.sigmas[step_index]
+ sample = sample / ((sigma**2 + 1) ** 0.5)
+ return sample
+ else:
+ return sample * self.latent_scales[step_index]
+
+ def get_lms_coefficient(self, order, t, current_order):
+ """
+ Compute a linear multistep coefficient.
+
+ Args:
+ order (`int`): the order of the linear multistep method.
+ t (`int`): the index of the current timestep in the sigma schedule.
+ current_order (`int`): the order of the derivative the coefficient is computed for.
+ """
+
+ def lms_derivative(tau):
+ prod = 1.0
+ for k in range(order):
+ if current_order == k:
+ continue
+ prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k])
+ return prod
+
+ integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
+
+ return integrated_coeff
+
+ def set_timesteps(self, num_inference_steps: int, preconfig_order: int = 4):
+ """
+ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ preconfig_order (`int`, defaults to 4):
+ the order of the linear multistep method whose coefficients are precomputed when `preconfig` is enabled.
+ """
+ self.num_inference_steps = num_inference_steps
+
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+ sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+ self.sigmas = paddle.to_tensor(sigmas)
+ self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32)
+
+ self.derivatives = []
+ if self.preconfig:
+ self.order = preconfig_order
+ self.lms_coeffs = []
+ self.latent_scales = [1.0 / ((sigma**2 + 1) ** 0.5) for sigma in self.sigmas]
+ for step_index in range(self.num_inference_steps):
+ order = min(step_index + 1, preconfig_order)
+ self.lms_coeffs.append(
+ [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)]
+ )
+
+ def step(
+ self,
+ model_output: paddle.Tensor,
+ timestep: Union[float, paddle.Tensor],
+ sample: paddle.Tensor,
+ order: int = 4,
+ return_dict: bool = True,
+ **kwargs
+ ) -> Union[PreconfigLMSDiscreteSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`paddle.Tensor`): direct output from learned diffusion model.
+ timestep (`float`): current timestep in the diffusion chain.
+ sample (`paddle.Tensor`):
+ current instance of sample being created by diffusion process.
+ order (`int`): the order of the linear multistep method used for this step.
+ return_dict (`bool`): option for returning tuple rather than PreconfigLMSDiscreteSchedulerOutput class
+ Args in kwargs:
+ step_index (`int`): index of the current step in the denoising loop; when given, the timestep lookup is skipped.
+ return_pred_original_sample (`bool`): whether to compute and return `pred_original_sample`.
+
+ Returns:
+ [`~schedulers.scheduling_utils.PreconfigLMSDiscreteSchedulerOutput`] or `tuple`:
+ [`~schedulers.scheduling_utils.PreconfigLMSDiscreteSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is the sample tensor.
+
+ """
+ if not self.is_scale_input_called:
+ warnings.warn(
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+ "See `StableDiffusionPipeline` for a usage example."
+ )
+ if kwargs.get("return_pred_original_sample") is not None:
+ return_pred_original_sample = kwargs["return_pred_original_sample"]
+ else:
+ return_pred_original_sample = True
+ if kwargs.get("step_index") is not None:
+ step_index = kwargs["step_index"]
+ else:
+ step_index = (self.timesteps == timestep).nonzero().item()
+ if self.config.prediction_type == "epsilon" and not return_pred_original_sample:
+ # pred_original_sample is not needed, so skip computing it
+ self.derivatives.append(model_output)
+ pred_original_sample = None
+ else:
+ sigma = self.sigmas[step_index]
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+ if self.config.prediction_type == "epsilon":
+ pred_original_sample = sample - sigma * model_output
+ elif self.config.prediction_type == "v_prediction":
+ # * c_out + input * c_skip
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
+ elif self.config.prediction_type == "sample":
+ pred_original_sample = model_output
+ else:
+ raise ValueError(
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+ )
+ # 2. Convert to an ODE derivative
+ derivative = (sample - pred_original_sample) / sigma
+ self.derivatives.append(derivative)
+
+ if len(self.derivatives) > order:
+ self.derivatives.pop(0)
+
+ if not self.preconfig:
+ # 3. If not preconfigured, compute the linear multistep coefficients on the fly.
+ order = min(step_index + 1, order)
+ lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)]
+ # 4. Compute previous sample based on the derivatives path
+ prev_sample = sample + sum(
+ coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives))
+ )
+ else:
+ # 3. If preconfigured, directly compute the previous sample from the cached coefficients and the derivatives path
+ prev_sample = sample + sum(
+ coeff * derivative
+ for coeff, derivative in zip(self.lms_coeffs[step_index], reversed(self.derivatives))
+ )
+
+ if not return_dict:
+ if not return_pred_original_sample:
+ return (prev_sample,)
+ else:
+ return (prev_sample, pred_original_sample)
+
+ return PreconfigLMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+ def add_noise(
+ self,
+ original_samples: paddle.Tensor,
+ noise: paddle.Tensor,
+ timesteps: paddle.Tensor,
+ ) -> paddle.Tensor:
+ # Fix 0D tensor
+ if paddle.is_tensor(timesteps) and timesteps.ndim == 0:
+ timesteps = timesteps.unsqueeze(0)
+ # Make sure sigmas and timesteps have the same dtype as original_samples
+ sigmas = self.sigmas.cast(original_samples.dtype)
+ schedule_timesteps = self.timesteps
+
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+ sigma = sigmas[step_indices].flatten()
+ while len(sigma.shape) < len(original_samples.shape):
+ sigma = sigma.unsqueeze(-1)
+
+ noisy_samples = original_samples + noise * sigma
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
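The same pattern applies to the preconfigured LMS scheduler above: `set_timesteps` caches the linear multistep coefficients and latent scales, so `step` only sums the stored derivatives. A minimal sketch, with a hypothetical `denoiser` stand-in:

```python
import paddle

from ppdiffusers.schedulers.preconfig import PreconfigLMSDiscreteScheduler


def denoiser(x, t):
    # Hypothetical stand-in for a UNet noise prediction.
    return paddle.zeros_like(x)


scheduler = PreconfigLMSDiscreteScheduler(preconfig=True)
scheduler.set_timesteps(num_inference_steps=20, preconfig_order=4)  # caches LMS coefficients and latent scales

latents = paddle.randn([1, 4, 64, 64]) * scheduler.init_noise_sigma
for i, t in enumerate(scheduler.timesteps):
    model_input = scheduler.scale_model_input(latents, t, step_index=i)
    noise_pred = denoiser(model_input, t)
    latents = scheduler.step(
        noise_pred, t, latents, step_index=i, return_pred_original_sample=False
    ).prev_sample
```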
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/__init__.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f82ac7ab81aa87600e3dfab5ecd9550fee617c4f
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/__init__.py
@@ -0,0 +1,218 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from pathlib import Path
+from typing import Dict, Literal, Optional, Union
+
+from huggingface_hub.utils import (
+ EntryNotFoundError,
+ LocalEntryNotFoundError,
+ RepositoryNotFoundError,
+ RevisionNotFoundError,
+)
+from requests import HTTPError
+
+from .aistudio_hub_download import (
+ aistudio_hub_download,
+ aistudio_hub_file_exists,
+ aistudio_hub_try_to_load_from_cache,
+)
+from .bos_download import bos_download, bos_file_exists, bos_try_to_load_from_cache
+from .hf_hub_download import (
+ hf_hub_download,
+ hf_hub_file_exists,
+ hf_hub_try_to_load_from_cache,
+)
+
+
+def bos_aistudio_hf_download(
+ repo_id: str = None,
+ filename: str = None,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ cache_dir: Union[str, Path, None] = None,
+ local_dir: Union[str, Path, None] = None,
+ local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
+ user_agent: Union[Dict, str, None] = None,
+ force_download: bool = False,
+ proxies: Optional[Dict] = None,
+ etag_timeout: float = 10,
+ resume_download: bool = False,
+ token: Union[bool, str, None] = None,
+ local_files_only: bool = False,
+ endpoint: Optional[str] = None,
+ url: Optional[str] = None,
+ from_bos: bool = True,
+ from_aistudio: bool = False,
+ from_hf_hub: bool = False,
+) -> str:
+ assert repo_id is not None, "repo_id cannot be None"
+ assert filename is not None, "filename cannot be None"
+
+ download_kwargs = dict(
+ repo_id=repo_id,
+ filename=filename,
+ subfolder=subfolder if subfolder is not None else "",
+ repo_type=repo_type,
+ revision=revision,
+ library_name=library_name,
+ library_version=library_version,
+ cache_dir=cache_dir,
+ local_dir=local_dir,
+ local_dir_use_symlinks=local_dir_use_symlinks,
+ user_agent=user_agent,
+ force_download=force_download,
+ proxies=proxies,
+ etag_timeout=etag_timeout,
+ resume_download=resume_download,
+ token=token,
+ local_files_only=local_files_only,
+ endpoint=endpoint,
+ )
+ cached_file = None
+ log_endpoint = "N/A"
+ log_filename = os.path.join(download_kwargs["subfolder"], filename)
+ try:
+ if from_aistudio:
+ log_endpoint = "Aistudio Hub"
+ cached_file = aistudio_hub_download(
+ **download_kwargs,
+ )
+ elif from_hf_hub:
+ log_endpoint = "Huggingface Hub"
+ cached_file = hf_hub_download(
+ **download_kwargs,
+ )
+ else:
+ log_endpoint = "BOS"
+ download_kwargs["url"] = url
+ cached_file = bos_download(
+ **download_kwargs,
+ )
+ except LocalEntryNotFoundError:
+ raise EnvironmentError(
+ "Cannot find the requested files in the cached path and"
+ " outgoing traffic has been disabled. To enable model look-ups"
+ " and downloads online, set 'local_files_only' to False."
+ )
+ except RepositoryNotFoundError:
+ raise EnvironmentError(
+ f"{repo_id} is not a local folder and is not a valid model identifier "
+ f"listed on '{log_endpoint}'\nIf this is a private repository, make sure to pass a "
+ "token having permission to this repo."
+ )
+ except RevisionNotFoundError:
+ raise EnvironmentError(
+ f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
+ "this model name. Check the model page at "
+ f"'{log_endpoint}' for available revisions."
+ )
+ except EntryNotFoundError:
+ raise EnvironmentError(f"{repo_id} does not appear to have a file named {log_filename}.")
+ except HTTPError as err:
+ raise EnvironmentError(f"There was a specific connection error when trying to load {repo_id}:\n{err}")
+ except ValueError:
+ raise EnvironmentError(
+ f"We couldn't connect to '{log_endpoint}' to load this model, couldn't find it"
+ f" in the cached files and it looks like {repo_id} is not the path to a"
+ f" directory containing a file named {log_filename} or"
+ " \nCheckout your internet connection or see how to run the library in offline mode."
+ )
+ except EnvironmentError:
+ raise EnvironmentError(
+ f"Can't load the model for '{repo_id}'. If you were trying to load it from "
+ f"'{log_endpoint}', make sure you don't have a local directory with the same name. "
+ f"Otherwise, make sure '{repo_id}' is the correct path to a directory "
+ f"containing a file named {log_filename}"
+ )
+ return cached_file
+
+
+def bos_aistudio_hf_file_exist(
+ repo_id: str = None,
+ filename: str = None,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ token: Optional[str] = None,
+ endpoint: Optional[str] = None,
+ from_bos: bool = True,
+ from_aistudio: bool = False,
+ from_hf_hub: bool = False,
+):
+ assert repo_id is not None, "repo_id cannot be None"
+ assert filename is not None, "filename cannot be None"
+
+ if subfolder is None:
+ subfolder = ""
+ filename = os.path.join(subfolder, filename)
+ if from_aistudio:
+ out = aistudio_hub_file_exists(
+ repo_id=repo_id,
+ filename=filename,
+ repo_type=repo_type,
+ revision=revision,
+ token=token,
+ endpoint=endpoint,
+ )
+ elif from_hf_hub:
+ out = hf_hub_file_exists(
+ repo_id=repo_id,
+ filename=filename,
+ repo_type=repo_type,
+ revision=revision,
+ token=token,
+ )
+ else:
+ out = bos_file_exists(
+ repo_id=repo_id,
+ filename=filename,
+ repo_type=repo_type,
+ revision=revision,
+ token=token,  # a token is not required for BOS downloads
+ endpoint=endpoint,
+ )
+ return out
+
+
+def bos_aistudio_hf_try_to_load_from_cache(
+ repo_id: str,
+ filename: str,
+ cache_dir: Union[str, Path, None] = None,
+ subfolder: str = None,
+ revision: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ from_bos: bool = True,
+ from_aistudio: bool = False,
+ from_hf_hub: bool = False,
+):
+ if subfolder is None:
+ subfolder = ""
+ load_kwargs = dict(
+ repo_id=repo_id,
+ filename=os.path.join(subfolder, filename),
+ cache_dir=cache_dir,
+ revision=revision,
+ repo_type=repo_type,
+ )
+ if from_aistudio:
+ return aistudio_hub_try_to_load_from_cache(**load_kwargs)
+ elif from_hf_hub:
+ return hf_hub_try_to_load_from_cache(**load_kwargs)
+ else:
+ return bos_try_to_load_from_cache(**load_kwargs)
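A hedged sketch of calling the unified helpers above, routed to the Hugging Face Hub backend (BOS is the default; AI Studio is selected with `from_aistudio=True`). The repo id and filename below are only illustrative.

```python
from ppdiffusers.utils.downloader import (
    bos_aistudio_hf_download,
    bos_aistudio_hf_file_exist,
)

repo_id = "runwayml/stable-diffusion-v1-5"  # illustrative repo id
filename = "model_index.json"               # illustrative filename

if bos_aistudio_hf_file_exist(repo_id, filename, from_hf_hub=True):
    local_path = bos_aistudio_hf_download(repo_id=repo_id, filename=filename, from_hf_hub=True)
    print("file cached at:", local_path)
```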
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/aistudio_hub_download.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/aistudio_hub_download.py
new file mode 100644
index 0000000000000000000000000000000000000000..de8f4bc00cb4db6efbdb36248877f452e216434f
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/aistudio_hub_download.py
@@ -0,0 +1,729 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+import logging
+import os
+import re
+import shutil
+import tempfile
+from contextlib import contextmanager
+from functools import partial
+from pathlib import Path
+from typing import Dict, Generator, Literal, Optional, Union
+from urllib.parse import quote
+
+import requests
+from filelock import FileLock
+from huggingface_hub.utils import (
+ EntryNotFoundError,
+ FileMetadataError,
+ GatedRepoError,
+ HfHubHTTPError,
+ LocalEntryNotFoundError,
+ RepositoryNotFoundError,
+ RevisionNotFoundError,
+)
+
+logger = logging.getLogger(__name__)
+
+from .common import (
+ _CACHED_NO_EXIST,
+ DEFAULT_ETAG_TIMEOUT,
+ DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD,
+ DEFAULT_REQUEST_TIMEOUT,
+ AistudioBosFileMetadata,
+ OfflineModeIsEnabled,
+ _cache_commit_hash_for_specific_revision,
+ _check_disk_space,
+ _chmod_and_replace,
+ _create_symlink,
+ _get_pointer_path,
+ _is_true,
+ _normalize_etag,
+ _request_wrapper,
+ _to_local_dir,
+ http_get,
+ raise_for_status,
+ repo_folder_name,
+)
+
+VERSION = "0.1.5"
+ENDPOINT = os.getenv("AISTUDIO_ENDPOINT", "http://git.aistudio.baidu.com")
+
+AISTUDIO_URL_TEMPLATE = ENDPOINT + "/api/v1/repos/{user_name}/{repo_name}/contents/{filename}"
+
+
+default_home = os.path.join(os.path.expanduser("~"), ".cache")
+AISTUDIO_HOME = os.path.expanduser(
+ os.getenv(
+ "AISTUDIO_HOME",
+ os.path.join(os.getenv("XDG_CACHE_HOME", default_home), "paddle"),
+ )
+)
+default_cache_path = os.path.join(AISTUDIO_HOME, "aistudio")
+AISTUDIO_HUB_CACHE = os.getenv("AISTUDIO_HUB_CACHE", default_cache_path)
+
+
+DEFAULT_REVISION = "master"
+REPO_TYPE_MODEL = "model"
+REPO_TYPES = [None, REPO_TYPE_MODEL]
+
+
+REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
+
+
+# TOKEN
+AISTUDIO_TOKEN_PATH = os.path.join(AISTUDIO_HOME, "token")
+AISTUDIO_HUB_DISABLE_IMPLICIT_TOKEN: bool = _is_true(os.environ.get("AISTUDIO_HUB_DISABLE_IMPLICIT_TOKEN"))
+
+
+class LocalTokenNotFoundError(EnvironmentError):
+ """Raised if local token is required but not found."""
+
+
+def _clean_token(token: Optional[str]) -> Optional[str]:
+ """Clean token by removing trailing and leading spaces and newlines.
+
+ If token is an empty string, return None.
+ """
+ if token is None:
+ return None
+ return token.replace("\r", "").replace("\n", "").strip() or None
+
+
+def _get_token_from_environment() -> Optional[str]:
+ return _clean_token(os.environ.get("AISTUDIO_ACCESS_TOKEN") or os.environ.get("AISTUDIO_TOKEN"))
+
+
+def _get_token_from_file() -> Optional[str]:
+ try:
+ return _clean_token(Path(AISTUDIO_TOKEN_PATH).read_text())
+ except FileNotFoundError:
+ return None
+
+
+def get_token() -> Optional[str]:
+ """
+ Get token if user is logged in.
+
+ Note: in most cases, you should use [`build_aistudio_headers`] instead. This method is only useful
+ if you want to retrieve the token for other purposes than sending an HTTP request.
+
+ Token is retrieved in priority from the `AISTUDIO_ACCESS_TOKEN` environment variable. Otherwise, we read the token file located
+ in the Aistudio home folder. Returns None if user is not logged in.
+
+ Returns:
+ `str` or `None`: The token, `None` if it doesn't exist.
+ """
+ return _get_token_from_environment() or _get_token_from_file()
+
+
+def get_token_to_send(token: Optional[Union[bool, str]]) -> Optional[str]:
+ """Select the token to send from either `token` or the cache."""
+ # Case token is explicitly provided
+ if isinstance(token, str):
+ return token
+
+ # Case token is explicitly forbidden
+ if token is False:
+ return None
+
+ # Token is not provided: we get it from local cache
+ cached_token = get_token()
+
+ # Case token is explicitly required
+ if token is True:
+ if cached_token is None:
+ raise LocalTokenNotFoundError(
+ "Token is required (`token=True`), but no token found. You"
+ " to provide a token or be logged in to Aistudio Hub . See"
+ "https://ai.baidu.com/ai-doc/AISTUDIO/slmkadt9z#2-%E5%A6%82%E4%BD%95%E4%BD%BF%E7%94%A8%E8%AE%BF%E9%97%AE%E4%BB%A4%E7%89%8C."
+ )
+ return cached_token
+
+ # Case implicit use of the token is forbidden by env variable
+ if AISTUDIO_HUB_DISABLE_IMPLICIT_TOKEN:
+ return None
+
+ # Otherwise: we use the cached token as the user has not explicitly forbidden it
+ return cached_token
+
+
+def _validate_token_to_send(token: Optional[str], is_write_action: bool) -> None:
+ if is_write_action:
+ if token is None:
+ raise ValueError(
+ "Token is required (write-access action) but no token found. You need"
+ " to provide a token or be logged in to Aistudio Hub . See"
+ "https://ai.baidu.com/ai-doc/AISTUDIO/slmkadt9z#2-%E5%A6%82%E4%BD%95%E4%BD%BF%E7%94%A8%E8%AE%BF%E9%97%AE%E4%BB%A4%E7%89%8C."
+ )
+
+
+def build_aistudio_headers(
+ *,
+ token: Optional[Union[bool, str]] = None,
+ is_write_action: bool = False,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ user_agent: Union[Dict, str, None] = None,
+) -> Dict[str, str]:
+ # Get auth token to send
+ token_to_send = get_token_to_send(token)
+ _validate_token_to_send(token_to_send, is_write_action=is_write_action)
+
+ # Combine headers
+ headers = {"Content-Type": "application/json", "SDK-Version": str(VERSION)}
+ if token_to_send is not None:
+ headers["Authorization"] = f"token {token_to_send}"
+ return headers
+
+
+def get_aistudio_file_metadata(
+ url: str,
+ token: Union[bool, str, None] = None,
+ proxies: Optional[Dict] = None,
+ timeout: Optional[float] = DEFAULT_REQUEST_TIMEOUT,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ user_agent: Union[Dict, str, None] = None,
+):
+ """Fetch metadata of a file versioned on the Hub for a given url.
+
+ Args:
+ url (`str`):
+ File url, for example returned by [`aistudio_hub_url`].
+ token (`str` or `bool`, *optional*):
+ A token to be used for the download.
+ - If `True`, the token is read from the Aistudio config
+ folder.
+ - If `False` or `None`, no token is provided.
+ - If a string, it's used as the authentication token.
+ proxies (`dict`, *optional*):
+ Dictionary mapping protocol to the URL of the proxy passed to
+ `requests.request`.
+ timeout (`float`, *optional*, defaults to 10):
+ How many seconds to wait for the server to send metadata before giving up.
+ library_name (`str`, *optional*):
+ The name of the library to which the object corresponds.
+ library_version (`str`, *optional*):
+ The version of the library.
+ user_agent (`dict`, `str`, *optional*):
+ The user-agent info in the form of a dictionary or a string.
+
+ Returns:
+ A [`AistudioBosFileMetadata`] object containing metadata such as location, etag, size and
+ commit_hash.
+ """
+ headers = build_aistudio_headers(
+ token=token, library_name=library_name, library_version=library_version, user_agent=user_agent
+ )
+ headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file
+
+ # Retrieve metadata
+ r = _request_wrapper(
+ method="GET",
+ url=url,
+ headers=headers,
+ allow_redirects=False,
+ follow_relative_redirects=True,
+ proxies=proxies,
+ timeout=timeout,
+ )
+ raise_for_status(r)
+ res = r.json()
+
+ # Return
+ return AistudioBosFileMetadata(
+ commit_hash=res["sha"],
+ etag=_normalize_etag(res["last_commit_sha"]),
+ location=res["git_url"],
+ size=res["size"],
+ )
+
+
+def aistudio_hub_url(
+ repo_id: str,
+ filename: str,
+ *,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ endpoint: Optional[str] = None,
+) -> str:
+ if subfolder == "":
+ subfolder = None
+ if subfolder is not None:
+ filename = f"{subfolder}/{filename}"
+
+ if repo_type is None:
+ repo_type = REPO_TYPES[-1]
+ if repo_type not in REPO_TYPES:
+ raise ValueError("Invalid repo type")
+ if revision is None:
+ revision = DEFAULT_REVISION
+
+ # NEW ADD
+ if "/" not in repo_id:
+ raise ValueError("repo_id must be in the format of 'namespace/name'")
+ user_name, repo_name = repo_id.split("/")
+ user_name = user_name.strip()
+ repo_name = repo_name.strip()
+
+ url = AISTUDIO_URL_TEMPLATE.format(
+ user_name=quote(user_name, safe=""), repo_name=quote(repo_name, safe=""), filename=quote(filename)
+ )
+ # Update endpoint if provided
+ if endpoint is not None and url.startswith(ENDPOINT):
+ url = endpoint + url[len(ENDPOINT) :]
+
+ if revision != "master":
+ url += f"?ref={quote(revision, safe='')}"
+ return url
+
+
+def aistudio_hub_download(
+ repo_id: str = None,
+ filename: str = None,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ cache_dir: Union[str, Path, None] = None,
+ local_dir: Union[str, Path, None] = None,
+ local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
+ # TODO
+ user_agent: Union[Dict, str, None] = None,
+ force_download: bool = False,
+ proxies: Optional[Dict] = None,
+ etag_timeout: float = DEFAULT_ETAG_TIMEOUT,
+ resume_download: bool = False,
+ token: Optional[str] = None,
+ local_files_only: bool = False,
+ endpoint: Optional[str] = None,
+ **kwargs,
+):
+
+ if cache_dir is None:
+ cache_dir = AISTUDIO_HUB_CACHE
+ if revision is None:
+ revision = DEFAULT_REVISION
+ if isinstance(cache_dir, Path):
+ cache_dir = str(cache_dir)
+ if isinstance(local_dir, Path):
+ local_dir = str(local_dir)
+ locks_dir = os.path.join(cache_dir, ".locks")
+
+ if subfolder == "":
+ subfolder = None
+ if subfolder is not None:
+ # This is used to create a URL, and not a local path, hence the forward slash.
+ filename = f"{subfolder}/{filename}"
+
+ if repo_type is None:
+ repo_type = REPO_TYPES[-1]
+ if repo_type not in REPO_TYPES:
+ raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
+
+ storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
+ os.makedirs(storage_folder, exist_ok=True)
+
+ # cross platform transcription of filename, to be used as a local file path.
+ relative_filename = os.path.join(*filename.split("/"))
+ if os.name == "nt":
+ if relative_filename.startswith("..\\") or "\\..\\" in relative_filename:
+ raise ValueError(
+ f"Invalid filename: cannot handle filename '{relative_filename}' on Windows. Please ask the repository"
+ " owner to rename this file."
+ )
+
+ # if user provides a commit_hash and they already have the file on disk,
+ # shortcut everything.
+ # TODO: downloading by commit id is not supported yet, so this branch is always taken.
+ if not force_download: # REGEX_COMMIT_HASH.match(revision)
+ pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
+ if os.path.exists(pointer_path):
+ if local_dir is not None:
+ return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
+ return pointer_path
+
+ url = aistudio_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
+
+ headers = build_aistudio_headers(
+ token=token,
+ library_name=library_name,
+ library_version=library_version,
+ user_agent=user_agent,
+ )
+ url_to_download = url.replace("/contents/", "/media/")
+
+ etag = None
+ commit_hash = None
+ expected_size = None
+ head_call_error: Optional[Exception] = None
+ if not local_files_only:
+ try:
+ try:
+ metadata = get_aistudio_file_metadata(
+ url=url,
+ token=token,
+ proxies=proxies,
+ timeout=etag_timeout,
+ library_name=library_name,
+ library_version=library_version,
+ user_agent=user_agent,
+ )
+ except EntryNotFoundError as http_error: # noqa: F841
+ raise
+ # Commit hash must exist
+ # TODO: the commit hash is overridden here and forced to equal the revision.
+ commit_hash = revision # metadata.commit_hash
+ if commit_hash is None:
+ raise FileMetadataError(
+ "Distant resource does not seem to be on aistudio hub. It is possible that a configuration issue"
+ " prevents you from downloading resources from aistudio hub. Please check your firewall"
+ " and proxy settings and make sure your SSL certificates are updated."
+ )
+
+ # Etag must exist
+ etag = metadata.etag
+ # We favor a custom header indicating the etag of the linked resource, and
+ # we fallback to the regular etag header.
+ # If we don't have any of those, raise an error.
+ if etag is None:
+ raise FileMetadataError(
+ "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
+ )
+
+ # Expected (uncompressed) size
+ expected_size = metadata.size
+
+ except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
+ # Actually raise for those subclasses of ConnectionError
+ raise
+ except (
+ requests.exceptions.ConnectionError,
+ requests.exceptions.Timeout,
+ OfflineModeIsEnabled,
+ ) as error:
+ # Otherwise, our Internet connection is down.
+ # etag is None
+ head_call_error = error
+ pass
+ except (RevisionNotFoundError, EntryNotFoundError):
+ # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
+ raise
+ except requests.HTTPError as error:
+ # Multiple reasons for an http error:
+ # - Repository is private and invalid/missing token sent
+ # - Repository is gated and invalid/missing token sent
+ # - Hub is down (error 500 or 504)
+ # => let's switch to 'local_files_only=True' to check if the files are already cached.
+ # (if it's not the case, the error will be re-raised)
+ head_call_error = error
+ pass
+ except FileMetadataError as error:
+ # Multiple reasons for a FileMetadataError:
+ # - Wrong network configuration (proxy, firewall, SSL certificates)
+ # - Inconsistency on the Hub
+ # => let's switch to 'local_files_only=True' to check if the files are already cached.
+ # (if it's not the case, the error will be re-raised)
+ head_call_error = error
+ pass
+
+ # etag can be None for several reasons:
+ # 1. we passed local_files_only.
+ # 2. we don't have a connection
+ # 3. Hub is down (HTTP 500 or 504)
+ # 4. repo is not found -for example private or gated- and invalid/missing token sent
+ # 5. Hub is blocked by a firewall or proxy is not set correctly.
+ # => Try to get the last downloaded one from the specified revision.
+ #
+ # If the specified revision is a commit hash, look inside "snapshots".
+ # If the specified revision is a branch or tag, look inside "refs".
+ if etag is None:
+ # In those cases, we cannot force download.
+ if force_download:
+ raise ValueError(
+ "We have no connection or you passed local_files_only, so force_download is not an accepted option."
+ )
+
+ # Try to get "commit_hash" from "revision"
+ commit_hash = None
+ if REGEX_COMMIT_HASH.match(revision):
+ commit_hash = revision
+ else:
+ ref_path = os.path.join(storage_folder, "refs", revision)
+ if os.path.isfile(ref_path):
+ with open(ref_path) as f:
+ commit_hash = f.read()
+
+ # Return pointer file if exists
+ if commit_hash is not None:
+ pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
+ if os.path.exists(pointer_path):
+ if local_dir is not None:
+ return _to_local_dir(
+ pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks
+ )
+ return pointer_path
+
+ # If we couldn't find an appropriate file on disk, raise an error.
+ # If files cannot be found and local_files_only=True,
+ # the models might've been found if local_files_only=False
+ # Notify the user about that
+ if local_files_only:
+ raise LocalEntryNotFoundError(
+ "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable"
+ " aistudio hub look-ups and downloads online, set 'local_files_only' to False."
+ )
+ elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError):
+ # Repo not found => let's raise the actual error
+ raise head_call_error
+ else:
+ # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
+ raise LocalEntryNotFoundError(
+ "An error happened while trying to locate the file on the Hub and we cannot find the requested files"
+ " in the local cache. Please check your connection and try again or make sure your Internet connection"
+ " is on."
+ ) from head_call_error
+
+ # From now on, etag and commit_hash are not None.
+ assert etag is not None, "etag must have been retrieved from server"
+ assert commit_hash is not None, "commit_hash must have been retrieved from server"
+ blob_path = os.path.join(storage_folder, "blobs", etag)
+ pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
+
+ os.makedirs(os.path.dirname(blob_path), exist_ok=True)
+ os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
+ # if passed revision is not identical to commit_hash
+ # then revision has to be a branch name or tag name.
+ # In that case store a ref.
+ _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
+
+ if os.path.exists(pointer_path) and not force_download:
+ if local_dir is not None:
+ return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
+ return pointer_path
+
+ if os.path.exists(blob_path) and not force_download:
+ # we have the blob already, but not the pointer
+ if local_dir is not None: # to local dir
+ return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
+ else: # or in snapshot cache
+ _create_symlink(blob_path, pointer_path, new_blob=False)
+ return pointer_path
+
+ # Prevent parallel downloads of the same file with a lock.
+ # etag could be duplicated across repos, so the lock is scoped to this repo's folder.
+ lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock")
+
+ # Some Windows versions do not allow for paths longer than 255 characters.
+ # In this case, we must specify it is an extended path by using the "\\?\" prefix.
+ if os.name == "nt" and len(os.path.abspath(lock_path)) > 255:
+ lock_path = "\\\\?\\" + os.path.abspath(lock_path)
+
+ if os.name == "nt" and len(os.path.abspath(blob_path)) > 255:
+ blob_path = "\\\\?\\" + os.path.abspath(blob_path)
+
+ Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
+ with FileLock(lock_path):
+ # If the download just completed while the lock was activated.
+ if os.path.exists(pointer_path) and not force_download:
+ # Even if returning early like here, the lock will be released.
+ if local_dir is not None:
+ return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
+ return pointer_path
+
+ if resume_download:
+ incomplete_path = blob_path + ".incomplete"
+
+ @contextmanager
+ def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]:
+ with open(incomplete_path, "ab") as f:
+ yield f
+
+ temp_file_manager = _resumable_file_manager
+ if os.path.exists(incomplete_path):
+ resume_size = os.stat(incomplete_path).st_size
+ else:
+ resume_size = 0
+ else:
+ temp_file_manager = partial( # type: ignore
+ tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
+ )
+ resume_size = 0
+
+ # Download to temporary file, then copy to cache dir once finished.
+ # Otherwise you get corrupt cache entries if the download gets interrupted.
+ with temp_file_manager() as temp_file:
+ logger.info("downloading %s to %s", url, temp_file.name)
+
+ if expected_size is not None: # might be None if HTTP header not set correctly
+ # Check tmp path
+ _check_disk_space(expected_size, os.path.dirname(temp_file.name))
+
+ # Check destination
+ _check_disk_space(expected_size, os.path.dirname(blob_path))
+ if local_dir is not None:
+ _check_disk_space(expected_size, local_dir)
+
+ http_get(
+ url_to_download,
+ temp_file,
+ proxies=proxies,
+ resume_size=resume_size,
+ headers=headers,
+ expected_size=expected_size,
+ )
+ if local_dir is None:
+ logger.debug(f"Storing {url} in cache at {blob_path}")
+ _chmod_and_replace(temp_file.name, blob_path)
+ _create_symlink(blob_path, pointer_path, new_blob=True)
+ else:
+ local_dir_filepath = os.path.join(local_dir, relative_filename)
+ os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
+
+ # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk
+ # In both cases, blob file is cached.
+ is_big_file = os.stat(temp_file.name).st_size > DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
+ if local_dir_use_symlinks is True or (local_dir_use_symlinks == "auto" and is_big_file):
+ logger.debug(f"Storing {url} in cache at {blob_path}")
+ _chmod_and_replace(temp_file.name, blob_path)
+ logger.debug("Create symlink to local dir")
+ _create_symlink(blob_path, local_dir_filepath, new_blob=False)
+ elif local_dir_use_symlinks == "auto" and not is_big_file:
+ logger.debug(f"Storing {url} in cache at {blob_path}")
+ _chmod_and_replace(temp_file.name, blob_path)
+ logger.debug("Duplicate in local dir (small file and use_symlink set to 'auto')")
+ shutil.copyfile(blob_path, local_dir_filepath)
+ else:
+ logger.debug(f"Storing {url} in local_dir at {local_dir_filepath} (not cached).")
+ _chmod_and_replace(temp_file.name, local_dir_filepath)
+ pointer_path = local_dir_filepath # for return value
+
+ return pointer_path
+
+
+def aistudio_hub_file_exists(
+ repo_id: str,
+ filename: str,
+ *,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ token: Optional[str] = None,
+ endpoint: Optional[str] = None,
+) -> bool:
+ """
+ Checks if a file exists in a repository on the Aistudio Hub.
+
+ Args:
+ repo_id (`str`):
+ A namespace (user or an organization) and a repo name separated
+ by a `/`.
+ filename (`str`):
+ The name of the file to check, for example:
+ `"config.json"`
+ repo_type (`str`, *optional*):
+ Set to `"dataset"` or `"space"` if getting repository info from a dataset or a space,
+ `None` or `"model"` if getting repository info from a model. Default is `None`.
+ revision (`str`, *optional*):
+ The revision of the repository from which to get the information. Defaults to `"main"` branch.
+ token (`bool` or `str`, *optional*):
+ A valid authentication token (see https://huggingface.co/settings/token).
+ If `None` or `True` and machine is logged in (through `huggingface-cli login`
+ or [`~login`]), token will be retrieved from the cache.
+ If `False`, token is not sent in the request header.
+
+ Returns:
+ True if the file exists, False otherwise.
+
+
+
+ Examples:
+ ```py
+ >>> aistudio_hub_file_exists("bigcode/starcoder", "config.json")
+ True
+ >>> aistudio_hub_file_exists("bigcode/starcoder", "not-a-file")
+ False
+ >>> aistudio_hub_file_exists("bigcode/not-a-repo", "config.json")
+ False
+ ```
+
+
+ """
+ url = aistudio_hub_url(
+ repo_id=repo_id, repo_type=repo_type, revision=revision, filename=filename, endpoint=endpoint
+ )
+ try:
+ if token is None:
+ token = get_token()
+ get_aistudio_file_metadata(url, token=token)
+ return True
+ except GatedRepoError: # raise specifically on gated repo
+ raise
+ except (RepositoryNotFoundError, EntryNotFoundError, RevisionNotFoundError, HfHubHTTPError):
+ return False
+
+
+def aistudio_hub_try_to_load_from_cache(
+ repo_id: str,
+ filename: str,
+ cache_dir: Union[str, Path, None] = None,
+ revision: Optional[str] = None,
+ repo_type: Optional[str] = None,
+):
+ if revision is None:
+ revision = DEFAULT_REVISION
+ if repo_type is None:
+ repo_type = REPO_TYPES[-1]
+ if repo_type not in REPO_TYPES:
+ raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
+ if cache_dir is None:
+ cache_dir = AISTUDIO_HUB_CACHE
+
+ object_id = repo_id.replace("/", "--")
+ repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
+ if not os.path.isdir(repo_cache):
+ # No cache for this model
+ return None
+
+ refs_dir = os.path.join(repo_cache, "refs")
+ snapshots_dir = os.path.join(repo_cache, "snapshots")
+ no_exist_dir = os.path.join(repo_cache, ".no_exist")
+
+ # Resolve refs (for instance to convert main to the associated commit sha)
+ if os.path.isdir(refs_dir):
+ revision_file = os.path.join(refs_dir, revision)
+ if os.path.isfile(revision_file):
+ with open(revision_file) as f:
+ revision = f.read()
+
+ # Check if file is cached as "no_exist"
+ if os.path.isfile(os.path.join(no_exist_dir, revision, filename)):
+ return _CACHED_NO_EXIST
+
+ # Check if revision folder exists
+ if not os.path.exists(snapshots_dir):
+ return None
+ cached_shas = os.listdir(snapshots_dir)
+ if revision not in cached_shas:
+ # No cache for this revision and we won't try to return a random revision
+ return None
+
+ # Check if file exists in cache
+ cached_file = os.path.join(snapshots_dir, revision, filename)
+ return cached_file if os.path.isfile(cached_file) else None
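+
+
+# Illustrative usage sketch (comment only, not part of the original module): check the
+# local cache before any network call; `_CACHED_NO_EXIST` marks a file that is cached
+# as known-missing. The repo id below is a placeholder.
+#
+#   cached = aistudio_hub_try_to_load_from_cache("some-org/some-model", "config.json")
+#   if cached is None:
+#       ...  # not cached yet: fall back to downloading from the AI Studio hub
+#   elif cached is _CACHED_NO_EXIST:
+#       ...  # the hub previously reported this file as missing
+#   else:
+#       ...  # `cached` is the path of the snapshot file on disk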
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/bos_download.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/bos_download.py
new file mode 100644
index 0000000000000000000000000000000000000000..372784b9a0888898962f4a136e7efd74ef69cd40
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/bos_download.py
@@ -0,0 +1,637 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+import logging
+import os
+import re
+import shutil
+import tempfile
+from contextlib import contextmanager
+from functools import partial
+from pathlib import Path
+from typing import Dict, Generator, Literal, Optional, Union
+from urllib.parse import quote
+
+import requests
+from filelock import FileLock
+from huggingface_hub.utils import (
+ EntryNotFoundError,
+ FileMetadataError,
+ GatedRepoError,
+ HfHubHTTPError,
+ LocalEntryNotFoundError,
+ RepositoryNotFoundError,
+ RevisionNotFoundError,
+)
+
+logger = logging.getLogger(__name__)
+
+from .common import (
+ _CACHED_NO_EXIST,
+ DEFAULT_ETAG_TIMEOUT,
+ DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD,
+ DEFAULT_REQUEST_TIMEOUT,
+ REPO_ID_SEPARATOR,
+ AistudioBosFileMetadata,
+ OfflineModeIsEnabled,
+ _as_int,
+ _cache_commit_hash_for_specific_revision,
+ _check_disk_space,
+ _chmod_and_replace,
+ _create_symlink,
+ _get_pointer_path,
+ _normalize_etag,
+ _request_wrapper,
+ _to_local_dir,
+ http_get,
+ raise_for_status,
+)
+
+
+def repo_folder_name(*, repo_id: str, repo_type: str) -> str:
+ """Return a serialized version of a aistudio repo name and type, safe for disk storage
+ as a single non-nested folder.
+
+ Example: models--julien-c--EsperBERTo-small
+ """
+ # remove all `/` occurrences to correctly convert repo to directory name
+ parts = [f"{repo_type}", *repo_id.split("/")]
+ return REPO_ID_SEPARATOR.join(parts)
+
+
+ENDPOINT = os.getenv("PPNLP_ENDPOINT", "https://bj.bcebos.com/paddlenlp")
+ENDPOINT_v2 = "https://paddlenlp.bj.bcebos.com"
+
+BOS_URL_TEMPLATE = ENDPOINT + "/{repo_type}/community/{repo_id}/{revision}/{filename}"
+BOS_URL_TEMPLATE_WITHOUT_REVISION = ENDPOINT + "/{repo_type}/community/{repo_id}/{filename}"
+
+
+default_home = os.path.join(os.path.expanduser("~"), ".cache")
+BOS_HOME = os.path.expanduser(
+ os.getenv(
+ "BOS_HOME",
+ os.path.join(os.getenv("XDG_CACHE_HOME", default_home), "paddle"),
+ )
+)
+default_cache_path = os.path.join(BOS_HOME, "bos")
+BOS_CACHE = os.getenv("BOS_CACHE", default_cache_path)
+
+
+DEFAULT_REVISION = "main"
+REPO_TYPE_MODEL = "models"
+REPO_TYPES = [None, REPO_TYPE_MODEL]
+
+
+REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
+
+
+def get_bos_file_metadata(
+ url: str,
+ token: Union[bool, str, None] = None,
+ proxies: Optional[Dict] = None,
+ timeout: Optional[float] = DEFAULT_REQUEST_TIMEOUT,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ user_agent: Union[Dict, str, None] = None,
+):
+ """Fetch metadata of a file versioned on the Hub for a given url.
+
+ Args:
+ url (`str`):
+ File url, for example returned by [`bos_url`].
+ token (`str` or `bool`, *optional*):
+ A token to be used for the download.
+ - If `True`, the token is read from the BOS config
+ folder.
+ - If `False` or `None`, no token is provided.
+ - If a string, it's used as the authentication token.
+ proxies (`dict`, *optional*):
+ Dictionary mapping protocol to the URL of the proxy passed to
+ `requests.request`.
+ timeout (`float`, *optional*, defaults to 10):
+ How many seconds to wait for the server to send metadata before giving up.
+ library_name (`str`, *optional*):
+ The name of the library to which the object corresponds.
+ library_version (`str`, *optional*):
+ The version of the library.
+ user_agent (`dict`, `str`, *optional*):
+ The user-agent info in the form of a dictionary or a string.
+
+ Returns:
+ A [`AistudioBosFileMetadata`] object containing metadata such as location, etag, size and
+ commit_hash.
+ """
+ headers = {}
+ headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file
+
+ # Retrieve metadata
+ r = _request_wrapper(
+ method="HEAD",
+ url=url,
+ headers=headers,
+ allow_redirects=False,
+ follow_relative_redirects=True,
+ proxies=proxies,
+ timeout=timeout,
+ )
+ raise_for_status(r)
+
+ # Return
+ return AistudioBosFileMetadata(
+ commit_hash=None,
+ etag=_normalize_etag(r.headers.get("ETag")),
+ location=url,
+ size=_as_int(r.headers.get("Content-Length")),
+ )
+
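+# Illustrative sketch (comment only, not part of the original module): issue a HEAD
+# request for a BOS file and read its ETag/size before downloading. `bos_url` is
+# defined below in this module; the repo id is a placeholder.
+#
+#   meta = get_bos_file_metadata(bos_url("CompVis/stable-diffusion-v1-4", "model_index.json"))
+#   print(meta.etag, meta.size)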
+
+def bos_url(
+ repo_id: str,
+ filename: str,
+ *,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ endpoint: Optional[str] = None,
+) -> str:
+ if subfolder == "":
+ subfolder = None
+ if subfolder is not None:
+ filename = f"{subfolder}/{filename}"
+
+ if repo_type is None:
+ repo_type = REPO_TYPES[-1]
+ if repo_type not in REPO_TYPES:
+ raise ValueError("Invalid repo type")
+ if revision is None:
+ revision = DEFAULT_REVISION
+
+ if revision == DEFAULT_REVISION:
+ url = BOS_URL_TEMPLATE_WITHOUT_REVISION.format(
+ repo_type=repo_type,
+ repo_id=repo_id,
+ filename=filename,
+ )
+ else:
+ url = BOS_URL_TEMPLATE.format(
+ repo_type=repo_type,
+ repo_id=repo_id,
+ revision=quote(revision, safe=""),
+ filename=filename,
+ )
+ # Update endpoint if provided
+ if endpoint is not None and url.startswith(ENDPOINT):
+ url = endpoint + url[len(ENDPOINT) :]
+ return url
+
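+# Illustrative sketch (comment only, not part of the original module): with the default
+# endpoint and revision, `bos_url` expands the template above roughly as follows. The
+# repo id is a placeholder.
+#
+#   bos_url("CompVis/stable-diffusion-v1-4", "model_index.json")
+#   # -> "https://bj.bcebos.com/paddlenlp/models/community/CompVis/stable-diffusion-v1-4/model_index.json"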
+
+def bos_download(
+ repo_id: str = None,
+ filename: str = None,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ cache_dir: Union[str, Path, None] = None,
+ local_dir: Union[str, Path, None] = None,
+ local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
+ # TODO
+ user_agent: Union[Dict, str, None] = None,
+ force_download: bool = False,
+ proxies: Optional[Dict] = None,
+ etag_timeout: float = DEFAULT_ETAG_TIMEOUT,
+ resume_download: bool = False,
+ token: Optional[str] = None,
+ local_files_only: bool = False,
+ endpoint: Optional[str] = None,
+ url: Optional[str] = None,
+ **kwargs,
+):
+ if url is not None:
+ assert url.startswith(ENDPOINT) or url.startswith(
+ ENDPOINT_v2
+ ), f"URL must start with {ENDPOINT} or {ENDPOINT_v2}"
+ if repo_id is None:
+ if url.startswith(ENDPOINT):
+ repo_id = "/".join(url[len(ENDPOINT) + 1 :].split("/")[:-1])
+ else:
+ repo_id = "/".join(url[len(ENDPOINT_v2) + 1 :].split("/")[:-1])
+ if filename is None:
+ filename = url.split("/")[-1]
+ subfolder = None
+
+ if cache_dir is None:
+ cache_dir = BOS_CACHE
+ if revision is None:
+ revision = DEFAULT_REVISION
+ if isinstance(cache_dir, Path):
+ cache_dir = str(cache_dir)
+ if isinstance(local_dir, Path):
+ local_dir = str(local_dir)
+ locks_dir = os.path.join(cache_dir, ".locks")
+
+ if subfolder == "":
+ subfolder = None
+ if subfolder is not None:
+ # This is used to create a URL, and not a local path, hence the forward slash.
+ filename = f"{subfolder}/{filename}"
+
+ if repo_type is None:
+ repo_type = REPO_TYPES[-1]
+ if repo_type not in REPO_TYPES:
+ raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
+
+ storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
+ os.makedirs(storage_folder, exist_ok=True)
+
+ # cross platform transcription of filename, to be used as a local file path.
+ relative_filename = os.path.join(*filename.split("/"))
+ if os.name == "nt":
+ if relative_filename.startswith("..\\") or "\\..\\" in relative_filename:
+ raise ValueError(
+ f"Invalid filename: cannot handle filename '{relative_filename}' on Windows. Please ask the repository"
+ " owner to rename this file."
+ )
+
+ # if user provides a commit_hash and they already have the file on disk,
+ # shortcut everything.
+ # TODO: downloading by commit id is not supported yet, so this branch always runs.
+ if not force_download: # REGEX_COMMIT_HASH.match(revision)
+ pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
+ if os.path.exists(pointer_path):
+ if local_dir is not None:
+ return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
+ return pointer_path
+
+ if url is None:
+ url = bos_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
+ headers = None
+ url_to_download = url
+
+ etag = None
+ commit_hash = None
+ expected_size = None
+ head_call_error: Optional[Exception] = None
+ if not local_files_only:
+ try:
+ try:
+ metadata = get_bos_file_metadata(
+ url=url,
+ token=token,
+ proxies=proxies,
+ timeout=etag_timeout,
+ library_name=library_name,
+ library_version=library_version,
+ user_agent=user_agent,
+ )
+ except EntryNotFoundError as http_error: # noqa: F841
+ raise
+ # Commit hash must exist
+ # TODO: the commit hash is overridden here and forced to be the revision.
+ commit_hash = revision # metadata.commit_hash
+ if commit_hash is None:
+ raise FileMetadataError(
+ "Distant resource does not seem to be on aistudio hub. It is possible that a configuration issue"
+ " prevents you from downloading resources from aistudio hub. Please check your firewall"
+ " and proxy settings and make sure your SSL certificates are updated."
+ )
+
+ # Etag must exist
+ etag = metadata.etag
+ # We favor a custom header indicating the etag of the linked resource, and
+ # we fallback to the regular etag header.
+ # If we don't have any of those, raise an error.
+ if etag is None:
+ raise FileMetadataError(
+ "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
+ )
+
+ # Expected (uncompressed) size
+ expected_size = metadata.size
+
+ except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
+ # Actually raise for those subclasses of ConnectionError
+ raise
+ except (
+ requests.exceptions.ConnectionError,
+ requests.exceptions.Timeout,
+ OfflineModeIsEnabled,
+ ) as error:
+ # Otherwise, our Internet connection is down.
+ # etag is None
+ head_call_error = error
+ pass
+ except (RevisionNotFoundError, EntryNotFoundError):
+ # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
+ raise
+ except requests.HTTPError as error:
+ # Multiple reasons for an http error:
+ # - Repository is private and invalid/missing token sent
+ # - Repository is gated and invalid/missing token sent
+ # - Hub is down (error 500 or 504)
+ # => let's switch to 'local_files_only=True' to check if the files are already cached.
+ # (if it's not the case, the error will be re-raised)
+ head_call_error = error
+ pass
+ except FileMetadataError as error:
+ # Multiple reasons for a FileMetadataError:
+ # - Wrong network configuration (proxy, firewall, SSL certificates)
+ # - Inconsistency on the Hub
+ # => let's switch to 'local_files_only=True' to check if the files are already cached.
+ # (if it's not the case, the error will be re-raised)
+ head_call_error = error
+ pass
+
+ # etag can be None for several reasons:
+ # 1. we passed local_files_only.
+ # 2. we don't have a connection
+ # 3. Hub is down (HTTP 500 or 504)
+ # 4. repo is not found -for example private or gated- and invalid/missing token sent
+ # 5. Hub is blocked by a firewall or proxy is not set correctly.
+ # => Try to get the last downloaded one from the specified revision.
+ #
+ # If the specified revision is a commit hash, look inside "snapshots".
+ # If the specified revision is a branch or tag, look inside "refs".
+ if etag is None:
+ # In those cases, we cannot force download.
+ if force_download:
+ raise ValueError(
+ "We have no connection or you passed local_files_only, so force_download is not an accepted option."
+ )
+
+ # Try to get "commit_hash" from "revision"
+ commit_hash = None
+ if REGEX_COMMIT_HASH.match(revision):
+ commit_hash = revision
+ else:
+ ref_path = os.path.join(storage_folder, "refs", revision)
+ if os.path.isfile(ref_path):
+ with open(ref_path) as f:
+ commit_hash = f.read()
+
+ # Return pointer file if exists
+ if commit_hash is not None:
+ pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
+ if os.path.exists(pointer_path):
+ if local_dir is not None:
+ return _to_local_dir(
+ pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks
+ )
+ return pointer_path
+
+ # If we couldn't find an appropriate file on disk, raise an error.
+ # If files cannot be found and local_files_only=True,
+ # the models might've been found if local_files_only=False
+ # Notify the user about that
+ if local_files_only:
+ raise LocalEntryNotFoundError(
+ "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable"
+ " BOS look-ups and downloads online, set 'local_files_only' to False."
+ )
+ elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError):
+ # Repo not found => let's raise the actual error
+ raise head_call_error
+ else:
+ # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
+ raise LocalEntryNotFoundError(
+ "An error happened while trying to locate the file on the Hub and we cannot find the requested files"
+ " in the local cache. Please check your connection and try again or make sure your Internet connection"
+ " is on."
+ ) from head_call_error
+
+ # From now on, etag and commit_hash are not None.
+ assert etag is not None, "etag must have been retrieved from server"
+ assert commit_hash is not None, "commit_hash must have been retrieved from server"
+ blob_path = os.path.join(storage_folder, "blobs", etag)
+ pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
+
+ os.makedirs(os.path.dirname(blob_path), exist_ok=True)
+ os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
+ # if passed revision is not identical to commit_hash
+ # then revision has to be a branch name or tag name.
+ # In that case store a ref.
+ _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
+
+ if os.path.exists(pointer_path) and not force_download:
+ if local_dir is not None:
+ return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
+ return pointer_path
+
+ if os.path.exists(blob_path) and not force_download:
+ # we have the blob already, but not the pointer
+ if local_dir is not None: # to local dir
+ return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
+ else: # or in snapshot cache
+ _create_symlink(blob_path, pointer_path, new_blob=False)
+ return pointer_path
+
+ # Prevent parallel downloads of the same file with a lock.
+ # etag could be duplicated across repos, so the lock is scoped to this repo's folder.
+ lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock")
+
+ # Some Windows versions do not allow for paths longer than 255 characters.
+ # In this case, we must specify it is an extended path by using the "\\?\" prefix.
+ if os.name == "nt" and len(os.path.abspath(lock_path)) > 255:
+ lock_path = "\\\\?\\" + os.path.abspath(lock_path)
+
+ if os.name == "nt" and len(os.path.abspath(blob_path)) > 255:
+ blob_path = "\\\\?\\" + os.path.abspath(blob_path)
+
+ Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
+ with FileLock(lock_path):
+ # If the download just completed while the lock was activated.
+ if os.path.exists(pointer_path) and not force_download:
+ # Even if returning early like here, the lock will be released.
+ if local_dir is not None:
+ return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
+ return pointer_path
+
+ if resume_download:
+ incomplete_path = blob_path + ".incomplete"
+
+ @contextmanager
+ def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]:
+ with open(incomplete_path, "ab") as f:
+ yield f
+
+ temp_file_manager = _resumable_file_manager
+ if os.path.exists(incomplete_path):
+ resume_size = os.stat(incomplete_path).st_size
+ else:
+ resume_size = 0
+ else:
+ temp_file_manager = partial( # type: ignore
+ tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
+ )
+ resume_size = 0
+
+ # Download to temporary file, then copy to cache dir once finished.
+ # Otherwise you get corrupt cache entries if the download gets interrupted.
+ with temp_file_manager() as temp_file:
+ logger.info("downloading %s to %s", url, temp_file.name)
+
+ if expected_size is not None: # might be None if HTTP header not set correctly
+ # Check tmp path
+ _check_disk_space(expected_size, os.path.dirname(temp_file.name))
+
+ # Check destination
+ _check_disk_space(expected_size, os.path.dirname(blob_path))
+ if local_dir is not None:
+ _check_disk_space(expected_size, local_dir)
+
+ http_get(
+ url_to_download,
+ temp_file,
+ proxies=proxies,
+ resume_size=resume_size,
+ headers=headers,
+ expected_size=expected_size,
+ )
+ if local_dir is None:
+ logger.debug(f"Storing {url} in cache at {blob_path}")
+ _chmod_and_replace(temp_file.name, blob_path)
+ _create_symlink(blob_path, pointer_path, new_blob=True)
+ else:
+ local_dir_filepath = os.path.join(local_dir, relative_filename)
+ os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
+
+ # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk
+ # In both cases, blob file is cached.
+ is_big_file = os.stat(temp_file.name).st_size > DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
+ if local_dir_use_symlinks is True or (local_dir_use_symlinks == "auto" and is_big_file):
+ logger.debug(f"Storing {url} in cache at {blob_path}")
+ _chmod_and_replace(temp_file.name, blob_path)
+ logger.debug("Create symlink to local dir")
+ _create_symlink(blob_path, local_dir_filepath, new_blob=False)
+ elif local_dir_use_symlinks == "auto" and not is_big_file:
+ logger.debug(f"Storing {url} in cache at {blob_path}")
+ _chmod_and_replace(temp_file.name, blob_path)
+ logger.debug("Duplicate in local dir (small file and use_symlink set to 'auto')")
+ shutil.copyfile(blob_path, local_dir_filepath)
+ else:
+ logger.debug(f"Storing {url} in local_dir at {local_dir_filepath} (not cached).")
+ _chmod_and_replace(temp_file.name, local_dir_filepath)
+ pointer_path = local_dir_filepath # for return value
+
+ return pointer_path
+
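+# Illustrative usage sketch (comment only, not part of the original module): download a
+# file into the BOS cache (or `local_dir`) and get back the resolved local path. The
+# repo id/filename are placeholders and assume the file exists on the BOS mirror.
+#
+#   path = bos_download(repo_id="CompVis/stable-diffusion-v1-4", filename="model_index.json")
+#   # `path` points into the BOS_CACHE snapshot tree, or into `local_dir` when one is given.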
+
+def bos_file_exists(
+ repo_id: str,
+ filename: str,
+ *,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ token: Optional[str] = None,
+ endpoint: Optional[str] = None,
+) -> bool:
+ """
+ Checks if a file exists in a repository on BOS.
+
+ Args:
+ repo_id (`str`):
+ A namespace (user or an organization) and a repo name separated
+ by a `/`.
+ filename (`str`):
+ The name of the file to check, for example:
+ `"config.json"`
+ repo_type (`str`, *optional*):
+ Set to `"dataset"` or `"space"` if getting repository info from a dataset or a space,
+ `None` or `"model"` if getting repository info from a model. Default is `None`.
+ revision (`str`, *optional*):
+ The revision of the repository from which to get the information. Defaults to `"main"` branch.
+ token (`bool` or `str`, *optional*):
+ A valid authentication token (see https://huggingface.co/settings/token).
+ If `None` or `True` and machine is logged in (through `huggingface-cli login`
+ or [`~login`]), token will be retrieved from the cache.
+ If `False`, token is not sent in the request header.
+
+ Returns:
+ True if the file exists, False otherwise.
+
+
+
+ Examples:
+ ```py
+ >>> bos_file_exists("bigcode/starcoder", "config.json")
+ True
+ >>> bos_file_exists("bigcode/starcoder", "not-a-file")
+ False
+ >>> bos_file_exists("bigcode/not-a-repo", "config.json")
+ False
+ ```
+
+
+ """
+ url = bos_url(repo_id=repo_id, repo_type=repo_type, revision=revision, filename=filename, endpoint=endpoint)
+ try:
+ get_bos_file_metadata(url, token=token)
+ return True
+ except GatedRepoError: # raise specifically on gated repo
+ raise
+ except (RepositoryNotFoundError, EntryNotFoundError, RevisionNotFoundError, HfHubHTTPError):
+ return False
+
+
+def bos_try_to_load_from_cache(
+ repo_id: str,
+ filename: str,
+ cache_dir: Union[str, Path, None] = None,
+ revision: Optional[str] = None,
+ repo_type: Optional[str] = None,
+):
+ if revision is None:
+ revision = DEFAULT_REVISION
+ if repo_type is None:
+ repo_type = REPO_TYPES[-1]
+ if repo_type not in REPO_TYPES:
+ raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
+ if cache_dir is None:
+ cache_dir = BOS_CACHE
+
+ object_id = repo_id.replace("/", "--")
+ repo_cache = os.path.join(cache_dir, f"{repo_type}--{object_id}")
+ if not os.path.isdir(repo_cache):
+ # No cache for this model
+ return None
+
+ refs_dir = os.path.join(repo_cache, "refs")
+ snapshots_dir = os.path.join(repo_cache, "snapshots")
+ no_exist_dir = os.path.join(repo_cache, ".no_exist")
+
+ # Resolve refs (for instance to convert main to the associated commit sha)
+ if os.path.isdir(refs_dir):
+ revision_file = os.path.join(refs_dir, revision)
+ if os.path.isfile(revision_file):
+ with open(revision_file) as f:
+ revision = f.read()
+
+ # Check if file is cached as "no_exist"
+ if os.path.isfile(os.path.join(no_exist_dir, revision, filename)):
+ return _CACHED_NO_EXIST
+
+ # Check if revision folder exists
+ if not os.path.exists(snapshots_dir):
+ return None
+ cached_shas = os.listdir(snapshots_dir)
+ if revision not in cached_shas:
+ # No cache for this revision and we won't try to return a random revision
+ return None
+
+ # Check if file exists in cache
+ cached_file = os.path.join(snapshots_dir, revision, filename)
+ return cached_file if os.path.isfile(cached_file) else None
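+
+
+# Illustrative usage sketch (comment only, not part of the original module): look up the
+# cache first and only hit the network on a miss; `_CACHED_NO_EXIST` marks a file cached
+# as known-missing. The repo id is a placeholder.
+#
+#   cached = bos_try_to_load_from_cache("CompVis/stable-diffusion-v1-4", "model_index.json")
+#   if cached is None:
+#       cached = bos_download(repo_id="CompVis/stable-diffusion-v1-4", filename="model_index.json")
+#   elif cached is _CACHED_NO_EXIST:
+#       ...  # previously recorded as missing; skip the download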
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/common.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..faaddf5c5ed272a807fceca665a29ff216c5ae63
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/common.py
@@ -0,0 +1,662 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import logging
+import os
+import re
+import shutil
+import stat
+import tempfile
+import threading
+import time
+import uuid
+import warnings
+from contextlib import contextmanager
+from dataclasses import dataclass
+from functools import lru_cache
+from pathlib import Path
+from typing import BinaryIO, Callable, Dict, Generator, Literal, Optional, Union
+from urllib.parse import urlparse
+
+import requests
+from huggingface_hub.utils import (
+ BadRequestError,
+ EntryNotFoundError,
+ HfHubHTTPError,
+ tqdm,
+)
+from requests import HTTPError, Response
+from requests.adapters import HTTPAdapter
+from requests.models import PreparedRequest
+
+logger = logging.getLogger(__name__)
+
+ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
+
+
+def _is_true(value: Optional[str]) -> bool:
+ if value is None:
+ return False
+ return value.upper() in ENV_VARS_TRUE_VALUES
+
+
+def _as_int(value: Optional[str]) -> Optional[int]:
+ if value is None:
+ return None
+ return int(value)
+
+
+DISABLE_SYMLINKS_WARNING = False
+# Regex to get filename from a "Content-Disposition" header for CDN-served files
+HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P<filename>.*?)"')
+DOWNLOAD_CHUNK_SIZE = 10 * 1024 * 1024
+REPO_ID_SEPARATOR = "--"
+
+DEFAULT_DOWNLOAD_TIMEOUT = 10
+DEFAULT_REQUEST_TIMEOUT = 10
+DEFAULT_ETAG_TIMEOUT = 10
+DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD: int = 5 * 1024 * 1024
+
+OFFLINE = _is_true(os.environ.get("AISTUDIO_BOS_OFFLINE"))
+_CACHED_NO_EXIST = object()
+
+
+def _cache_commit_hash_for_specific_revision(storage_folder: str, revision: str, commit_hash: str) -> None:
+ """Cache reference between a revision (tag, branch or truncated commit hash) and the corresponding commit hash.
+
+ Does nothing if `revision` is already a proper `commit_hash` or reference is already cached.
+ """
+ # if revision != commit_hash:
+ ref_path = Path(storage_folder) / "refs" / revision
+ ref_path.parent.mkdir(parents=True, exist_ok=True)
+ if not ref_path.exists() or commit_hash != ref_path.read_text():
+ # Update the ref only if it has changed. Could cause a useless error in case the
+ # repo is already cached and the user doesn't have write access to the cache folder.
+ # See https://github.com/huggingface/huggingface_hub/issues/1216.
+ ref_path.write_text(commit_hash)
+
+
+def _check_disk_space(expected_size: int, target_dir: Union[str, Path]) -> None:
+ """Check disk usage and log a warning if there is not enough disk space to download the file.
+
+ Args:
+ expected_size (`int`):
+ The expected size of the file in bytes.
+ target_dir (`str`):
+ The directory where the file will be stored after downloading.
+ """
+
+ target_dir = Path(target_dir) # format as `Path`
+ for path in [target_dir] + list(target_dir.parents): # first check target_dir, then each parents one by one
+ try:
+ target_dir_free = shutil.disk_usage(path).free
+ if target_dir_free < expected_size:
+ warnings.warn(
+ "Not enough free disk space to download the file. "
+ f"The expected file size is: {expected_size / 1e6:.2f} MB. "
+ f"The target location {target_dir} only has {target_dir_free / 1e6:.2f} MB free disk space."
+ )
+ return
+ except OSError: # file does not exist or disk space cannot be checked; try the parent directory
+ pass
+
+
+def http_get(
+ url: str,
+ temp_file: BinaryIO,
+ *,
+ proxies=None,
+ resume_size: float = 0,
+ headers: Optional[Dict[str, str]] = None,
+ expected_size: Optional[int] = None,
+ _nb_retries: int = 5,
+):
+ """
+ Download a remote file. Errors are not swallowed; they are re-raised as exceptions tailored to the Hugging Face Hub.
+
+ If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely a
+ transient error (network outage?). We log a warning message and try to resume the download a few times before
+ giving up. The method gives up after 5 attempts if no new data has been received from the server.
+ """
+ initial_headers = headers
+ headers = copy.deepcopy(headers) or {}
+ if resume_size > 0:
+ headers["Range"] = "bytes=%d-" % (resume_size,)
+
+ r = _request_wrapper(
+ method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=DEFAULT_DOWNLOAD_TIMEOUT
+ )
+ raise_for_status(r)
+ content_length = r.headers.get("Content-Length")
+
+ # NOTE: 'total' is the total number of bytes to download, not the number of bytes in the file.
+ # If the file is compressed, the number of bytes in the saved file will be higher than 'total'.
+ total = resume_size + int(content_length) if content_length is not None else None
+
+ displayed_name = url
+ content_disposition = r.headers.get("Content-Disposition")
+ if content_disposition is not None:
+ match = HEADER_FILENAME_PATTERN.search(content_disposition)
+ if match is not None:
+ # Means file is on CDN
+ displayed_name = match.groupdict()["filename"]
+
+ # Truncate filename if too long to display
+ if len(displayed_name) > 40:
+ displayed_name = f"(…){displayed_name[-40:]}"
+
+ consistency_error_message = (
+ f"Consistency check failed: file should be of size {expected_size} but has size"
+ f" {{actual_size}} ({displayed_name}).\nWe are sorry for the inconvenience. Please retry download and"
+ " pass `force_download=True, resume_download=False` as argument.\nIf the issue persists, please let us"
+ " know by opening an issue on https://github.com/huggingface/huggingface_hub."
+ )
+
+ # Stream file to buffer
+ with tqdm(
+ unit="B",
+ unit_scale=True,
+ total=total,
+ initial=resume_size,
+ desc=displayed_name,
+ disable=bool(logger.getEffectiveLevel() == logging.NOTSET),
+ ) as progress:
+ new_resume_size = resume_size
+ try:
+ for chunk in r.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
+ if chunk: # filter out keep-alive new chunks
+ progress.update(len(chunk))
+ temp_file.write(chunk)
+ new_resume_size += len(chunk)
+ # Some data has been downloaded from the server so we reset the number of retries.
+ _nb_retries = 5
+ except (requests.ConnectionError, requests.ReadTimeout) as e:
+ # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
+ # a transient error (network outage?). We log a warning message and try to resume the download a few times
+ # before giving up. The retry mechanism is basic but should be enough in most cases.
+ if _nb_retries <= 0:
+ logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
+ raise
+ logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
+ time.sleep(1)
+ reset_sessions() # In case of SSLError it's best to reset the shared requests.Session objects
+ return http_get(
+ url=url,
+ temp_file=temp_file,
+ proxies=proxies,
+ resume_size=new_resume_size,
+ headers=initial_headers,
+ expected_size=expected_size,
+ _nb_retries=_nb_retries - 1,
+ )
+
+ if expected_size is not None and expected_size != temp_file.tell():
+ raise EnvironmentError(
+ consistency_error_message.format(
+ actual_size=temp_file.tell(),
+ )
+ )
+
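+# Illustrative sketch (comment only, not part of the original module): stream a file to
+# disk with resume/retry handled by `http_get`. The URL is a placeholder.
+#
+#   with open("model_index.json", "wb") as f:
+#       http_get("https://bj.bcebos.com/paddlenlp/models/community/CompVis/stable-diffusion-v1-4/model_index.json", f)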
+
+def _chmod_and_replace(src: str, dst: str) -> None:
+ """Set correct permission before moving a blob from tmp directory to cache dir.
+
+ Do not take into account the `umask` from the process as there is no convenient way
+ to get it that is thread-safe.
+
+ See:
+ - About umask: https://docs.python.org/3/library/os.html#os.umask
+ - Thread-safety: https://stackoverflow.com/a/70343066
+ - About solution: https://github.com/huggingface/huggingface_hub/pull/1220#issuecomment-1326211591
+ - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1141
+ - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1215
+ """
+ # Get umask by creating a temporary file in the cached repo folder.
+ tmp_file = Path(dst).parent.parent / f"tmp_{uuid.uuid4()}"
+ try:
+ tmp_file.touch()
+ cache_dir_mode = Path(tmp_file).stat().st_mode
+ os.chmod(src, stat.S_IMODE(cache_dir_mode))
+ finally:
+ tmp_file.unlink()
+
+ shutil.move(src, dst)
+
+
+def repo_folder_name(*, repo_id: str, repo_type: str) -> str:
+ """Return a serialized version of a aistudio repo name and type, safe for disk storage
+ as a single non-nested folder.
+
+ Example: models--julien-c--EsperBERTo-small
+ """
+ # remove all `/` occurrences to correctly convert repo to directory name
+ parts = [f"{repo_type}s", *repo_id.split("/")]
+ return REPO_ID_SEPARATOR.join(parts)
+
+
+class OfflineModeIsEnabled(ConnectionError):
+ """Raised when a request is made but `AISTUDIO_HUB_OFFLINE=1` is set as environment variable."""
+
+
+class OfflineAdapter(HTTPAdapter):
+ def send(self, request: PreparedRequest, *args, **kwargs) -> Response:
+ raise OfflineModeIsEnabled(
+ f"Cannot reach {request.url}: offline mode is enabled. To disable it, please unset the `AISTUDIO_HUB_OFFLINE` environment variable."
+ )
+
+
+BACKEND_FACTORY_T = Callable[[], requests.Session]
+
+
+def _default_backend_factory() -> requests.Session:
+ session = requests.Session()
+ if OFFLINE:
+ session.mount("http://", OfflineAdapter())
+ session.mount("https://", OfflineAdapter())
+
+ return session
+
+
+_GLOBAL_BACKEND_FACTORY: BACKEND_FACTORY_T = _default_backend_factory
+HTTP_METHOD_T = Literal["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"]
+
+
+@lru_cache
+def _get_session_from_cache(process_id: int, thread_id: int) -> requests.Session:
+ """
+ Create a new session per thread using global factory. Using LRU cache (maxsize 128) to avoid memory leaks when
+ using thousands of threads. Cache is cleared when `configure_http_backend` is called.
+ """
+ return _GLOBAL_BACKEND_FACTORY()
+
+
+def reset_sessions() -> None:
+ """Reset the cache of sessions.
+
+ Mostly used internally when sessions are reconfigured or an SSLError is raised.
+ See [`configure_http_backend`] for more details.
+ """
+ _get_session_from_cache.cache_clear()
+
+
+def get_session() -> requests.Session:
+ """
+ Get a `requests.Session` object, using the session factory from the user.
+
+ Use [`get_session`] to get a configured Session. Since `requests.Session` is not guaranteed to be thread-safe,
+ `huggingface_hub` creates 1 Session instance per thread. They are all instantiated using the same `backend_factory`
+ set in [`configure_http_backend`]. A LRU cache is used to cache the created sessions (and connections) between
+ calls. Max size is 128 to avoid memory leaks if thousands of threads are spawned.
+
+ See [this issue](https://github.com/psf/requests/issues/2766) to know more about thread-safety in `requests`.
+
+ Example:
+ ```py
+ import requests
+ from huggingface_hub import configure_http_backend, get_session
+
+ # Create a factory function that returns a Session with configured proxies
+ def backend_factory() -> requests.Session:
+ session = requests.Session()
+ session.proxies = {"http": "http://10.10.1.10:3128", "https": "https://10.10.1.11:1080"}
+ return session
+
+ # Set it as the default session factory
+ configure_http_backend(backend_factory=backend_factory)
+
+ # In practice, this is mostly done internally in `huggingface_hub`
+ session = get_session()
+ ```
+ """
+ return _get_session_from_cache(process_id=os.getpid(), thread_id=threading.get_ident())
+
+
+def _request_wrapper(
+ method: HTTP_METHOD_T, url: str, *, follow_relative_redirects: bool = False, **params
+) -> requests.Response:
+ """Wrapper around requests methods to follow relative redirects if `follow_relative_redirects=True` even when
+ `allow_redirection=False`.
+
+ Args:
+ method (`str`):
+ HTTP method, such as 'GET' or 'HEAD'.
+ url (`str`):
+ The URL of the resource to fetch.
+ follow_relative_redirects (`bool`, *optional*, defaults to `False`):
+ If True, relative redirection (redirection to the same site) will be resolved even when `allow_redirection`
+ kwarg is set to False. Useful when we want to follow a redirection to a renamed repository without
+ following redirection to a CDN.
+ **params (`dict`, *optional*):
+ Params to pass to `requests.request`.
+ """
+ # Recursively follow relative redirects
+ if follow_relative_redirects:
+ response = _request_wrapper(
+ method=method,
+ url=url,
+ follow_relative_redirects=False,
+ **params,
+ )
+
+ # If redirection, we redirect only relative paths.
+ # This is useful in case of a renamed repository.
+ if 300 <= response.status_code <= 399:
+ parsed_target = urlparse(response.headers["Location"])
+ if parsed_target.netloc == "":
+ # This means it is a relative 'location' header, as allowed by RFC 7231.
+ # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
+ # We want to follow this relative redirect !
+ #
+ # Highly inspired by `resolve_redirects` from requests library.
+ # See https://github.com/psf/requests/blob/main/requests/sessions.py#L159
+ next_url = urlparse(url)._replace(path=parsed_target.path).geturl()
+ return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params)
+ return response
+ # Perform request and return if status_code is not in the retry list.
+ response = get_session().request(method=method, url=url, **params)
+ raise_for_status(response)
+ return response
+
+
+def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str:
+ # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks
+ snapshot_path = os.path.join(storage_folder, "snapshots")
+ pointer_path = os.path.join(snapshot_path, revision, relative_filename)
+ if Path(os.path.abspath(snapshot_path)) not in Path(os.path.abspath(pointer_path)).parents:
+ raise ValueError(
+ "Invalid pointer path: cannot create pointer path in snapshot folder if"
+ f" `storage_folder='{storage_folder}'`, `revision='{revision}'` and"
+ f" `relative_filename='{relative_filename}'`."
+ )
+ return pointer_path
+
+
+def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None:
+ """Create a symbolic link named dst pointing to src.
+
+ By default, it will try to create a symlink using a relative path. Relative paths have 2 advantages:
+ - If the cache_folder is moved (example: back-up on a shared drive), relative paths within the cache folder will
+ not break.
+ - Relative paths seem to be better handled on Windows. Issue was reported 3 times in less than a week when
+ changing from relative to absolute paths. See https://github.com/huggingface/huggingface_hub/issues/1398,
+ https://github.com/huggingface/diffusers/issues/2729 and https://github.com/huggingface/transformers/pull/22228.
+ NOTE: The issue with absolute paths doesn't happen in admin mode.
+ When creating a symlink from the cache to a local folder, it is possible that a relative path cannot be created.
+ This happens when paths are not on the same volume. In that case, we use absolute paths.
+
+
+ The result layout looks something like
+ └── [ 128] snapshots
+ ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f
+ │ ├── [ 52] README.md -> ../../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
+ │ └── [ 76] pytorch_model.bin -> ../../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
+
+ If symlinks cannot be created on this platform (most likely to be Windows), the workaround is to avoid symlinks by
+ having the actual file in `dst`. If it is a new file (`new_blob=True`), we move it to `dst`. If it is not a new file
+ (`new_blob=False`), we don't know if the blob file is already referenced elsewhere. To avoid breaking existing
+ cache, the file is duplicated on the disk.
+
+ In case symlinks are not supported, a warning message is displayed to the user once when loading `huggingface_hub`.
+ The warning message can be disabled with the `DISABLE_SYMLINKS_WARNING` environment variable.
+ """
+ try:
+ os.remove(dst)
+ except OSError:
+ pass
+
+ abs_src = os.path.abspath(os.path.expanduser(src))
+ abs_dst = os.path.abspath(os.path.expanduser(dst))
+ abs_dst_folder = os.path.dirname(abs_dst)
+
+ # Use relative_dst in priority
+ try:
+ relative_src = os.path.relpath(abs_src, abs_dst_folder)
+ except ValueError:
+ # Raised on Windows if src and dst are not on the same volume. This is the case when creating a symlink to a
+ # local_dir instead of within the cache directory.
+ # See https://docs.python.org/3/library/os.path.html#os.path.relpath
+ relative_src = None
+
+ try:
+ commonpath = os.path.commonpath([abs_src, abs_dst])
+ _support_symlinks = are_symlinks_supported(commonpath)
+ except ValueError:
+ # Raised if src and dst are not on the same volume. Symlinks will still work on Linux/Macos.
+ # See https://docs.python.org/3/library/os.path.html#os.path.commonpath
+ _support_symlinks = os.name != "nt"
+ except PermissionError:
+ # Permission error means src and dst are not in the same volume (e.g. destination path has been provided
+ # by the user via `local_dir`. Let's test symlink support there)
+ _support_symlinks = are_symlinks_supported(abs_dst_folder)
+
+ # Symlinks are supported => let's create a symlink.
+ if _support_symlinks:
+ src_rel_or_abs = relative_src or abs_src
+ logger.debug(f"Creating pointer from {src_rel_or_abs} to {abs_dst}")
+ try:
+ os.symlink(src_rel_or_abs, abs_dst)
+ return
+ except FileExistsError:
+ if os.path.islink(abs_dst) and os.path.realpath(abs_dst) == os.path.realpath(abs_src):
+ # `abs_dst` already exists and is a symlink to the `abs_src` blob. It is most likely that the file has
+ # been cached twice concurrently (exactly between `os.remove` and `os.symlink`). Do nothing.
+ return
+ else:
+ # Very unlikely to happen. Means a file `dst` has been created exactly between `os.remove` and
+ # `os.symlink` and is not a symlink to the `abs_src` blob file. Raise exception.
+ raise
+ except PermissionError:
+ # Permission error means src and dst are not in the same volume (e.g. download to local dir) and symlink
+ # is supported on both volumes but not between them. Let's just make a hard copy in that case.
+ pass
+
+ # Symlinks are not supported => let's move or copy the file.
+ if new_blob:
+ logger.info(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}")
+ shutil.move(abs_src, abs_dst)
+ else:
+ logger.info(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}")
+ shutil.copyfile(abs_src, abs_dst)
+
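+# Illustrative sketch (comment only, not part of the original module): link a cached blob
+# into a snapshot folder; when symlinks are unsupported the helper falls back to moving
+# (new blobs) or copying (existing blobs) the file. Paths are placeholders.
+#
+#   _create_symlink("/cache/blobs/<etag>", "/cache/snapshots/<revision>/config.json", new_blob=True)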
+
+_are_symlinks_supported_in_dir: Dict[str, bool] = {}
+
+
+def _set_write_permission_and_retry(func, path, excinfo):
+ os.chmod(path, stat.S_IWRITE)
+ func(path)
+
+
+@contextmanager
+def SoftTemporaryDirectory(
+ suffix: Optional[str] = None,
+ prefix: Optional[str] = None,
+ dir: Optional[Union[Path, str]] = None,
+ **kwargs,
+) -> Generator[str, None, None]:
+ """
+ Context manager to create a temporary directory and safely delete it.
+
+ If tmp directory cannot be deleted normally, we set the WRITE permission and retry.
+ If cleanup still fails, we give up but don't raise an exception. This is equivalent
+ to `tempfile.TemporaryDirectory(..., ignore_cleanup_errors=True)` introduced in
+ Python 3.10.
+
+ See https://www.scivision.dev/python-tempfile-permission-error-windows/.
+ """
+ tmpdir = tempfile.TemporaryDirectory(prefix=prefix, suffix=suffix, dir=dir, **kwargs)
+ yield tmpdir.name
+
+ try:
+ # First once with normal cleanup
+ shutil.rmtree(tmpdir.name)
+ except Exception:
+ # If failed, try to set write permission and retry
+ try:
+ shutil.rmtree(tmpdir.name, onerror=_set_write_permission_and_retry)
+ except Exception:
+ pass
+
+ # And finally, cleanup the tmpdir.
+ # If it fails again, give up but do not throw error
+ try:
+ tmpdir.cleanup()
+ except Exception:
+ pass
+
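+# Illustrative sketch (comment only, not part of the original module): create a scratch
+# directory that is cleaned up best-effort, even on Windows where files may stay locked.
+#
+#   with SoftTemporaryDirectory() as tmpdir:
+#       (Path(tmpdir) / "scratch.bin").write_bytes(b"...")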
+
+def _to_local_dir(
+ path: str, local_dir: str, relative_filename: str, use_symlinks: Union[bool, Literal["auto"]]
+) -> str:
+ """Place a file in a local dir (different than cache_dir).
+
+ Either symlink to blob file in cache or duplicate file depending on `use_symlinks` and file size.
+ """
+ # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks
+ local_dir_filepath = os.path.join(local_dir, relative_filename)
+ if Path(os.path.abspath(local_dir)) not in Path(os.path.abspath(local_dir_filepath)).parents:
+ raise ValueError(
+ f"Cannot copy file '{relative_filename}' to local dir '{local_dir}': file would not be in the local"
+ " directory."
+ )
+
+ os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
+ real_blob_path = os.path.realpath(path)
+
+ # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk
+ if use_symlinks == "auto":
+ use_symlinks = os.stat(real_blob_path).st_size > DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
+
+ if use_symlinks:
+ _create_symlink(real_blob_path, local_dir_filepath, new_blob=False)
+ else:
+ shutil.copyfile(real_blob_path, local_dir_filepath)
+ return local_dir_filepath
+
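+# Illustrative sketch (comment only, not part of the original module): with the default
+# `use_symlinks="auto"`, blobs above DEFAULT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD (5 MiB) are
+# symlinked into `local_dir` while smaller ones are copied. Paths are placeholders.
+#
+#   _to_local_dir("/cache/blobs/<etag>", "./weights", "unet/config.json", use_symlinks="auto")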
+
+def _normalize_etag(etag: Optional[str]) -> Optional[str]:
+ """Normalize ETag HTTP header, so it can be used to create nice filepaths.
+
+ The HTTP spec allows two forms of ETag:
+ ETag: W/"<etag_value>"
+ ETag: "<etag_value>"
+
+ For now, we only expect the second form from the server, but we want to be future-proof so we support both. For
+ more context, see `TestNormalizeEtag` tests and https://github.com/huggingface/huggingface_hub/pull/1428.
+
+ Args:
+ etag (`str`, *optional*): HTTP header
+
+ Returns:
+ `str` or `None`: string that can be used as a nice directory name.
+ Returns `None` if input is None.
+ """
+ if etag is None:
+ return None
+ return etag.lstrip("W/").strip('"')
+
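+# Illustrative examples (comment only, not part of the original module):
+#
+#   _normalize_etag('W/"900bd1b3a1b79e7a"')  # -> '900bd1b3a1b79e7a'
+#   _normalize_etag('"900bd1b3a1b79e7a"')    # -> '900bd1b3a1b79e7a'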
+
+@dataclass(frozen=True)
+class AistudioBosFileMetadata:
+ """Data structure containing information about a file versioned on the Aistudio Hub.
+
+ Returned by [`get_aistudio_file_metadata`] or [`get_bos_file_metadata`] based on a URL.
+
+ Args:
+ commit_hash (`str`, *optional*):
+ The commit_hash related to the file.
+ etag (`str`, *optional*):
+ Etag of the file on the server.
+ location (`str`):
+ Location where to download the file. Can be a Hub url or not (CDN).
+ size (`int`, *optional*):
+ Size of the file. In case of an LFS file, contains the size of the actual
+ LFS file, not the pointer.
+ """
+
+ commit_hash: Optional[str]
+ etag: Optional[str]
+ location: str
+ size: Optional[int]
+
+
+def raise_for_status(response: Response, endpoint_name: Optional[str] = None) -> None:
+ try:
+ response.raise_for_status()
+ except HTTPError as e:
+ if response.status_code == 404:
+ message = f"{response.status_code} Client Error." + "\n\n" + f"Entry Not Found for url: {response.url}."
+ raise EntryNotFoundError(message, None) from e
+ elif response.status_code == 400:
+ message = (
+ f"\n\nBad request for {endpoint_name} endpoint:" if endpoint_name is not None else "\n\nBad request:"
+ )
+ raise BadRequestError(message, response=None) from e
+ raise HfHubHTTPError(str(e), response=None) from e
+
+
+def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
+ """Return whether the symlinks are supported on the machine.
+
+ Since symlinks support can change depending on the mounted disk, we need to check
+ on the precise cache folder.
+
+ Args:
+ cache_dir (`str`, `Path`, *optional*):
+ Path to the folder where cached files are stored.
+
+ Returns: [bool] Whether symlinks are supported in the directory.
+ """
+ assert cache_dir is not None
+ cache_dir = str(Path(cache_dir).expanduser().resolve()) # make it unique
+
+ # Check symlink compatibility only once (per cache directory) at first time use
+ if cache_dir not in _are_symlinks_supported_in_dir:
+ _are_symlinks_supported_in_dir[cache_dir] = True
+
+ os.makedirs(cache_dir, exist_ok=True)
+ with SoftTemporaryDirectory(dir=cache_dir) as tmpdir:
+ src_path = Path(tmpdir) / "dummy_file_src"
+ src_path.touch()
+ dst_path = Path(tmpdir) / "dummy_file_dst"
+
+ # Relative source path as in `_create_symlink``
+ relative_src = os.path.relpath(src_path, start=os.path.dirname(dst_path))
+ try:
+ os.symlink(relative_src, dst_path)
+ except OSError:
+ # Likely running on Windows
+ _are_symlinks_supported_in_dir[cache_dir] = False
+
+ if not DISABLE_SYMLINKS_WARNING:
+ message = (
+ "cache-system uses symlinks by default to"
+ " efficiently store duplicated files but your machine does not"
+ f" support them in {cache_dir}. Caching files will still work"
+ " but in a degraded version that might require more space on"
+ " your disk. This warning can be disabled by setting the"
+ " `DISABLE_SYMLINKS_WARNING` environment variable."
+ )
+ if os.name == "nt":
+ message += (
+ "\nTo support symlinks on Windows, you either need to"
+ " activate Developer Mode or to run Python as an"
+ " administrator. In order to see activate developer mode,"
+ " see this article:"
+ " https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development"
+ )
+ warnings.warn(message)
+
+ return _are_symlinks_supported_in_dir[cache_dir]
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/hf_hub_download.py b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/hf_hub_download.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd030852567dd028f4703005cb837100747da80d
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/utils/downloader/hf_hub_download.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from huggingface_hub import file_exists as hf_hub_file_exists # noqa: F401
+from huggingface_hub import hf_hub_download # noqa: F401
+from huggingface_hub import ( # noqa: F401
+ try_to_load_from_cache as hf_hub_try_to_load_from_cache,
+)
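This module only re-exports `huggingface_hub` entry points under ppdiffusers-local aliases. A minimal sketch of how the aliases are typically consumed is shown below; the repo id and filename are illustrative placeholders, not values taken from this diff.

```python
from ppdiffusers.utils.downloader.hf_hub_download import (
    hf_hub_download,
    hf_hub_file_exists,
    hf_hub_try_to_load_from_cache,
)

repo_id = "runwayml/stable-diffusion-v1-5"  # illustrative repo
filename = "model_index.json"

# Reuse a cached copy when possible, otherwise check the Hub and download.
cached = hf_hub_try_to_load_from_cache(repo_id, filename)
if isinstance(cached, str):
    path = cached
elif hf_hub_file_exists(repo_id, filename):
    path = hf_hub_download(repo_id, filename=filename)
else:
    raise FileNotFoundError(f"{filename} not found in {repo_id}")
print(path)
```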
diff --git a/VLMEvalKit_old/PaddleMIX/ppdiffusers/requirements.txt b/VLMEvalKit_old/PaddleMIX/ppdiffusers/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3b7fa12a3c248df7f991de3e61eb3fdc2066ac75
--- /dev/null
+++ b/VLMEvalKit_old/PaddleMIX/ppdiffusers/requirements.txt
@@ -0,0 +1,18 @@
+paddlenlp>=3.0.0b2
+safetensors>=0.3.1
+ftfy
+regex
+Pillow
+opencv-python
+av
+# for test
+parameterized
+requests_mock
+omegaconf
+note_seq
+urllib3<=2.0.0
+einops>=0.6.1
+paddlesde
+ligo-segments
+huggingface_hub==0.23.0
+hf_transfer
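Pinning `huggingface_hub==0.23.0` alongside `hf_transfer` suggests accelerated downloads are intended; note that `hf_transfer` only takes effect when the `HF_HUB_ENABLE_HF_TRANSFER` environment variable is set before `huggingface_hub` is imported. A minimal sketch, with an illustrative repo id:

```python
import os

# Opt in to hf_transfer-backed downloads; must be set before importing huggingface_hub.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download

path = hf_hub_download("runwayml/stable-diffusion-v1-5", "model_index.json")
print(path)
```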
diff --git a/VLMEvalKit_old/docs/en/_static/image/logo.svg b/VLMEvalKit_old/docs/en/_static/image/logo.svg
new file mode 100644
index 0000000000000000000000000000000000000000..043530572afb48d0eac26b4b53d448aae6e9a9af
--- /dev/null
+++ b/VLMEvalKit_old/docs/en/_static/image/logo.svg
@@ -0,0 +1,24 @@
+<!-- VLMEvalKit logo (created with Fabric.js 5.3.0); SVG markup not preserved here,
+     only the text element "VLMEvalKit". -->
diff --git a/lightning-hydra-template/src/models/components/__init__.py b/lightning-hydra-template/src/models/components/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/lightning-hydra-template/tests/helpers/__init__.py b/lightning-hydra-template/tests/helpers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/lightning-hydra-template/tests/helpers/run_if.py b/lightning-hydra-template/tests/helpers/run_if.py
new file mode 100644
index 0000000000000000000000000000000000000000..9703af425129d0225d0aeed20dedc3ed35bc7548
--- /dev/null
+++ b/lightning-hydra-template/tests/helpers/run_if.py
@@ -0,0 +1,142 @@
+"""Adapted from:
+
+https://github.com/PyTorchLightning/pytorch-lightning/blob/master/tests/helpers/runif.py
+"""
+
+import sys
+from typing import Any, Dict, Optional
+
+import pytest
+import torch
+from packaging.version import Version
+from pkg_resources import get_distribution
+from pytest import MarkDecorator
+
+from tests.helpers.package_available import (
+ _COMET_AVAILABLE,
+ _DEEPSPEED_AVAILABLE,
+ _FAIRSCALE_AVAILABLE,
+ _IS_WINDOWS,
+ _MLFLOW_AVAILABLE,
+ _NEPTUNE_AVAILABLE,
+ _SH_AVAILABLE,
+ _TPU_AVAILABLE,
+ _WANDB_AVAILABLE,
+)
+
+
+class RunIf:
+ """RunIf wrapper for conditional skipping of tests.
+
+ Fully compatible with `@pytest.mark`.
+
+ Example:
+
+ ```python
+ @RunIf(min_torch="1.8")
+ @pytest.mark.parametrize("arg1", [1.0, 2.0])
+ def test_wrapper(arg1):
+ assert arg1 > 0
+ ```
+ """
+
+ def __new__(
+ cls,
+ min_gpus: int = 0,
+ min_torch: Optional[str] = None,
+ max_torch: Optional[str] = None,
+ min_python: Optional[str] = None,
+ skip_windows: bool = False,
+ sh: bool = False,
+ tpu: bool = False,
+ fairscale: bool = False,
+ deepspeed: bool = False,
+ wandb: bool = False,
+ neptune: bool = False,
+ comet: bool = False,
+ mlflow: bool = False,
+ **kwargs: Dict[Any, Any],
+ ) -> MarkDecorator:
+ """Creates a new `@RunIf` `MarkDecorator` decorator.
+
+ :param min_gpus: Min number of GPUs required to run test.
+ :param min_torch: Minimum pytorch version to run test.
+ :param max_torch: Maximum pytorch version to run test.
+ :param min_python: Minimum python version required to run test.
+ :param skip_windows: Skip test for Windows platform.
+ :param tpu: If a TPU is required to run the test.
+ :param sh: If `sh` module is required to run the test.
+ :param fairscale: If `fairscale` module is required to run the test.
+ :param deepspeed: If `deepspeed` module is required to run the test.
+ :param wandb: If `wandb` module is required to run the test.
+ :param neptune: If `neptune` module is required to run the test.
+ :param comet: If `comet` module is required to run the test.
+ :param mlflow: If `mlflow` module is required to run the test.
+ :param kwargs: Native `pytest.mark.skipif` keyword arguments.
+ """
+ conditions = []
+ reasons = []
+
+ if min_gpus:
+ conditions.append(torch.cuda.device_count() < min_gpus)
+ reasons.append(f"GPUs>={min_gpus}")
+
+ if min_torch:
+ torch_version = get_distribution("torch").version
+ conditions.append(Version(torch_version) < Version(min_torch))
+ reasons.append(f"torch>={min_torch}")
+
+ if max_torch:
+ torch_version = get_distribution("torch").version
+ conditions.append(Version(torch_version) >= Version(max_torch))
+ reasons.append(f"torch<{max_torch}")
+
+ if min_python:
+ py_version = (
+ f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
+ )
+ conditions.append(Version(py_version) < Version(min_python))
+ reasons.append(f"python>={min_python}")
+
+ if skip_windows:
+ conditions.append(_IS_WINDOWS)
+ reasons.append("does not run on Windows")
+
+ if tpu:
+ conditions.append(not _TPU_AVAILABLE)
+ reasons.append("TPU")
+
+ if sh:
+ conditions.append(not _SH_AVAILABLE)
+ reasons.append("sh")
+
+ if fairscale:
+ conditions.append(not _FAIRSCALE_AVAILABLE)
+ reasons.append("fairscale")
+
+ if deepspeed:
+ conditions.append(not _DEEPSPEED_AVAILABLE)
+ reasons.append("deepspeed")
+
+ if wandb:
+ conditions.append(not _WANDB_AVAILABLE)
+ reasons.append("wandb")
+
+ if neptune:
+ conditions.append(not _NEPTUNE_AVAILABLE)
+ reasons.append("neptune")
+
+ if comet:
+ conditions.append(not _COMET_AVAILABLE)
+ reasons.append("comet")
+
+ if mlflow:
+ conditions.append(not _MLFLOW_AVAILABLE)
+ reasons.append("mlflow")
+
+ reasons = [rs for cond, rs in zip(conditions, reasons) if cond]
+ return pytest.mark.skipif(
+ condition=any(conditions),
+ reason=f"Requires: [{' + '.join(reasons)}]",
+ **kwargs,
+ )
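Since `RunIf.__new__` returns a plain `pytest.mark.skipif` marker, the conditions compose freely with other pytest marks, as in the class docstring. A small hedged usage sketch (the test body is illustrative, not part of the template):

```python
import pytest
import torch

from tests.helpers.run_if import RunIf


@RunIf(min_gpus=1, skip_windows=True)
@pytest.mark.parametrize("size", [4, 8])
def test_cuda_matmul(size):
    # Skipped unless at least one GPU is visible and the platform is not Windows.
    x = torch.randn(size, size, device="cuda")
    assert (x @ x).shape == (size, size)
```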
diff --git a/lightning-hydra-template/tests/helpers/run_sh_command.py b/lightning-hydra-template/tests/helpers/run_sh_command.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdd2ed633f1185dd7936924616be6a6359a7bca7
--- /dev/null
+++ b/lightning-hydra-template/tests/helpers/run_sh_command.py
@@ -0,0 +1,22 @@
+from typing import List
+
+import pytest
+
+from tests.helpers.package_available import _SH_AVAILABLE
+
+if _SH_AVAILABLE:
+ import sh
+
+
+def run_sh_command(command: List[str]) -> None:
+ """Default method for executing shell commands with `pytest` and `sh` package.
+
+ :param command: A list of shell commands as strings.
+ """
+ msg = None
+ try:
+ sh.python(command)
+ except sh.ErrorReturnCode as e:
+ msg = e.stderr.decode()
+ if msg:
+ pytest.fail(msg=msg)
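A typical pairing is to guard the test with `@RunIf(sh=True)` so it is skipped when the `sh` package is missing, then drive a script through `run_sh_command`. The script path and overrides below are placeholders rather than values taken from this diff.

```python
from tests.helpers.run_if import RunIf
from tests.helpers.run_sh_command import run_sh_command


@RunIf(sh=True)
def test_train_fast_dev_run():
    # Runs `python src/train.py ...` via the `sh` package and fails the test
    # if the command exits with a non-zero status.
    run_sh_command(["src/train.py", "trainer=cpu", "++trainer.fast_dev_run=true"])
```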