tuandunghcmut committed (verified)
Commit a60bdd6 · Parent(s): f435a72

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.
Files changed (50):
  1. PaddleMIX/.travis/precommit.sh +21 -0
  2. PaddleMIX/applications/gradio_text2image.py +54 -0
  3. PaddleMIX/comfyui/README.md +49 -0
  4. PaddleMIX/deploy/README_en.md +108 -0
  5. PaddleMIX/docs/train_tutorial.md +10 -0
  6. PaddleMIX/paddlemix/activations.py +174 -0
  7. PaddleMIX/paddlemix/checkpoint.py +216 -0
  8. PaddleMIX/ppdiffusers/LICENSE +203 -0
  9. PaddleMIX/ppdiffusers/Makefile +30 -0
  10. PaddleMIX/ppdiffusers/deploy-deprecated/export.md +67 -0
  11. PaddleMIX/ppdiffusers/deploy-deprecated/export.sh +17 -0
  12. PaddleMIX/ppdiffusers/deploy-deprecated/export_model.py +201 -0
  13. PaddleMIX/ppdiffusers/deploy-deprecated/gradio_demo.py +683 -0
  14. PaddleMIX/ppdiffusers/deploy-deprecated/infer.py +742 -0
  15. PaddleMIX/ppdiffusers/deploy-deprecated/infer_dygraph.py +380 -0
  16. PaddleMIX/ppdiffusers/deploy-deprecated/infer_dygraph_torch.py +447 -0
  17. PaddleMIX/ppdiffusers/deploy-deprecated/requirements.txt +2 -0
  18. PaddleMIX/ppdiffusers/deploy/README.md +65 -0
  19. PaddleMIX/ppdiffusers/ppdiffusers/__init__.py +814 -0
  20. PaddleMIX/ppdiffusers/ppdiffusers/accelerate/__init__.py +30 -0
  21. PaddleMIX/ppdiffusers/ppdiffusers/accelerate/logging.py +123 -0
  22. PaddleMIX/ppdiffusers/ppdiffusers/accelerate/optimizer.py +180 -0
  23. PaddleMIX/ppdiffusers/ppdiffusers/accelerate/scheduler.py +96 -0
  24. PaddleMIX/ppdiffusers/ppdiffusers/accelerate/tracking.py +1103 -0
  25. PaddleMIX/ppdiffusers/ppdiffusers/callbacks.py +156 -0
  26. PaddleMIX/ppdiffusers/ppdiffusers/configuration_utils.py +695 -0
  27. PaddleMIX/ppdiffusers/ppdiffusers/image_processor.py +671 -0
  28. PaddleMIX/ppdiffusers/ppdiffusers/initializer.py +20 -0
  29. PaddleMIX/ppdiffusers/ppdiffusers/models/attention_processor.py +0 -0
  30. PaddleMIX/ppdiffusers/ppdiffusers/models/autoencoder_kl_cogvideox.py +1190 -0
  31. PaddleMIX/ppdiffusers/ppdiffusers/models/autoencoder_kl_temporal_decoder.py +396 -0
  32. PaddleMIX/ppdiffusers/ppdiffusers/models/autoencoder_tiny.py +363 -0
  33. PaddleMIX/ppdiffusers/ppdiffusers/models/cogvideox_transformer_3d.py +394 -0
  34. PaddleMIX/ppdiffusers/ppdiffusers/models/consistency_decoder_vae.py +445 -0
  35. PaddleMIX/ppdiffusers/ppdiffusers/models/controlnet.py +889 -0
  36. PaddleMIX/ppdiffusers/ppdiffusers/models/dit_llama_t2i.py +582 -0
  37. PaddleMIX/ppdiffusers/ppdiffusers/models/downsampling.py +383 -0
  38. PaddleMIX/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py +158 -0
  39. PaddleMIX/ppdiffusers/ppdiffusers/models/lora.py +462 -0
  40. PaddleMIX/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py +462 -0
  41. PaddleMIX/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py +713 -0
  42. PaddleMIX/ppdiffusers/ppdiffusers/models/lvdm_util.py +296 -0
  43. PaddleMIX/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py +117 -0
  44. PaddleMIX/ppdiffusers/ppdiffusers/models/modeling_utils.py +1356 -0
  45. PaddleMIX/ppdiffusers/ppdiffusers/models/modelscope_gaussion_sdedit.py +451 -0
  46. PaddleMIX/ppdiffusers/ppdiffusers/models/modelscope_st_unet_video2video.py +409 -0
  47. PaddleMIX/ppdiffusers/ppdiffusers/models/prior_transformer.py +398 -0
  48. PaddleMIX/ppdiffusers/ppdiffusers/models/simplified_sd3.py +216 -0
  49. PaddleMIX/ppdiffusers/ppdiffusers/models/transformer_2d.py +538 -0
  50. PaddleMIX/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py +752 -0
PaddleMIX/.travis/precommit.sh ADDED
@@ -0,0 +1,21 @@
+ #!/bin/bash
+
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ function abort(){
+     echo "Your commit not fit PaddlePaddle code style" 1>&2
+     echo "Please use pre-commit scripts to auto-format your code" 1>&2
+     exit 1
+ }
PaddleMIX/applications/gradio_text2image.py ADDED
@@ -0,0 +1,54 @@
+ from paddlemix.appflow import Appflow
+ from ppdiffusers.utils import load_image
+ import paddle
+ import imageio
+
+ from PIL import Image
+ import gradio as gr
+ import traceback
+
+ # upscaling
+ def ups_fun(low_res_img, prompt):
+     low_res_img = Image.fromarray(low_res_img.astype('uint8')).convert('RGB')
+     app = Appflow(app='image2image_text_guided_upscaling',models=['stabilityai/stable-diffusion-x4-upscaler'])
+     image = app(prompt=prompt,image=low_res_img)['result']
+     return image
+
+ # text_guided_generation
+ def tge_fun(image, prompt_pos, prompt_neg):
+     image = Image.fromarray(image.astype('uint8')).convert('RGB')
+     app = Appflow(app='image2image_text_guided_generation',models=['Linaqruf/anything-v3.0'])
+     image = app(prompt=prompt_pos,negative_prompt=prompt_neg,image=image)['result']
+     return image
+
+ # video_generation
+ def vge_fun(prompt):
+     app = Appflow(app='text_to_video_generation',models=['damo-vilab/text-to-video-ms-1.7b'])
+     video_frames = app(prompt=prompt,num_inference_steps=25)['result']
+     imageio.mimsave("gen_video.gif", video_frames, duration=8)
+     return "gen_video.gif"
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Appflow应用:text2image")
+     with gr.Tab("文本引导的图像放大"):
+         with gr.Row():
+             ups_image_in = gr.Image(label = "输入图片")
+             ups_image_out = gr.Image(label = "输出图片")
+         ups_text_in = gr.Text(label = "Prompt")
+         ups_button = gr.Button()
+         ups_button.click(fn=ups_fun, inputs = [ups_image_in, ups_text_in], outputs = [ups_image_out])
+     with gr.Tab("文本引导的图像变换"):
+         with gr.Row():
+             tge_image_in = gr.Image(label = "输入图片")
+             tge_image_out = gr.Image(label = "输出图片")
+         tge_text_pos_in = gr.Text(label = "Positive Prompt")
+         tge_text_neg_in = gr.Text(label = "Negative Prompt")
+         tge_button = gr.Button()
+         tge_button.click(fn=tge_fun, inputs = [tge_image_in, tge_text_pos_in, tge_text_neg_in], outputs = [tge_image_out])
+     with gr.Tab("文本条件的视频生成"):
+         vge_text_in = gr.Text(label = "Prompt")
+         vge_video_out = gr.Video(label = "输出视频")
+         vge_button = gr.Button()
+         vge_button.click(fn=vge_fun, inputs = [vge_text_in], outputs = [vge_video_out])
+
+ demo.launch()
PaddleMIX/comfyui/README.md ADDED
@@ -0,0 +1,49 @@
+ # PaddleMIX Extension Plugins for ComfyUI
+
+ ## Introduction
+ [ComfyUI](https://github.com/comfyanonymous/ComfyUI/) is an AIGC program that is very popular in the open-source community. By splitting models into nodes and combining them into workflows, it lets different models work together to complete complex, advanced production tasks. This directory contains node extensions developed by PaddleMIX for ComfyUI, supporting multimodal capabilities such as text-to-image generation, image segmentation, and image captioning.
+
+ ## Installation and Usage Guide
+
+ ### 1. Prepare the ComfyUI environment
+
+ #### Deploy from source
+ Visit the [ComfyUI GitHub repository](https://github.com/comfyanonymous/ComfyUI) to get the source code.
+
+ #### Deploy with Docker
+ 1. **Pull the image archive and load it** (or simply use `docker pull` to fetch any ComfyUI image available online):
+ ```shell
+ wget https://paddlenlp.bj.bcebos.com/models/community/aistudio/comfyui_docker/comfyui_aistudio_v1.tar
+ docker load -i comfyui_aistudio_v1.tar
+ ```
+ 2. **Create a Docker container**, replacing the paths and image name as needed:
+ ```shell
+ nvidia-docker run --name comfyui_env -it -e HOME="/root" -w "/root" -v </path/to/temp_data_dir>:/root --ipc=host --net=host <docker-image-name> /bin/bash --login
+ ```
+ 3. **Enter the Docker environment**:
+ ```shell
+ docker exec -it comfyui_env /bin/bash
+ ```
+ 4. **Start ComfyUI**:
+ ```shell
+ cd /comfyui_env
+ ./python_env/bin/python ComfyUI/main.py --listen 0.0.0.0 --port 8889 &
+ ```
+
+ ### 2. Install the PaddleMIX ComfyUI extensions
+
+ Copy the desired plugin folder under PaddleMIX/comfyui/ into the ComfyUI/custom_nodes/ folder and install its requirements.txt; the extension is then ready to use.
+
+ #### Example: installing the text-to-image extension nodes
+ ```shell
+ # Copy the extension folder into the ComfyUI/custom_nodes/ directory
+ cp -r PaddleMIX/comfyui/ComfyUI_ppdiffusers /path/to/your/ComfyUI/custom_nodes/
+ # Install the dependencies required by the extension
+ pip install -r PaddleMIX/comfyui/ComfyUI_ppdiffusers/requirements.txt
+ ```
+
+ ### 3. Load a workflow
+
+ Each extension directory contains a workflows folder; load the JSON files in it through your browser to use the corresponding workflow. For concrete examples, see the [PaddleMIX ComfyUI extension examples](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/comfyui/ComfyUI_ppdiffusers).
+
PaddleMIX/deploy/README_en.md ADDED
@@ -0,0 +1,108 @@
+ # PaddleMIX Inference Deployment
+
+ [[中文文档](README.md)]
+
+ PaddleMIX uses Paddle Inference and provides a Python-based deployment solution. There are two deployment methods:
+
+ 1. **APPflow deployment**: by setting the `static_mode = True` variable in APPflow, you can enable static graph inference and optionally accelerate inference with TensorRT. Note that not all models support static graph mode or TensorRT; please refer to the [Multi Modal And Scenario](../applications/README_en.md/#multi-modal-and-scenario) section for specific model support.
+
+ 2. **Single model deployment**: export a single model and run prediction with Python (see section 2).
+
+ ## 1. APPflow Deployment
+
+ For APPflow usage, set the `static_mode = True` variable to enable static graph inference and optionally accelerate inference using TensorRT.
+
+ ### 1.1 Example
+
+ ```python
+ >>> from paddlemix.appflow import Appflow
+ >>> from PIL import Image
+
+ >>> task = Appflow(app="openset_det_sam",
+                    models=["GroundingDino/groundingdino-swint-ogc", "Sam/SamVitH-1024"],
+                    static_mode=True,
+                    precision="fp32")
+ >>> image_pil = Image.open("beauty.png").convert("RGB")
+ >>> result = task(image=image_pil, prompt="women")
+ ```
+
+ ### 1.2 Parameter Explanation
+
+ | Parameter | Required? | Meaning |
+ |-------|-------|---------------------------------------------------------------------------------------------|
+ | --app | Yes | Application name |
+ | --models | Yes | Model(s) used; can be a single model or multiple models |
+ | --static_mode | Optional | Whether to use static graph inference; defaults to False |
+ | --precision | Optional | When `static_mode == True`, FP32 is used by default; `trt_fp32` or `trt_fp16` can be selected instead |
+
+ Notes:
+ - Some models do not support static graph mode or TensorRT. For specific information, please refer to [Multi Modal And Scenario](../applications/README_en.md/#multi-modal-and-scenario).
+
+ - The generated static graph will be located in the folder corresponding to the model name, for example: `GroundingDino/groundingdino-swint-ogc/`.
+
+ ## 2. Single Model Prediction Deployment
+
+ Python-based prediction deployment mainly involves two steps:
+ - exporting the predictive model
+ - performing prediction using Python
+
+ Currently supported models:
+ - [blip2](./blip2/README.md)
+ - [groundingdino](./groundingdino/README.md)
+ - [sam](./sam/README.md)
+ - [qwen_vl](./qwen_vl/README.md)
+
+ The following uses groundingdino as an example.
+
+ ### 2.1 Exporting the Predictive Model
+
+ ```bash
+ cd deploy/groundingdino
+ # export the groundingdino model
+ python export.py \
+ --dino_type GroundingDino/groundingdino-swint-ogc
+ ```
+ The model will be exported to the corresponding directory, including the `model_state.pdiparams`, `model_state.pdiparams.info`, `model_state.pdmodel` and other files.
+
+ ### 2.2 Python-based Inference
+
+ ```bash
+ python predict.py \
+ --text_encoder_type GroundingDino/groundingdino-swint-ogc \
+ --model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
+ --input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+ --output_dir ./groundingdino_predict_output \
+ --prompt "bus"
+ ```
+
+ ## 3. Benchmark
+
+ > Note: environment
+ > Paddle 3.0, PaddleMIX release/2.0, PaddleNLP 2.7.2, A100 80G.
+
+ ### 3.1 Benchmark Command
+
+ Add `--benchmark` to the run command in the corresponding model directory under `deploy` to obtain the model's running time.
+ Example: GroundingDino benchmark:
+
+ ```bash
+ cd deploy/groundingdino
+ python predict.py \
+ --text_encoder_type GroundingDino/groundingdino-swint-ogc \
+ --model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
+ --input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
+ --output_dir ./groundingdino_predict_output \
+ --prompt "bus" \
+ --benchmark True
+ ```
+
+ | Model | Image size | dtype | Paddle Deploy |
+ |-|-|-|-|
+ | qwen-vl-7b | 448*448 | fp16 | 669.8 ms |
+ | llava-1.5-7b | 336*336 | fp16 | 981.2 ms |
+ | llava-1.6-7b | 336*336 | fp16 | 778.7 ms |
+ | groundingDino/groundingdino-swint-ogc | 800*1193 | fp32 | 100 ms |
+ | Sam/SamVitH-1024 | 1024*1024 | fp32 | 121 ms |
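As a supplement to the deployment README above, the sketch below shows how the `static_mode` and `precision` parameters from section 1.2 combine to request TensorRT FP16 inference. It reuses the names from the 1.1 example and is only a hedged illustration; whether TensorRT acceleration actually applies depends on the support matrix referenced in the notes.

```python
from paddlemix.appflow import Appflow
from PIL import Image

# Same application as in section 1.1, but asking for TensorRT FP16 acceleration.
# Only valid for models that support static graph + TensorRT (see the support matrix).
task = Appflow(
    app="openset_det_sam",
    models=["GroundingDino/groundingdino-swint-ogc", "Sam/SamVitH-1024"],
    static_mode=True,       # run static graph inference
    precision="trt_fp16",   # or "trt_fp32"; plain "fp32" is the default
)
image_pil = Image.open("beauty.png").convert("RGB")
result = task(image=image_pil, prompt="women")
```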
PaddleMIX/docs/train_tutorial.md ADDED
@@ -0,0 +1,10 @@
+ # Train Tutorial
+
+
+ ## Training and Fine-tuning Examples
+ - [Blip2](../paddlemix/examples/blip2/README.md)
+ - [clip](../paddlemix/examples/clip/README.md)
+ - [coca](../paddlemix/examples/coca/README.md)
+ - [eva02](../paddlemix/examples/eva02/README.md)
+ - [evaclip](../paddlemix/examples/evaclip/README.md)
+ - [Stable Diffusion](../ppdiffusers/examples/text_to_image/README.md)
PaddleMIX/paddlemix/activations.py ADDED
@@ -0,0 +1,174 @@
1
+ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import math
17
+ from collections import OrderedDict
18
+
19
+ import paddle
20
+ import paddle.nn.functional as F
21
+ from paddle import Tensor, nn
22
+
23
+
24
+ class NewGELUActivation(nn.Layer):
25
+ """
26
+ Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
27
+ the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
28
+ """
29
+
30
+ def forward(self, input: Tensor) -> Tensor:
31
+ return (
32
+ 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0))))
33
+ )
34
+
35
+
36
+ class GELUActivation(nn.Layer):
37
+ """
38
+ Original Implementation of the GELU activation function in Google BERT repo when initially created. For
39
+ information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
40
+ torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
41
+ Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
42
+ """
43
+
44
+ def __init__(self, use_gelu_python: bool = False):
45
+ super().__init__()
46
+ if use_gelu_python:
47
+ self.act = self._gelu_python
48
+ else:
49
+ self.act = nn.functional.gelu
50
+
51
+ def _gelu_python(self, input: Tensor) -> Tensor:
52
+ return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0)))
53
+
54
+ def forward(self, input: Tensor) -> Tensor:
55
+ return self.act(input)
56
+
57
+
58
+ class FastGELUActivation(nn.Layer):
59
+ """
60
+ Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
61
+ """
62
+
63
+ def forward(self, input: Tensor) -> Tensor:
64
+ return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))
65
+
66
+
67
+ class QuickGELUActivation(nn.Layer):
68
+ """
69
+ Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
70
+ """
71
+
72
+ def forward(self, input: Tensor) -> Tensor:
73
+ return input * F.sigmoid(1.702 * input)
74
+
75
+
76
+ class ClippedGELUActivation(nn.Layer):
77
+ """
78
+ Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
79
+ it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
80
+ https://arxiv.org/abs/2004.09602.
81
+
82
+ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
83
+ initially created.
84
+
85
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
86
+ torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415
87
+ """
88
+
89
+ def __init__(self, min: float, max: float):
90
+ if min > max:
91
+ raise ValueError(f"min should be < max (got min: {min}, max: {max})")
92
+
93
+ super().__init__()
94
+ self.min = min
95
+ self.max = max
96
+
97
+ def forward(self, x: Tensor) -> Tensor:
98
+ return paddle.clip(gelu(x), self.min, self.max)
99
+
100
+
101
+ class SiLUActivation(nn.Layer):
102
+ """
103
+ See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
104
+ Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
105
+ Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
106
+ Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
107
+ later.
108
+ """
109
+
110
+ def forward(self, input: Tensor) -> Tensor:
111
+ return F.silu(input)
112
+
113
+
114
+ class MishActivation(nn.Layer):
115
+ """
116
+ See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
117
+ visit the official repository for the paper: https://github.com/digantamisra98/Mish
118
+ """
119
+
120
+ def forward(self, input: Tensor) -> Tensor:
121
+ return F.mish(input)
122
+
123
+
124
+ class LinearActivation(nn.Layer):
125
+ """
126
+ Applies the linear activation function, i.e. forwarding input directly to output.
127
+ """
128
+
129
+ def forward(self, input: Tensor) -> Tensor:
130
+ return input
131
+
132
+
133
+ class ClassInstantier(OrderedDict):
134
+ def __getitem__(self, key):
135
+ content = super().__getitem__(key)
136
+ cls, kwargs = content if isinstance(content, tuple) else (content, {})
137
+ return cls(**kwargs)
138
+
139
+
140
+ ACT2CLS = {
141
+ "gelu": GELUActivation,
142
+ "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
143
+ "gelu_fast": FastGELUActivation,
144
+ "gelu_new": NewGELUActivation,
145
+ "gelu_python": (GELUActivation, {"use_gelu_python": True}),
146
+ "linear": LinearActivation,
147
+ "mish": MishActivation,
148
+ "quick_gelu": QuickGELUActivation,
149
+ "relu": nn.ReLU,
150
+ "relu6": nn.ReLU6,
151
+ "sigmoid": nn.Sigmoid,
152
+ "silu": SiLUActivation,
153
+ "swish": SiLUActivation,
154
+ "tanh": nn.Tanh,
155
+ }
156
+ ACT2FN = ClassInstantier(ACT2CLS)
157
+
158
+
159
+ def get_activation(activation_string):
160
+ if activation_string in ACT2FN:
161
+ return ACT2FN[activation_string]
162
+ else:
163
+ raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
164
+
165
+
166
+ # For backwards compatibility with: from activations import gelu_python
167
+ gelu_python = get_activation("gelu_python")
168
+ gelu_new = get_activation("gelu_new")
169
+ gelu = get_activation("gelu")
170
+ gelu_fast = get_activation("gelu_fast")
171
+ quick_gelu = get_activation("quick_gelu")
172
+ silu = get_activation("silu")
173
+ mish = get_activation("mish")
174
+ linear_act = get_activation("linear")
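A brief usage sketch of the activation registry defined in this file may help; it only uses names visible above (`get_activation`, `ACT2FN`) and assumes a working paddlemix installation.

```python
import paddle
from paddlemix.activations import ACT2FN, get_activation

# get_activation looks the name up in ACT2FN and returns a freshly built layer.
act = get_activation("quick_gelu")          # QuickGELUActivation instance
x = paddle.randn([2, 8])
y = act(x)                                  # equivalent to x * sigmoid(1.702 * x)

# Entries registered as (cls, kwargs) tuples are instantiated with those kwargs,
# e.g. "gelu_10" yields a ClippedGELUActivation clipping outputs to [-10, 10].
clipped = ACT2FN["gelu_10"]
print(type(clipped).__name__)               # ClippedGELUActivation
```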
PaddleMIX/paddlemix/checkpoint.py ADDED
@@ -0,0 +1,216 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import shutil
17
+
18
+ import paddle
19
+ import paddle.nn.functional as F
20
+
21
+
22
+ def save(args, model, optimizer, epoch=0, step=0, output_dir="", is_best=False):
23
+ """
24
+ save the state dicts of model and optimizer into an checkpoint.
25
+ """
26
+ if args.dp_rank != 0:
27
+ return
28
+
29
+ if output_dir and isinstance(output_dir, str):
30
+ output_dir = os.path.join(output_dir, "epoch_%d_step_%d" % (epoch, step))
31
+ if not os.path.exists(output_dir):
32
+ os.makedirs(output_dir, exist_ok=True)
33
+ print("Save model to %s" % output_dir)
34
+
35
+ save_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}".format(output_dir, args.mp_rank, args.sharding_rank)
36
+
37
+ # if args.sharding_stage == 3:
38
+ # model.get_all_parameters(convert2cpu=False)
39
+ paddle.save(model.state_dict(), os.path.join(save_dir, "model.pdparams"))
40
+ paddle.save(optimizer.state_dict(), os.path.join(save_dir, "model_state.pdopt"))
41
+ if is_best:
42
+ shutil.copyfile(os.path.join(save_dir, "model.pdparams"), os.path.join(save_dir, "model_best.pdparams"))
43
+ meta_dict = {
44
+ "epoch": epoch,
45
+ "step": step,
46
+ "cuda_rng_state": paddle.get_cuda_rng_state(),
47
+ }
48
+ paddle.save(meta_dict, os.path.join(save_dir, "meta_state.pdopt"))
49
+
50
+ else:
51
+ raise TypeError("`save` requires a valid value of `output_dir`.")
52
+
53
+
54
+ def load_model(args, model, optimizer=None, ckpt_dir=""):
55
+ """
56
+ load the saved checkpoint file and update the state dicts of model and optimizer.
57
+ """
58
+ if ckpt_dir and isinstance(ckpt_dir, str) and os.path.isdir(ckpt_dir):
59
+ print("Try to load checkpoint from %s " % ckpt_dir)
60
+
61
+ load_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}".format(ckpt_dir, args.mp_rank, args.sharding_rank)
62
+ model_path = os.path.join(load_dir, "model.pdparams")
63
+ opt_path = os.path.join(load_dir, "model_state.pdopt")
64
+ # meta_path = os.path.join(load_dir, "meta_state.pdopt")
65
+
66
+ if os.path.exists(model_path):
67
+ model_dict = paddle.load(model_path)
68
+ for name, param in model.state_dict().items():
69
+ assert name in model_dict.keys(), "No param named `{}` was found in checkpoint file.".format(name)
70
+
71
+ if param.dtype != model_dict[name].dtype:
72
+ model_dict[name] = model_dict[name].cast(param.dtype)
73
+
74
+ model.set_state_dict(model_dict)
75
+ del model_dict
76
+ else:
77
+ raise ValueError("No checkpoint file found in %s" % model_path)
78
+
79
+ if os.path.exists(opt_path):
80
+ opt_dict = paddle.load(opt_path)
81
+ optimizer.set_state_dict(opt_dict)
82
+ del opt_dict
83
+ else:
84
+ print("No optimizer checkpoint file found in %s." % opt_path)
85
+
86
+ # if os.path.exists(meta_path):
87
+ # meta_dict = paddle.load(meta_path)
88
+ # load_recovery = {
89
+ # 'step': meta_dict['step'],
90
+ # 'epoch': meta_dict['epoch'],
91
+ # 'rng_state': meta_dict['cuda_rng_state']
92
+ # }
93
+ # del meta_dict
94
+ # else:
95
+ # raise ValueError("No meta checkpoint file found in %s." %
96
+ # meta_path)
97
+
98
+ print("successfully load checkpoints")
99
+ elif ckpt_dir and os.path.isfile(ckpt_dir):
100
+ print("Try to load a whole checkpoint from %s " % ckpt_dir)
101
+ embedding_list = ["token_embedding"]
102
+ collinear_list = [
103
+ "proj",
104
+ "w1",
105
+ "w2",
106
+ "w3",
107
+ "head",
108
+ "c_fc",
109
+ "c_proj",
110
+ "q_bias",
111
+ "v_bias",
112
+ "q_proj",
113
+ "k_proj",
114
+ "v_proj",
115
+ "qkv",
116
+ "c_fc",
117
+ "c_proj",
118
+ "lm_head",
119
+ "fc1",
120
+ "fc2",
121
+ "fc3",
122
+ ]
123
+ rowlinear_list = ["out_proj"] # in eva_text_model.py, but evaclip do not use text model
124
+ all_list = collinear_list + rowlinear_list + embedding_list
125
+ skip_list = [
126
+ "visual.patch_embed.proj.weight",
127
+ "visual.patch_embed.proj.bias",
128
+ "patch_embed.proj.weight",
129
+ "patch_embed.proj.bias",
130
+ ]
131
+
132
+ col_list = []
133
+ row_list = []
134
+ emb_list = []
135
+
136
+ mp_rank = args.mp_rank
137
+ mp_size = max(args.tensor_parallel_degree, 1)
138
+
139
+ def col_split_modeldict(model_dict):
140
+ if len(model_dict.shape) == 2:
141
+ subbatch = model_dict.shape[1] // mp_size
142
+ return model_dict[:, mp_rank * subbatch : (mp_rank + 1) * subbatch]
143
+ elif len(model_dict.shape) == 1:
144
+ subbatch = model_dict.shape[0] // mp_size
145
+ return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch]
146
+
147
+ def row_split_modeldict(model_dict):
148
+ if len(model_dict.shape) == 2:
149
+ subbatch = model_dict.shape[0] // mp_size
150
+ return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch]
151
+ else:
152
+ return model_dict
153
+
154
+ def emb_split_modeldict(model_dict):
155
+ subbatch = model_dict.shape[0] // mp_size
156
+ return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch]
157
+
158
+ model_dict = paddle.load(ckpt_dir)
159
+ modelkeys = model_dict.keys()
160
+ for whole_key in modelkeys:
161
+ if "." not in whole_key:
162
+ continue
163
+
164
+ key = whole_key.split(".")[-2]
165
+ if whole_key in skip_list:
166
+ continue
167
+ if key in all_list:
168
+ if key in collinear_list:
169
+ col_list.append((key, model_dict[whole_key].shape))
170
+ model_dict[whole_key] = col_split_modeldict(model_dict[whole_key])
171
+ elif key in rowlinear_list:
172
+ row_list.append((key, model_dict[whole_key].shape))
173
+ model_dict[whole_key] = row_split_modeldict(model_dict[whole_key])
174
+ else:
175
+ emb_list.append((key, model_dict[whole_key].shape))
176
+ model_dict[whole_key] = emb_split_modeldict(model_dict[whole_key])
177
+
178
+ if hasattr(args, "context_length") and args.context_length != 77:
179
+ model_dict["text.positional_embedding"] = model_dict["text.positional_embedding"][: args.context_length, :]
180
+
181
+ # interpolate position embedding, only in eva02 finetune large size training
182
+ if "pos_embed" in model_dict and hasattr(model, "patch_embed"):
183
+ pos_embed_checkpoint = model_dict["pos_embed"] #
184
+ embedding_size = pos_embed_checkpoint.shape[-1]
185
+ num_patches = model.patch_embed.num_patches
186
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
187
+ # height (== width) for the checkpoint position embedding
188
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
189
+ # height (== width) for the new position embedding
190
+ new_size = int(num_patches**0.5)
191
+ # class_token and dist_token are kept unchanged
192
+ if orig_size != new_size:
193
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
194
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
195
+ # only the position tokens are interpolated
196
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
197
+ pos_tokens = pos_tokens.reshape([-1, orig_size, orig_size, embedding_size]).transpose(
198
+ perm=[0, 3, 1, 2]
199
+ )
200
+ pos_tokens = F.interpolate(
201
+ pos_tokens.astype(dtype="float32"), size=(new_size, new_size), mode="bicubic", align_corners=False
202
+ )
203
+ pos_tokens = pos_tokens.transpose(perm=[0, 2, 3, 1]).flatten(start_axis=1, stop_axis=2)
204
+ new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1)
205
+ model_dict["pos_embed"] = new_pos_embed
206
+
207
+ print("cast state_dict to default dtype:{}".format(paddle.get_default_dtype()))
208
+ for key, value in model_dict.items():
209
+ if "freqs_cos" in key or "freqs_sin" in key:
210
+ continue
211
+ model_dict[key] = paddle.cast(value, dtype=paddle.get_default_dtype())
212
+ model.set_state_dict(model_dict)
213
+ del model_dict
214
+ else:
215
+ print("`load` requires a valid value of `ckpt_dir`.")
216
+ raise TypeError("`load` requires a valid value of `ckpt_dir`.")
PaddleMIX/ppdiffusers/LICENSE ADDED
@@ -0,0 +1,203 @@
1
+
2
+
3
+ Apache License
4
+ Version 2.0, January 2004
5
+ http://www.apache.org/licenses/
6
+
7
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8
+
9
+ 1. Definitions.
10
+
11
+ "License" shall mean the terms and conditions for use, reproduction,
12
+ and distribution as defined by Sections 1 through 9 of this document.
13
+
14
+ "Licensor" shall mean the copyright owner or entity authorized by
15
+ the copyright owner that is granting the License.
16
+
17
+ "Legal Entity" shall mean the union of the acting entity and all
18
+ other entities that control, are controlled by, or are under common
19
+ control with that entity. For the purposes of this definition,
20
+ "control" means (i) the power, direct or indirect, to cause the
21
+ direction or management of such entity, whether by contract or
22
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
23
+ outstanding shares, or (iii) beneficial ownership of such entity.
24
+
25
+ "You" (or "Your") shall mean an individual or Legal Entity
26
+ exercising permissions granted by this License.
27
+
28
+ "Source" form shall mean the preferred form for making modifications,
29
+ including but not limited to software source code, documentation
30
+ source, and configuration files.
31
+
32
+ "Object" form shall mean any form resulting from mechanical
33
+ transformation or translation of a Source form, including but
34
+ not limited to compiled object code, generated documentation,
35
+ and conversions to other media types.
36
+
37
+ "Work" shall mean the work of authorship, whether in Source or
38
+ Object form, made available under the License, as indicated by a
39
+ copyright notice that is included in or attached to the work
40
+ (an example is provided in the Appendix below).
41
+
42
+ "Derivative Works" shall mean any work, whether in Source or Object
43
+ form, that is based on (or derived from) the Work and for which the
44
+ editorial revisions, annotations, elaborations, or other modifications
45
+ represent, as a whole, an original work of authorship. For the purposes
46
+ of this License, Derivative Works shall not include works that remain
47
+ separable from, or merely link (or bind by name) to the interfaces of,
48
+ the Work and Derivative Works thereof.
49
+
50
+ "Contribution" shall mean any work of authorship, including
51
+ the original version of the Work and any modifications or additions
52
+ to that Work or Derivative Works thereof, that is intentionally
53
+ submitted to Licensor for inclusion in the Work by the copyright owner
54
+ or by an individual or Legal Entity authorized to submit on behalf of
55
+ the copyright owner. For the purposes of this definition, "submitted"
56
+ means any form of electronic, verbal, or written communication sent
57
+ to the Licensor or its representatives, including but not limited to
58
+ communication on electronic mailing lists, source code control systems,
59
+ and issue tracking systems that are managed by, or on behalf of, the
60
+ Licensor for the purpose of discussing and improving the Work, but
61
+ excluding communication that is conspicuously marked or otherwise
62
+ designated in writing by the copyright owner as "Not a Contribution."
63
+
64
+ "Contributor" shall mean Licensor and any individual or Legal Entity
65
+ on behalf of whom a Contribution has been received by Licensor and
66
+ subsequently incorporated within the Work.
67
+
68
+ 2. Grant of Copyright License. Subject to the terms and conditions of
69
+ this License, each Contributor hereby grants to You a perpetual,
70
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71
+ copyright license to reproduce, prepare Derivative Works of,
72
+ publicly display, publicly perform, sublicense, and distribute the
73
+ Work and such Derivative Works in Source or Object form.
74
+
75
+ 3. Grant of Patent License. Subject to the terms and conditions of
76
+ this License, each Contributor hereby grants to You a perpetual,
77
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78
+ (except as stated in this section) patent license to make, have made,
79
+ use, offer to sell, sell, import, and otherwise transfer the Work,
80
+ where such license applies only to those patent claims licensable
81
+ by such Contributor that are necessarily infringed by their
82
+ Contribution(s) alone or by combination of their Contribution(s)
83
+ with the Work to which such Contribution(s) was submitted. If You
84
+ institute patent litigation against any entity (including a
85
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
86
+ or a Contribution incorporated within the Work constitutes direct
87
+ or contributory patent infringement, then any patent licenses
88
+ granted to You under this License for that Work shall terminate
89
+ as of the date such litigation is filed.
90
+
91
+ 4. Redistribution. You may reproduce and distribute copies of the
92
+ Work or Derivative Works thereof in any medium, with or without
93
+ modifications, and in Source or Object form, provided that You
94
+ meet the following conditions:
95
+
96
+ (a) You must give any other recipients of the Work or
97
+ Derivative Works a copy of this License; and
98
+
99
+ (b) You must cause any modified files to carry prominent notices
100
+ stating that You changed the files; and
101
+
102
+ (c) You must retain, in the Source form of any Derivative Works
103
+ that You distribute, all copyright, patent, trademark, and
104
+ attribution notices from the Source form of the Work,
105
+ excluding those notices that do not pertain to any part of
106
+ the Derivative Works; and
107
+
108
+ (d) If the Work includes a "NOTICE" text file as part of its
109
+ distribution, then any Derivative Works that You distribute must
110
+ include a readable copy of the attribution notices contained
111
+ within such NOTICE file, excluding those notices that do not
112
+ pertain to any part of the Derivative Works, in at least one
113
+ of the following places: within a NOTICE text file distributed
114
+ as part of the Derivative Works; within the Source form or
115
+ documentation, if provided along with the Derivative Works; or,
116
+ within a display generated by the Derivative Works, if and
117
+ wherever such third-party notices normally appear. The contents
118
+ of the NOTICE file are for informational purposes only and
119
+ do not modify the License. You may add Your own attribution
120
+ notices within Derivative Works that You distribute, alongside
121
+ or as an addendum to the NOTICE text from the Work, provided
122
+ that such additional attribution notices cannot be construed
123
+ as modifying the License.
124
+
125
+ You may add Your own copyright statement to Your modifications and
126
+ may provide additional or different license terms and conditions
127
+ for use, reproduction, or distribution of Your modifications, or
128
+ for any such Derivative Works as a whole, provided Your use,
129
+ reproduction, and distribution of the Work otherwise complies with
130
+ the conditions stated in this License.
131
+
132
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
133
+ any Contribution intentionally submitted for inclusion in the Work
134
+ by You to the Licensor shall be under the terms and conditions of
135
+ this License, without any additional terms or conditions.
136
+ Notwithstanding the above, nothing herein shall supersede or modify
137
+ the terms of any separate license agreement you may have executed
138
+ with Licensor regarding such Contributions.
139
+
140
+ 6. Trademarks. This License does not grant permission to use the trade
141
+ names, trademarks, service marks, or product names of the Licensor,
142
+ except as required for reasonable and customary use in describing the
143
+ origin of the Work and reproducing the content of the NOTICE file.
144
+
145
+ 7. Disclaimer of Warranty. Unless required by applicable law or
146
+ agreed to in writing, Licensor provides the Work (and each
147
+ Contributor provides its Contributions) on an "AS IS" BASIS,
148
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149
+ implied, including, without limitation, any warranties or conditions
150
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151
+ PARTICULAR PURPOSE. You are solely responsible for determining the
152
+ appropriateness of using or redistributing the Work and assume any
153
+ risks associated with Your exercise of permissions under this License.
154
+
155
+ 8. Limitation of Liability. In no event and under no legal theory,
156
+ whether in tort (including negligence), contract, or otherwise,
157
+ unless required by applicable law (such as deliberate and grossly
158
+ negligent acts) or agreed to in writing, shall any Contributor be
159
+ liable to You for damages, including any direct, indirect, special,
160
+ incidental, or consequential damages of any character arising as a
161
+ result of this License or out of the use or inability to use the
162
+ Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright [yyyy] [name of copyright owner]
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
PaddleMIX/ppdiffusers/Makefile ADDED
@@ -0,0 +1,30 @@
+
+ .DEFAULT_GOAL := all
+
+ .PHONY: all
+ all: deploy-version build deploy
+
+ .PHONY: build
+ build:
+ 	python3 setup.py sdist bdist_wheel
+
+ .PHONY: deploy
+ deploy:
+ 	make deploy-version
+ 	twine upload --skip-existing dist/*
+
+ .PHONY: deploy-version
+ deploy-version:
+ 	echo "VERSION = '$$(cat VERSION)'" > ppdiffusers/version.py
+
+ .PHONY: install
+ install:
+ 	pip install -r requirements.txt
+
+ .PHONY: version
+ version:
+ 	@newVersion=$$(awk -F. '{print $$1"."$$2"."$$3+1}' < VERSION) \
+ 	&& echo $${newVersion} > VERSION \
+ 	&& git add VERSION \
+ 	&& git commit -m "🔥 update version to $${newVersion}" > /dev/null \
+ 	&& echo "Bumped version to $${newVersion}"
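A short usage sketch of the Makefile targets defined above, run from the ppdiffusers directory; `twine` credentials are only needed for the `deploy` target, so it is omitted here.

```shell
cd PaddleMIX/ppdiffusers
make install    # pip install -r requirements.txt
make build      # build sdist + wheel via setup.py
make version    # bump VERSION and commit the change
```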
PaddleMIX/ppdiffusers/deploy-deprecated/export.md ADDED
@@ -0,0 +1,67 @@
+ # Diffusion Model Export Tutorial
+
+
+ [PPDiffusers](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers) is a diffusion model toolbox that supports cross-modal (e.g. image and speech) training and inference. It borrows the excellent design of the [Diffusers](https://github.com/huggingface/diffusers) library from the 🤗 Huggingface team and is built on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) framework and the [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) natural language processing library. The following describes how to export the pretrained models provided by PPDiffusers.
+
+ ### Model Export
+
+ ___Note: during model export the StableDiffusion model needs to be downloaded. To use the model and its weights you must accept the License it requires: visit the HuggingFace [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the License carefully, and then sign the agreement.___
+
+ ___Tips: Stable Diffusion is based on the following License: The CreativeML OpenRAIL M license is an Open RAIL M license, adapted from the work that BigScience and the RAIL Initiative are jointly carrying in the area of responsible AI licensing. See also the article about the BLOOM Open RAIL license on which this license is based.___
+
+ Run the following command to export the model.
+
+ ```shell
+ # Disable ppxformers; otherwise model export will fail
+ export USE_PPXFORMERS=False
+ python export_model.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 --output_path stable-diffusion-v1-5
+ ```
+ Note: the command above does not export a fixed-size model. A fixed-size exported model helps optimize inference performance but sacrifices some flexibility. To export a fixed-size model, specify the `--height` and `--width` arguments.
+
+ The exported model directory has the following structure:
+
+ ```shell
+ stable-diffusion-v1-5/
+ ├── model_index.json
+ ├── scheduler
+ │   └── scheduler_config.json
+ ├── tokenizer
+ │   ├── tokenizer_config.json
+ │   ├── merges.txt
+ │   ├── vocab.json
+ │   └── special_tokens_map.json
+ ├── text_encoder
+ │   ├── inference.pdiparams
+ │   ├── inference.pdiparams.info
+ │   └── inference.pdmodel
+ ├── unet
+ │   ├── inference.pdiparams
+ │   ├── inference.pdiparams.info
+ │   └── inference.pdmodel
+ ├── vae_decoder
+ │   ├── inference.pdiparams
+ │   ├── inference.pdiparams.info
+ │   └── inference.pdmodel
+ └── vae_encoder
+     ├── inference.pdiparams
+     ├── inference.pdiparams.info
+     └── inference.pdmodel
+ ```
+
+ #### Exporting the Inpaint Model
+
+ In addition to exporting models for the regular StableDiffusion text-to-image and image-to-image tasks, exporting the inpaint task model is also supported (note: this is not the legacy inpaint version). To export the inpaint model, run:
+
+ ```shell
+ python export_model.py --pretrained_model_name_or_path runwayml/stable-diffusion-inpainting --output_path stable-diffusion-v1-5-inpainting
+ ```
+
+ #### Argument Description
+
+ Description of the command-line arguments of `export_model.py`.
+
+ | Argument | Description |
+ |----------|--------------|
+ | <span style="display:inline-block;width: 230pt"> --pretrained_model_name_or_path </span> | The pretrained diffusion model provided by ppdiffusers. Default: "CompVis/stable-diffusion-v1-4". For more StableDiffusion pretrained models, see the [ppdiffusers model list](../README.md#ppdiffusers模型支持的权重). |
+ | --output_path | The directory of the exported model. |
+ | --sample | Whether the vae encoder output uses sample mode. Note: sample mode introduces randomness; default is False. |
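Building on the note about fixed-size export in the tutorial above, a hedged example command follows. The 512x512 size and the output directory name are only illustrative; the `--height`/`--width` flags are the ones documented above and defined in `export_model.py`.

```shell
# Hypothetical fixed-size export (512x512 output images), trading flexibility for speed.
export USE_PPXFORMERS=False
python export_model.py \
    --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \
    --output_path stable-diffusion-v1-5-static-512 \
    --height 512 --width 512
```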
PaddleMIX/ppdiffusers/deploy-deprecated/export.sh ADDED
@@ -0,0 +1,17 @@
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ export USE_PPXFORMERS=False
+ export CUDA_VISIBLE_DEVICES=1
+ python export_model.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 --output_path stable-diffusion-v1-5
PaddleMIX/ppdiffusers/deploy-deprecated/export_model.py ADDED
@@ -0,0 +1,201 @@
1
+ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+
18
+ # set USE_PPXFORMERS=False to avoid using ppxformers
19
+ os.environ["USE_PPXFORMERS"] = "False"
20
+ from pathlib import Path
21
+ from types import MethodType
22
+
23
+ import paddle
24
+
25
+ from ppdiffusers import (
26
+ FastDeployRuntimeModel,
27
+ FastDeployStableDiffusionInpaintPipeline,
28
+ FastDeployStableDiffusionMegaPipeline,
29
+ StableDiffusionPipeline,
30
+ UNet2DConditionModel,
31
+ )
32
+
33
+
34
+ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(
35
+ model_path: str,
36
+ output_path: str,
37
+ sample: bool = False,
38
+ height: int = None,
39
+ width: int = None,
40
+ ):
41
+ # specify unet model with unet pre_temb_act opt enabled.
42
+ unet_model = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=True, subfolder="unet")
43
+ pipeline = StableDiffusionPipeline.from_pretrained(
44
+ model_path, unet=unet_model, safety_checker=None, feature_extractor=None
45
+ )
46
+ output_path = Path(output_path)
47
+ # calculate latent's H and W
48
+ latent_height = height // 8 if height is not None else None
49
+ latent_width = width // 8 if width is not None else None
50
+ # get arguments
51
+ cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280
52
+ unet_channels = pipeline.unet.config.in_channels # 4 or 9
53
+ vae_in_channels = pipeline.vae.config.in_channels # 3
54
+ vae_latent_channels = pipeline.vae.config.latent_channels # 4
55
+ print(
56
+ f"cross_attention_dim: {cross_attention_dim}\n",
57
+ f"unet_in_channels: {unet_channels}\n",
58
+ f"vae_encoder_in_channels: {vae_in_channels}\n",
59
+ f"vae_decoder_latent_channels: {vae_latent_channels}",
60
+ )
61
+ # 1. Convert text_encoder
62
+ text_encoder = paddle.jit.to_static(
63
+ pipeline.text_encoder,
64
+ input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids
65
+ )
66
+ save_path = os.path.join(args.output_path, "text_encoder", "inference")
67
+ paddle.jit.save(text_encoder, save_path)
68
+ print(f"Save text_encoder model in {save_path} successfully.")
69
+ del pipeline.text_encoder
70
+
71
+ # 2. Convert unet
72
+ unet = paddle.jit.to_static(
73
+ pipeline.unet,
74
+ input_spec=[
75
+ paddle.static.InputSpec(
76
+ shape=[None, unet_channels, latent_height, latent_width],
77
+ dtype="float32",
78
+ name="sample",
79
+ ), # sample
80
+ paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep
81
+ paddle.static.InputSpec(
82
+ shape=[None, None, cross_attention_dim],
83
+ dtype="float32",
84
+ name="encoder_hidden_states",
85
+ ), # encoder_hidden_states
86
+ ],
87
+ )
88
+ save_path = os.path.join(args.output_path, "unet", "inference")
89
+ paddle.jit.save(unet, save_path)
90
+ print(f"Save unet model in {save_path} successfully.")
91
+ del pipeline.unet
92
+
93
+ def forward_vae_encoder_mode(self, z):
94
+ return self.encode(z, True).latent_dist.mode()
95
+
96
+ def forward_vae_encoder_sample(self, z):
97
+ return self.encode(z, True).latent_dist.sample()
98
+
99
+ # 3. Convert vae encoder
100
+ vae_encoder = pipeline.vae
101
+ if sample:
102
+ vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder)
103
+ else:
104
+ vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder)
105
+
106
+ vae_encoder = paddle.jit.to_static(
107
+ vae_encoder,
108
+ input_spec=[
109
+ paddle.static.InputSpec(
110
+ shape=[None, vae_in_channels, height, width],
111
+ dtype="float32",
112
+ name="sample", # N, C, H, W
113
+ ), # latent
114
+ ],
115
+ )
116
+ # Save vae_encoder in static graph model.
117
+ save_path = os.path.join(args.output_path, "vae_encoder", "inference")
118
+ paddle.jit.save(vae_encoder, save_path)
119
+ print(f"Save vae_encoder model in {save_path} successfully.")
120
+
121
+ # 4. Convert vae encoder
122
+ vae_decoder = pipeline.vae
123
+
124
+ def forward_vae_decoder(self, z):
125
+ return self.decode(z, True).sample
126
+
127
+ vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder)
128
+ vae_decoder = paddle.jit.to_static(
129
+ vae_decoder,
130
+ input_spec=[
131
+ paddle.static.InputSpec(
132
+ shape=[None, vae_latent_channels, latent_height, latent_width],
133
+ dtype="float32",
134
+ name="latent_sample",
135
+ ), # latent_sample
136
+ ],
137
+ )
138
+ # Save vae_decoder in static graph model.
139
+ save_path = os.path.join(args.output_path, "vae_decoder", "inference")
140
+ paddle.jit.save(vae_decoder, save_path)
141
+ print(f"Save vae_decoder model in {save_path} successfully.")
142
+ del pipeline.vae
143
+
144
+ if "inpainting" in model_path:
145
+ fd_pipe_cls = FastDeployStableDiffusionInpaintPipeline
146
+ else:
147
+ fd_pipe_cls = FastDeployStableDiffusionMegaPipeline
148
+
149
+ fastdeploy_pipeline = fd_pipe_cls(
150
+ vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"),
151
+ vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"),
152
+ text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"),
153
+ unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"),
154
+ tokenizer=pipeline.tokenizer,
155
+ scheduler=pipeline.scheduler,
156
+ safety_checker=None,
157
+ feature_extractor=None,
158
+ image_encoder=None,
159
+ requires_safety_checker=False,
160
+ )
161
+ fastdeploy_pipeline.save_pretrained(str(output_path))
162
+ print("FastDeploy pipeline saved to", output_path)
163
+
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser()
167
+
168
+ parser.add_argument(
169
+ "--pretrained_model_name_or_path",
170
+ type=str,
171
+ required=True,
172
+ help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).",
173
+ )
174
+ parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
175
+ parser.add_argument(
176
+ "--sample",
177
+ action="store_true",
178
+ default=False,
179
+ help="Export the vae encoder in mode or sample",
180
+ )
181
+ parser.add_argument(
182
+ "--height",
183
+ type=int,
184
+ default=None,
185
+ help="The height of output images. Default: None",
186
+ )
187
+ parser.add_argument(
188
+ "--width",
189
+ type=int,
190
+ default=None,
191
+ help="The width of output images. Default: None",
192
+ )
193
+ args = parser.parse_args()
194
+
195
+ convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(
196
+ args.pretrained_model_name_or_path,
197
+ args.output_path,
198
+ args.sample,
199
+ args.height,
200
+ args.width,
201
+ )
PaddleMIX/ppdiffusers/deploy-deprecated/gradio_demo.py ADDED
@@ -0,0 +1,683 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import os
16
+
17
+ import cv2
18
+ import fastdeploy as fd
19
+ import gradio as gr
20
+ import numpy as np
21
+ import paddle
22
+ from paddlenlp.trainer.argparser import strtobool
23
+ from PIL import Image
24
+
25
+ from ppdiffusers import FastDeployStableDiffusionMegaPipeline
26
+
27
+
28
+ def create_paddle_inference_runtime(
29
+ use_trt=False,
30
+ dynamic_shape=None,
31
+ use_fp16=False,
32
+ use_bf16=False,
33
+ device_id=0,
34
+ disable_paddle_trt_ops=[],
35
+ disable_paddle_pass=[],
36
+ paddle_stream=None,
37
+ workspace=None,
38
+ ):
39
+ assert not use_fp16 or not use_bf16, "use_fp16 and use_bf16 are mutually exclusive"
40
+ option = fd.RuntimeOption()
41
+ option.use_paddle_backend()
42
+ if device_id == -1:
43
+ option.use_cpu()
44
+ else:
45
+ option.use_gpu(device_id)
46
+ if paddle_stream is not None and use_trt:
47
+ option.set_external_raw_stream(paddle_stream)
48
+ for pass_name in disable_paddle_pass:
49
+ option.paddle_infer_option.delete_pass(pass_name)
50
+ if use_bf16:
51
+ option.paddle_infer_option.inference_precision = "bfloat16"
52
+ if use_trt:
53
+ option.paddle_infer_option.disable_trt_ops(disable_paddle_trt_ops)
54
+ option.paddle_infer_option.enable_trt = True
55
+ if workspace is not None:
56
+ option.set_trt_max_workspace_size(workspace)
57
+ if use_fp16:
58
+ option.trt_option.enable_fp16 = True
59
+ else:
60
+ # Note(zhoushunjie): These four passes don't support fp32 now.
61
+ # Remove this line of code in future.
62
+ only_fp16_passes = [
63
+ "trt_cross_multihead_matmul_fuse_pass",
64
+ "trt_flash_multihead_matmul_fuse_pass",
65
+ "preln_elementwise_groupnorm_act_pass",
66
+ "elementwise_groupnorm_act_pass",
67
+ ]
68
+ for curr_pass in only_fp16_passes:
69
+ option.paddle_infer_option.delete_pass(curr_pass)
70
+
71
+ # Need to enable collect shape
72
+ if dynamic_shape is not None:
73
+ option.paddle_infer_option.collect_trt_shape = True
74
+ for key, shape_dict in dynamic_shape.items():
75
+ option.trt_option.set_shape(
76
+ key,
77
+ shape_dict["min_shape"],
78
+ shape_dict.get("opt_shape", None),
79
+ shape_dict.get("max_shape", None),
80
+ )
81
+ return option
82
+
83
+
84
+ def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0):
85
+ option = fd.RuntimeOption()
86
+ option.use_trt_backend()
87
+ option.use_gpu(device_id)
88
+ if use_fp16:
89
+ option.enable_trt_fp16()
90
+ if workspace is not None:
91
+ option.set_trt_max_workspace_size(workspace)
92
+ if dynamic_shape is not None:
93
+ for key, shape_dict in dynamic_shape.items():
94
+ option.set_trt_input_shape(
95
+ key,
96
+ min_shape=shape_dict["min_shape"],
97
+ opt_shape=shape_dict.get("opt_shape", None),
98
+ max_shape=shape_dict.get("max_shape", None),
99
+ )
100
+ return option
101
+
102
+
103
+ def pipe_init(args):
104
+ paddle.set_device(f"gpu:{args.device_id}")
105
+ paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream
106
+ vae_in_channels = 4
107
+ text_encoder_max_length = 77
108
+ unet_max_length = text_encoder_max_length * 3 # lpw support max_length is 77x3
109
+ min_image_size = 384
110
+ max_image_size = 768
111
+ hidden_states = 1024 if args.is_sd2_0 else 768
112
+ unet_in_channels = 9 if args.task_name == "inpaint" else 4
113
+ bs = 2
114
+
115
+ text_encoder_dynamic_shape = {
116
+ "input_ids": {
117
+ "min_shape": [1, text_encoder_max_length],
118
+ "max_shape": [1, text_encoder_max_length],
119
+ "opt_shape": [1, text_encoder_max_length],
120
+ }
121
+ }
122
+
123
+ vae_encoder_dynamic_shape = {
124
+ "sample": {
125
+ "min_shape": [1, 3, min_image_size, min_image_size],
126
+ "max_shape": [1, 3, max_image_size, max_image_size],
127
+ "opt_shape": [1, 3, min_image_size, min_image_size],
128
+ }
129
+ }
130
+
131
+ vae_decoder_dynamic_shape = {
132
+ "latent_sample": {
133
+ "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8],
134
+ "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8],
135
+ "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8],
136
+ }
137
+ }
138
+
139
+ unet_dynamic_shape = {
140
+ "sample": {
141
+ "min_shape": [
142
+ 1,
143
+ unet_in_channels,
144
+ min_image_size // 8,
145
+ min_image_size // 8,
146
+ ],
147
+ "max_shape": [
148
+ bs,
149
+ unet_in_channels,
150
+ max_image_size // 8,
151
+ max_image_size // 8,
152
+ ],
153
+ "opt_shape": [
154
+ 2,
155
+ unet_in_channels,
156
+ min_image_size // 8,
157
+ min_image_size // 8,
158
+ ],
159
+ },
160
+ "timestep": {
161
+ "min_shape": [1],
162
+ "max_shape": [1],
163
+ "opt_shape": [1],
164
+ },
165
+ "encoder_hidden_states": {
166
+ "min_shape": [1, text_encoder_max_length, hidden_states],
167
+ "max_shape": [bs, unet_max_length, hidden_states],
168
+ "opt_shape": [2, text_encoder_max_length, hidden_states],
169
+ },
170
+ }
171
+ # 4. Init runtime
172
+ if args.backend == "tensorrt":
173
+ runtime_options = dict(
174
+ text_encoder=create_trt_runtime(
175
+ dynamic_shape=text_encoder_dynamic_shape,
176
+ use_fp16=args.use_fp16,
177
+ device_id=args.device_id,
178
+ ),
179
+ vae_encoder=create_trt_runtime(
180
+ dynamic_shape=vae_encoder_dynamic_shape,
181
+ use_fp16=args.use_fp16,
182
+ device_id=args.device_id,
183
+ ),
184
+ vae_decoder=create_trt_runtime(
185
+ dynamic_shape=vae_decoder_dynamic_shape,
186
+ use_fp16=args.use_fp16,
187
+ device_id=args.device_id,
188
+ ),
189
+ unet=create_trt_runtime(
190
+ dynamic_shape=unet_dynamic_shape,
191
+ use_fp16=args.use_fp16,
192
+ device_id=args.device_id,
193
+ ),
194
+ )
195
+ elif args.backend == "paddle" or args.backend == "paddle_tensorrt":
196
+ args.use_trt = args.backend == "paddle_tensorrt"
197
+ runtime_options = dict(
198
+ text_encoder=create_paddle_inference_runtime(
199
+ use_trt=args.use_trt,
200
+ dynamic_shape=text_encoder_dynamic_shape,
201
+ use_fp16=args.use_fp16,
202
+ use_bf16=args.use_bf16,
203
+ device_id=args.device_id,
204
+ disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"],
205
+ paddle_stream=paddle_stream,
206
+ ),
207
+ vae_encoder=create_paddle_inference_runtime(
208
+ use_trt=args.use_trt,
209
+ dynamic_shape=vae_encoder_dynamic_shape,
210
+ use_fp16=args.use_fp16,
211
+ use_bf16=args.use_bf16,
212
+ device_id=args.device_id,
213
+ paddle_stream=paddle_stream,
214
+ ),
215
+ vae_decoder=create_paddle_inference_runtime(
216
+ use_trt=args.use_trt,
217
+ dynamic_shape=vae_decoder_dynamic_shape,
218
+ use_fp16=args.use_fp16,
219
+ use_bf16=args.use_bf16,
220
+ device_id=args.device_id,
221
+ paddle_stream=paddle_stream,
222
+ ),
223
+ unet=create_paddle_inference_runtime(
224
+ use_trt=args.use_trt,
225
+ dynamic_shape=unet_dynamic_shape,
226
+ use_fp16=args.use_fp16,
227
+ use_bf16=args.use_bf16,
228
+ device_id=args.device_id,
229
+ paddle_stream=paddle_stream,
230
+ ),
231
+ )
232
+ pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained(
233
+ args.model_dir,
234
+ runtime_options=runtime_options,
235
+ )
236
+ pipe.set_progress_bar_config(disable=True)
237
+ return pipe
238
+
239
+
240
+ def parse_arguments():
241
+ parser = argparse.ArgumentParser()
242
+ parser.add_argument(
243
+ "--model_dir",
244
+ default="stable-diffusion-v1-5",
245
+ help="The model directory of diffusion_model.",
246
+ )
247
+ parser.add_argument(
248
+ "--task_name",
249
+ type=str,
250
+ default="text2img_img2img_inpaint_legacy",
251
+ choices=[
252
+ "text2img_img2img_inpaint_legacy",
253
+ "inpaint",
254
+ "controlnet_canny",
255
+ ],
256
+ help="The task can be one of [text2img_img2img_inpaint_legacy, inpaint, controlnet_canny]. ",
257
+ )
258
+ parser.add_argument(
259
+ "--backend",
260
+ type=str,
261
+ default="paddle",
262
+ # Note(zhoushunjie): Will support 'tensorrt' soon.
263
+ choices=["paddle", "paddle_tensorrt"],
264
+ help="The inference runtime backend of unet model and text encoder model.",
265
+ )
266
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
267
+ parser.add_argument("--use_bf16", type=strtobool, default=False, help="Wheter to use BF16 mode")
268
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id.")
269
+ parser.add_argument(
270
+ "--parse_prompt_type",
271
+ type=str,
272
+ default="lpw",
273
+ choices=[
274
+ "raw",
275
+ "lpw",
276
+ ],
277
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
278
+ )
279
+ parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?")
280
+ return parser.parse_args()
281
+
282
+
283
+ def get_canny_image(image):
284
+ if image is not None:
285
+ low_threshold = 100
286
+ high_threshold = 200
287
+ image = cv2.Canny(np.array(image), low_threshold, high_threshold)
288
+ image = image[:, :, None]
289
+ image = np.concatenate([image, image, image], axis=2)
290
+ return image
291
+
292
+
293
+ def infer(
294
+ taskname,
295
+ image,
296
+ mask,
297
+ prompt,
298
+ negative_prompt,
299
+ steps,
300
+ height,
301
+ width,
302
+ seed,
303
+ strength,
304
+ guidance_scale,
305
+ scheduler,
306
+ conditioning_scale,
307
+ ):
308
+ task_name = taskname
309
+ fd_pipe.change_scheduler(scheduler)
310
+
311
+ if int(seed) != -1:
312
+ generator = paddle.Generator("cuda").manual_seed(int(seed))
313
+ else:
314
+ generator = None
315
+
316
+ if image is not None:
317
+ if isinstance(image, dict):
318
+ image["image"] = cv2.resize(image["image"], (width, height))
319
+ image["mask"] = cv2.resize(image["mask"], (width, height))
320
+ else:
321
+ image = cv2.resize(image, (width, height))
322
+ if mask is not None:
323
+ mask = cv2.resize(mask, (width, height))
324
+
325
+ if task_name == "text2img":
326
+ images = fd_pipe.text2img(
327
+ prompt=prompt,
328
+ negative_prompt=negative_prompt,
329
+ num_inference_steps=steps,
330
+ height=height,
331
+ width=width,
332
+ guidance_scale=guidance_scale,
333
+ parse_prompt_type=parse_prompt_type,
334
+ infer_op_dict=infer_op_dict,
335
+ generator=generator,
336
+ )
337
+ elif task_name == "img2img":
338
+ images = fd_pipe.img2img(
339
+ prompt=prompt,
340
+ negative_prompt=negative_prompt,
341
+ image=Image.fromarray(np.array(image)).convert("RGB"),
342
+ num_inference_steps=steps,
343
+ height=height,
344
+ width=width,
345
+ strength=strength,
346
+ guidance_scale=guidance_scale,
347
+ parse_prompt_type=parse_prompt_type,
348
+ infer_op_dict=infer_op_dict,
349
+ generator=generator,
350
+ )
351
+ elif task_name == "inpaint_legacy":
352
+ if mask is not None:
353
+ mask_image = mask
354
+ else:
355
+ mask_image = image["mask"]
356
+ image = image["image"]
357
+ images = fd_pipe.inpaint_legacy(
358
+ prompt=prompt,
359
+ negative_prompt=negative_prompt,
360
+ image=Image.fromarray(np.array(image)).convert("RGB"),
361
+ mask_image=Image.fromarray(mask_image).convert("RGB"),
362
+ num_inference_steps=steps,
363
+ height=height,
364
+ width=width,
365
+ strength=strength,
366
+ guidance_scale=guidance_scale,
367
+ parse_prompt_type=parse_prompt_type,
368
+ infer_op_dict=infer_op_dict,
369
+ generator=generator,
370
+ )
371
+ elif task_name == "inpaint":
372
+ if mask is not None:
373
+ mask_image = mask
374
+ else:
375
+ mask_image = image["mask"]
376
+ image = image["image"]
377
+ images = fd_pipe.inpaint(
378
+ prompt=prompt,
379
+ negative_prompt=negative_prompt,
380
+ image=Image.fromarray(np.array(image)).convert("RGB"),
381
+ mask_image=Image.fromarray(mask_image).convert("RGB"),
382
+ num_inference_steps=steps,
383
+ height=height,
384
+ width=width,
385
+ strength=strength,
386
+ guidance_scale=guidance_scale,
387
+ parse_prompt_type=parse_prompt_type,
388
+ infer_op_dict=infer_op_dict,
389
+ generator=generator,
390
+ )
391
+
392
+ elif task_name == "controlnet_canny":
393
+ canny_image = Image.fromarray(mask)
394
+
395
+ images = fd_pipe.text2img(
396
+ prompt=prompt,
397
+ negative_prompt=negative_prompt,
398
+ num_inference_steps=steps,
399
+ height=height,
400
+ width=width,
401
+ guidance_scale=guidance_scale,
402
+ parse_prompt_type=parse_prompt_type,
403
+ controlnet_cond=canny_image,
404
+ controlnet_conditioning_scale=conditioning_scale,
405
+ infer_op_dict=infer_op_dict,
406
+ generator=generator,
407
+ )
408
+ else:
409
+ raise gr.Error(f"task error! {task_name} not found")
410
+
411
+ return images[0][0]
412
+
413
+
414
+ scheduler_choices = [
415
+ "pndm",
416
+ "lms",
417
+ "euler",
418
+ "euler-ancestral",
419
+ "preconfig-euler-ancestral",
420
+ "dpm-multi",
421
+ "dpm-single",
422
+ "unipc-multi",
423
+ "ddim",
424
+ "ddpm",
425
+ "deis-multi",
426
+ "heun",
427
+ "kdpm2-ancestral",
428
+ "kdpm2",
429
+ ]
430
+
431
+ # some param init
432
+ args = parse_arguments()
433
+ if "model_dir" and "task_name" in os.environ:
434
+ args.model_dir = os.environ["model_dir"]
435
+ args.task_name = os.environ["task_name"]
436
+
437
+ fd_pipe = pipe_init(args)
438
+ parse_prompt_type = args.parse_prompt_type
439
+ if args.backend == "paddle":
440
+ print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.")
441
+ infer_op_mode = "raw"
442
+ else:
443
+ infer_op_mode = "zero_copy_infer"
444
+ infer_op_dict = {
445
+ "vae_encoder": infer_op_mode,
446
+ "vae_decoder": infer_op_mode,
447
+ "text_encoder": infer_op_mode,
448
+ "unet": infer_op_mode,
449
+ }
450
+
451
+ with gr.Blocks() as demo:
452
+ gr.Markdown("# FastDeploy Stable Diffusion")
453
+ if args.task_name == "text2img_img2img_inpaint_legacy":
454
+ with gr.Tab("text2img"):
455
+ with gr.Row():
456
+ with gr.Column():
457
+ text2img_taskname = gr.State(value="text2img")
458
+ text2img_img = gr.State(value=None)
459
+ text2img_mask = gr.State(value=None)
460
+ text2img_prompt = gr.Textbox(label="正向描述词", lines=2)
461
+ text2img_negative_prompt = gr.Textbox(label="负向描述词", lines=2)
462
+ text2img_steps = gr.Slider(label="steps", minimum=1, maximum=60, step=1, value=20)
463
+ with gr.Row():
464
+ text2img_height = gr.Slider(label="height", minimum=384, maximum=768, step=8, value=512)
465
+ text2img_width = gr.Slider(label="width", minimum=384, maximum=768, step=8, value=512)
466
+ text2img_seed = gr.Textbox(label="seed", value="-1")
467
+ text2img_strength = gr.State(value=None)
468
+ text2img_guidance_scale = gr.Slider(
469
+ label="guidance_scale", minimum=1, maximum=30, step=0.5, value=7.5
470
+ )
471
+ text2img_scheduler = gr.Radio(label="采样方法", choices=scheduler_choices, value="ddim")
472
+ text2img_conditioning_scale = gr.State(value=None)
473
+ with gr.Column():
474
+ text2img_output = gr.Image(type="numpy", label="result")
475
+ text2img_button = gr.Button("生成")
476
+ text2img_button.click(
477
+ fn=infer,
478
+ inputs=[
479
+ text2img_taskname,
480
+ text2img_img,
481
+ text2img_mask,
482
+ text2img_prompt,
483
+ text2img_negative_prompt,
484
+ text2img_steps,
485
+ text2img_height,
486
+ text2img_width,
487
+ text2img_seed,
488
+ text2img_strength,
489
+ text2img_guidance_scale,
490
+ text2img_scheduler,
491
+ text2img_conditioning_scale,
492
+ ],
493
+ outputs=[text2img_output],
494
+ )
495
+
496
+ with gr.Tab("img2img"):
497
+ with gr.Row():
498
+ with gr.Column():
499
+ img2img_taskname = gr.State(value="img2img")
500
+ img2img_img = gr.Image(label="原图")
501
+ img2img_mask = gr.State(value=None)
502
+ img2img_prompt = gr.Textbox(label="请输入描述词", lines=2)
503
+ img2img_negative_prompt = gr.Textbox(label="负向描述词", lines=2)
504
+ img2img_steps = gr.Slider(label="steps", minimum=1, maximum=60, step=1, value=20)
505
+ with gr.Row():
506
+ img2img_height = gr.Slider(label="height", minimum=384, maximum=768, step=8, value=512)
507
+ img2img_width = gr.Slider(label="width", minimum=384, maximum=768, step=8, value=512)
508
+ img2img_seed = gr.Textbox(label="seed", value="-1")
509
+ img2img_strength = gr.Slider(
510
+ label="Denoising strength", minimum=0, maximum=1, step=0.01, value=0.75
511
+ )
512
+ img2img_guidance_scale = gr.Slider(
513
+ label="guidance_scale", minimum=1, maximum=30, step=0.5, value=7.5
514
+ )
515
+ img2img_scheduler = gr.Radio(label="采样方法", choices=scheduler_choices, value="ddim")
516
+ img2img_conditioning_scale = gr.State(value=None)
517
+ with gr.Column():
518
+ img2img_output = gr.Image(type="numpy", label="result")
519
+ img2img_button = gr.Button("生成")
520
+ img2img_button.click(
521
+ fn=infer,
522
+ inputs=[
523
+ img2img_taskname,
524
+ img2img_img,
525
+ img2img_mask,
526
+ img2img_prompt,
527
+ img2img_negative_prompt,
528
+ img2img_steps,
529
+ img2img_height,
530
+ img2img_width,
531
+ img2img_seed,
532
+ img2img_strength,
533
+ img2img_guidance_scale,
534
+ img2img_scheduler,
535
+ img2img_conditioning_scale,
536
+ ],
537
+ outputs=[img2img_output],
538
+ )
539
+
540
+ with gr.Tab("inpaint_legacy"):
541
+ with gr.Row():
542
+ with gr.Column():
543
+ inpaint_legacy_taskname = gr.State(value="inpaint_legacy")
544
+ inpaint_legacy_img = gr.ImageMask(label="传入原图并涂鸦mask")
545
+ inpaint_legacy_mask = gr.Image(label="重绘mask(可选,若不涂鸦则需要传入)", image_mode="L")
546
+ inpaint_legacy_prompt = gr.Textbox(label="请输入正向描述词", lines=2)
547
+ inpaint_legacy_negative_prompt = gr.Textbox(label="负向描述词", lines=2)
548
+ inpaint_legacy_steps = gr.Slider(label="steps", minimum=1, maximum=60, step=1, value=20)
549
+ with gr.Row():
550
+ inpaint_legacy_height = gr.Slider(label="height", minimum=384, maximum=768, step=8, value=512)
551
+ inpaint_legacy_width = gr.Slider(label="width", minimum=384, maximum=768, step=8, value=512)
552
+ inpaint_legacy_seed = gr.Textbox(label="seed", value="-1")
553
+ inpaint_legacy_strength = gr.Slider(
554
+ label="Denoising strength", minimum=0, maximum=1, step=0.01, value=0.75
555
+ )
556
+ inpaint_legacy_guidance_scale = gr.Slider(
557
+ label="guidance_scale", minimum=1, maximum=30, step=0.5, value=7.5
558
+ )
559
+ inpaint_legacy_scheduler = gr.Radio(label="采样方法", choices=scheduler_choices, value="ddim")
560
+ inpaint_legacy_conditioning_scale = gr.State(value=None)
561
+ with gr.Column():
562
+ inpaint_legacy_output = gr.Image(type="numpy", label="result")
563
+ inpaint_legacy_button = gr.Button("生成")
564
+ inpaint_legacy_button.click(
565
+ fn=infer,
566
+ inputs=[
567
+ inpaint_legacy_taskname,
568
+ inpaint_legacy_img,
569
+ inpaint_legacy_mask,
570
+ inpaint_legacy_prompt,
571
+ inpaint_legacy_negative_prompt,
572
+ inpaint_legacy_steps,
573
+ inpaint_legacy_height,
574
+ inpaint_legacy_width,
575
+ inpaint_legacy_seed,
576
+ inpaint_legacy_strength,
577
+ inpaint_legacy_guidance_scale,
578
+ inpaint_legacy_scheduler,
579
+ inpaint_legacy_conditioning_scale,
580
+ ],
581
+ outputs=[inpaint_legacy_output],
582
+ )
583
+
584
+ elif args.task_name == "inpaint":
585
+ with gr.Tab("inpaint"):
586
+ with gr.Row():
587
+ with gr.Column():
588
+ inpaint_taskname = gr.State(value="inpaint")
589
+ inpaint_img = gr.ImageMask(label="传入原图并涂鸦mask")
590
+ inpaint_mask = gr.Image(label="重绘mask(可选,若不涂鸦则需要传入)", image_mode="L")
591
+ inpaint_prompt = gr.Textbox(label="请输入正向描述词", lines=2)
592
+ inpaint_negative_prompt = gr.Textbox(label="负向描述词", lines=2)
593
+ inpaint_steps = gr.Slider(label="steps", minimum=1, maximum=60, step=1, value=20)
594
+ with gr.Row():
595
+ inpaint_height = gr.Slider(label="height", minimum=384, maximum=768, step=8, value=512)
596
+ inpaint_width = gr.Slider(label="width", minimum=384, maximum=768, step=8, value=512)
597
+ inpaint_seed = gr.Textbox(label="seed", value="-1")
598
+ inpaint_strength = gr.Slider(
599
+ label="Denoising strength", minimum=0, maximum=1, step=0.01, value=0.75
600
+ )
601
+ inpaint_guidance_scale = gr.Slider(
602
+ label="guidance_scale", minimum=1, maximum=30, step=0.5, value=7.5
603
+ )
604
+ inpaint_scheduler = gr.Radio(label="采样方法", choices=scheduler_choices, value="ddim")
605
+ inpaint_conditioning_scale = gr.State(value=None)
606
+ with gr.Column():
607
+ inpaint_output = gr.Image(type="numpy", label="result")
608
+ inpaint_button = gr.Button("生成")
609
+
610
+ inpaint_button.click(
611
+ fn=infer,
612
+ inputs=[
613
+ inpaint_taskname,
614
+ inpaint_img,
615
+ inpaint_mask,
616
+ inpaint_prompt,
617
+ inpaint_negative_prompt,
618
+ inpaint_steps,
619
+ inpaint_height,
620
+ inpaint_width,
621
+ inpaint_seed,
622
+ inpaint_strength,
623
+ inpaint_guidance_scale,
624
+ inpaint_scheduler,
625
+ inpaint_conditioning_scale,
626
+ ],
627
+ outputs=[inpaint_output],
628
+ )
629
+
630
+ elif args.task_name == "controlnet_canny":
631
+ with gr.Tab("controlnet_canny"):
632
+ with gr.Row():
633
+ with gr.Column():
634
+ controlnet_canny_taskname = gr.State(value="controlnet_canny")
635
+ controlnet_canny_img = gr.Image(label="canny参考图")
636
+ controlnet_canny_mask = gr.Image(label="canny图(可选传入)")
637
+ controlnet_canny_prompt = gr.Textbox(label="请输入正向描述词", lines=2)
638
+ controlnet_canny_negative_prompt = gr.Textbox(label="负向描述词", lines=2)
639
+ controlnet_canny_steps = gr.Slider(label="steps", minimum=1, maximum=60, step=1, value=20)
640
+ with gr.Row():
641
+ controlnet_canny_height = gr.Slider(
642
+ label="height", minimum=384, maximum=768, step=8, value=512
643
+ )
644
+ controlnet_canny_width = gr.Slider(label="width", minimum=384, maximum=768, step=8, value=512)
645
+ controlnet_canny_seed = gr.Textbox(label="seed", value="-1")
646
+ controlnet_canny_strength = gr.Slider(
647
+ label="Denoising strength", minimum=0, maximum=1, step=0.01, value=0.75
648
+ )
649
+ controlnet_canny_guidance_scale = gr.Slider(
650
+ label="guidance_scale", minimum=1, maximum=30, step=0.5, value=7.5
651
+ )
652
+ controlnet_canny_scheduler = gr.Radio(label="采样方法", choices=scheduler_choices, value="ddim")
653
+ controlnet_canny_conditioning_scale = gr.Slider(
654
+ label="conditioning_scale", minimum=0, maximum=2, step=0.05, value=1
655
+ )
656
+ with gr.Column():
657
+ controlnet_canny_output = gr.Image(type="numpy", label="result")
658
+ controlnet_canny_button = gr.Button("生成")
659
+ controlnet_canny_img.change(
660
+ fn=get_canny_image, inputs=[controlnet_canny_img], outputs=[controlnet_canny_mask]
661
+ )
662
+ controlnet_canny_button.click(
663
+ fn=infer,
664
+ inputs=[
665
+ controlnet_canny_taskname,
666
+ controlnet_canny_img,
667
+ controlnet_canny_mask,
668
+ controlnet_canny_prompt,
669
+ controlnet_canny_negative_prompt,
670
+ controlnet_canny_steps,
671
+ controlnet_canny_height,
672
+ controlnet_canny_width,
673
+ controlnet_canny_seed,
674
+ controlnet_canny_strength,
675
+ controlnet_canny_guidance_scale,
676
+ controlnet_canny_scheduler,
677
+ controlnet_canny_conditioning_scale,
678
+ ],
679
+ outputs=[controlnet_canny_output],
680
+ )
681
+
682
+ if __name__ == "__main__":
683
+ demo.launch(show_error=True)
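For reference, a hedged sketch of launching the demo defined above (the model directory is an assumed export location, not part of the commit):

# Hedged sketch: gradio_demo.py is configured through CLI flags, and it also reads
# `model_dir` / `task_name` from the environment when both are set.
import os
import subprocess

os.environ["model_dir"] = "./stable-diffusion-v1-5@fastdeploy"   # assumed export output
os.environ["task_name"] = "text2img_img2img_inpaint_legacy"
subprocess.run(["python", "gradio_demo.py", "--backend", "paddle"], check=True)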
PaddleMIX/ppdiffusers/deploy-deprecated/infer.py ADDED
@@ -0,0 +1,742 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+
19
+ # isort: split
20
+ import paddle
21
+
22
+ # isort: split
23
+ import fastdeploy as fd
24
+ import numpy as np
25
+ from paddlenlp.trainer.argparser import strtobool
26
+ from tqdm.auto import trange
27
+
28
+ from ppdiffusers import DiffusionPipeline, FastDeployStableDiffusionMegaPipeline
29
+ from ppdiffusers.utils import load_image
30
+
31
+
32
+ def parse_arguments():
33
+
34
+ parser = argparse.ArgumentParser()
35
+ parser.add_argument(
36
+ "--model_dir",
37
+ default="runwayml/stable-diffusion-v1-5@fastdeploy",
38
+ help="The model directory of diffusion_model.",
39
+ )
40
+ parser.add_argument(
41
+ "--inference_steps",
42
+ type=int,
43
+ default=50,
44
+ help="The number of unet inference steps.",
45
+ )
46
+ parser.add_argument(
47
+ "--benchmark_steps",
48
+ type=int,
49
+ default=1,
50
+ help="The number of performance benchmark steps.",
51
+ )
52
+ parser.add_argument(
53
+ "--backend",
54
+ type=str,
55
+ default="paddle_tensorrt",
56
+ # Note(zhoushunjie): Will support 'tensorrt' soon.
57
+ choices=["onnx_runtime", "paddle", "paddlelite", "paddle_tensorrt"],
58
+ help="The inference runtime backend of unet model and text encoder model.",
59
+ )
60
+ parser.add_argument(
61
+ "--device",
62
+ type=str,
63
+ default="gpu",
64
+ # Note(shentanyue): Will support more devices.
65
+ choices=[
66
+ "cpu",
67
+ "gpu",
68
+ "huawei_ascend_npu",
69
+ "kunlunxin_xpu",
70
+ ],
71
+ help="The inference runtime device of models.",
72
+ )
73
+ parser.add_argument(
74
+ "--task_name",
75
+ type=str,
76
+ default="text2img",
77
+ choices=[
78
+ "text2img",
79
+ "img2img",
80
+ "inpaint",
81
+ "inpaint_legacy",
82
+ "cycle_diffusion",
83
+ "hiresfix",
84
+ "mixture_tiling",
85
+ "all",
86
+ ],
87
+ help="The task can be one of [text2img, img2img, inpaint, inpaint_legacy, cycle_diffusion, hiresfix, mixture_tiling, all]. ",
88
+ )
89
+ parser.add_argument(
90
+ "--parse_prompt_type",
91
+ type=str,
92
+ default="lpw",
93
+ choices=[
94
+ "raw",
95
+ "lpw",
96
+ ],
97
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
98
+ )
99
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
100
+ parser.add_argument("--use_bf16", type=strtobool, default=False, help="Wheter to use BF16 mode")
101
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
102
+ parser.add_argument(
103
+ "--scheduler",
104
+ type=str,
105
+ default="preconfig-euler-ancestral",
106
+ choices=[
107
+ "pndm",
108
+ "lms",
109
+ "euler",
110
+ "euler-ancestral",
111
+ "preconfig-euler-ancestral",
112
+ "dpm-multi",
113
+ "dpm-single",
114
+ "unipc-multi",
115
+ "ddim",
116
+ "ddpm",
117
+ "deis-multi",
118
+ "heun",
119
+ "kdpm2-ancestral",
120
+ "kdpm2",
121
+ ],
122
+ help="The scheduler type of stable diffusion.",
123
+ )
124
+ parser.add_argument(
125
+ "--infer_op",
126
+ type=str,
127
+ default="zero_copy_infer",
128
+ choices=[
129
+ "zero_copy_infer",
130
+ "raw",
131
+ "all",
132
+ ],
133
+ help="The type of infer op.",
134
+ )
135
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
136
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
137
+ parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image")
138
+ parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image")
139
+ parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?")
140
+
141
+ return parser.parse_args()
142
+
143
+
144
+ def create_ort_runtime(device_id=0):
145
+ option = fd.RuntimeOption()
146
+ option.use_ort_backend()
147
+ if device_id == -1:
148
+ option.use_cpu()
149
+ else:
150
+ option.use_gpu(device_id)
151
+ return option
152
+
153
+
154
+ def create_paddle_inference_runtime(
155
+ use_trt=False,
156
+ dynamic_shape=None,
157
+ use_fp16=False,
158
+ use_bf16=False,
159
+ device_id=0,
160
+ disable_paddle_trt_ops=[],
161
+ disable_paddle_pass=[],
162
+ paddle_stream=None,
163
+ workspace=None,
164
+ ):
165
+ assert not use_fp16 or not use_bf16, "use_fp16 and use_bf16 are mutually exclusive"
166
+ option = fd.RuntimeOption()
167
+ option.use_paddle_backend()
168
+ if device_id == -1:
169
+ option.use_cpu()
170
+ else:
171
+ option.use_gpu(device_id)
172
+ if paddle_stream is not None and use_trt:
173
+ option.set_external_raw_stream(paddle_stream)
174
+ for pass_name in disable_paddle_pass:
175
+ option.paddle_infer_option.delete_pass(pass_name)
176
+ if use_bf16:
177
+ option.paddle_infer_option.inference_precision = "bfloat16"
178
+ if use_trt:
179
+ option.paddle_infer_option.disable_trt_ops(disable_paddle_trt_ops)
180
+ option.paddle_infer_option.enable_trt = True
181
+ if workspace is not None:
182
+ option.set_trt_max_workspace_size(workspace)
183
+ if use_fp16:
184
+ option.trt_option.enable_fp16 = True
185
+ else:
186
+ # Note(zhoushunjie): These four passes don't support fp32 now.
187
+ # Remove this line of code in future.
188
+ only_fp16_passes = [
189
+ "trt_cross_multihead_matmul_fuse_pass",
190
+ "trt_flash_multihead_matmul_fuse_pass",
191
+ "preln_elementwise_groupnorm_act_pass",
192
+ "elementwise_groupnorm_act_pass",
193
+ ]
194
+ for curr_pass in only_fp16_passes:
195
+ option.paddle_infer_option.delete_pass(curr_pass)
196
+
197
+ # Need to enable collect shape
198
+ if dynamic_shape is not None:
199
+ option.paddle_infer_option.collect_trt_shape = True
200
+ for key, shape_dict in dynamic_shape.items():
201
+ option.trt_option.set_shape(
202
+ key,
203
+ shape_dict["min_shape"],
204
+ shape_dict.get("opt_shape", None),
205
+ shape_dict.get("max_shape", None),
206
+ )
207
+ return option
208
+
209
+
210
+ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False):
211
+ option = fd.RuntimeOption()
212
+ option.use_paddle_lite_backend()
213
+ if device == "huawei_ascend_npu":
214
+ option.use_ascend()
215
+ option.set_lite_device_names(["huawei_ascend_npu"])
216
+ option.set_lite_context_properties(
217
+ "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format(
218
+ device_id
219
+ )
220
+ )
221
+ elif device == "kunlunxin_xpu":
222
+ # TODO(shentanyue): Add kunlunxin_xpu code
223
+ # https://github.com/PaddlePaddle/FastDeploy/blob/4c3e7030e151528d304619901c794481bb2f6037/examples/multimodal/stable_diffusion/infer.py#L178-L195
224
+ option.use_kunlunxin(
225
+ device_id,
226
+ l3_workspace_size=(64 * 1024 * 1024 - 4 * 1024),
227
+ locked=False,
228
+ autotune=False,
229
+ autotune_file="",
230
+ precision="int16",
231
+ adaptive_seqlen=True,
232
+ enable_multi_stream=True,
233
+ )
234
+ if use_fp16:
235
+ option.enable_lite_fp16()
236
+ else:
237
+ pass
238
+ return option
239
+
240
+
241
+ def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0):
242
+ option = fd.RuntimeOption()
243
+ option.use_trt_backend()
244
+ option.use_gpu(device_id)
245
+ if use_fp16:
246
+ option.enable_trt_fp16()
247
+ if workspace is not None:
248
+ option.set_trt_max_workspace_size(workspace)
249
+ if dynamic_shape is not None:
250
+ for key, shape_dict in dynamic_shape.items():
251
+ option.set_trt_input_shape(
252
+ key,
253
+ min_shape=shape_dict["min_shape"],
254
+ opt_shape=shape_dict.get("opt_shape", None),
255
+ max_shape=shape_dict.get("max_shape", None),
256
+ )
257
+ return option
258
+
259
+
260
+ def main(args):
261
+ if args.device_id == -1:
262
+ paddle.set_device("cpu")
263
+ paddle_stream = None
264
+ else:
265
+ paddle.set_device(f"gpu:{args.device_id}")
266
+ paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream
267
+
268
+ seed = 1024
269
+ vae_in_channels = 4
270
+ text_encoder_max_length = 77
271
+ unet_max_length = text_encoder_max_length * 3 # lpw support max_length is 77x3
272
+ min_image_size = 512
273
+ max_image_size = 768
274
+ max_image_size = max(min_image_size, max_image_size)
275
+ hidden_states = 1024 if args.is_sd2_0 else 768
276
+ unet_in_channels = 9 if args.task_name == "inpaint" else 4
277
+
278
+ if args.task_name == "cycle_diffusion":
279
+ bs = 4
280
+ min_image_size = max_image_size = 512
281
+ else:
282
+ bs = 2
283
+
284
+ text_encoder_dynamic_shape = {
285
+ "input_ids": {
286
+ "min_shape": [1, text_encoder_max_length],
287
+ "max_shape": [1, text_encoder_max_length],
288
+ "opt_shape": [1, text_encoder_max_length],
289
+ }
290
+ }
291
+
292
+ vae_encoder_dynamic_shape = {
293
+ "sample": {
294
+ "min_shape": [1, 3, min_image_size, min_image_size],
295
+ "max_shape": [1, 3, max_image_size, max_image_size],
296
+ "opt_shape": [1, 3, min_image_size, min_image_size],
297
+ }
298
+ }
299
+
300
+ vae_decoder_dynamic_shape = {
301
+ "latent_sample": {
302
+ "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8],
303
+ "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8],
304
+ "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8],
305
+ }
306
+ }
307
+
308
+ unet_dynamic_shape = {
309
+ "sample": {
310
+ "min_shape": [
311
+ 1,
312
+ unet_in_channels,
313
+ min_image_size // 8,
314
+ min_image_size // 8,
315
+ ],
316
+ "max_shape": [
317
+ bs,
318
+ unet_in_channels,
319
+ max_image_size // 8,
320
+ max_image_size // 8,
321
+ ],
322
+ "opt_shape": [
323
+ 2,
324
+ unet_in_channels,
325
+ min_image_size // 8,
326
+ min_image_size // 8,
327
+ ],
328
+ },
329
+ "timestep": {
330
+ "min_shape": [1],
331
+ "max_shape": [1],
332
+ "opt_shape": [1],
333
+ },
334
+ "encoder_hidden_states": {
335
+ "min_shape": [1, text_encoder_max_length, hidden_states],
336
+ "max_shape": [bs, unet_max_length, hidden_states],
337
+ "opt_shape": [2, text_encoder_max_length, hidden_states],
338
+ },
339
+ }
340
+ # 4. Init runtime
341
+ if args.backend == "onnx_runtime":
342
+ runtime_options = dict(
343
+ text_encoder=create_ort_runtime(device_id=args.device_id),
344
+ vae_encoder=create_ort_runtime(device_id=args.device_id),
345
+ vae_decoder=create_ort_runtime(device_id=args.device_id),
346
+ unet=create_ort_runtime(device_id=args.device_id),
347
+ )
348
+ elif args.backend == "paddlelite":
349
+ runtime_options = dict(
350
+ text_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False),
351
+ vae_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False),
352
+ vae_decoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False),
353
+ unet=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=args.use_fp16),
354
+ )
355
+ elif args.backend == "tensorrt":
356
+ runtime_options = dict(
357
+ text_encoder=create_trt_runtime(
358
+ dynamic_shape=text_encoder_dynamic_shape,
359
+ use_fp16=args.use_fp16,
360
+ device_id=args.device_id,
361
+ ),
362
+ vae_encoder=create_trt_runtime(
363
+ dynamic_shape=vae_encoder_dynamic_shape,
364
+ use_fp16=args.use_fp16,
365
+ device_id=args.device_id,
366
+ ),
367
+ vae_decoder=create_trt_runtime(
368
+ dynamic_shape=vae_decoder_dynamic_shape,
369
+ use_fp16=args.use_fp16,
370
+ device_id=args.device_id,
371
+ ),
372
+ unet=create_trt_runtime(
373
+ dynamic_shape=unet_dynamic_shape,
374
+ use_fp16=args.use_fp16,
375
+ device_id=args.device_id,
376
+ ),
377
+ )
378
+ elif args.backend == "paddle" or args.backend == "paddle_tensorrt":
379
+ args.use_trt = args.backend == "paddle_tensorrt"
380
+ runtime_options = dict(
381
+ text_encoder=create_paddle_inference_runtime(
382
+ use_trt=args.use_trt,
383
+ dynamic_shape=text_encoder_dynamic_shape,
384
+ use_fp16=args.use_fp16,
385
+ use_bf16=args.use_bf16,
386
+ device_id=args.device_id,
387
+ disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"],
388
+ paddle_stream=paddle_stream,
389
+ ),
390
+ vae_encoder=create_paddle_inference_runtime(
391
+ use_trt=args.use_trt,
392
+ dynamic_shape=vae_encoder_dynamic_shape,
393
+ use_fp16=args.use_fp16,
394
+ use_bf16=args.use_bf16,
395
+ device_id=args.device_id,
396
+ paddle_stream=paddle_stream,
397
+ ),
398
+ vae_decoder=create_paddle_inference_runtime(
399
+ use_trt=args.use_trt,
400
+ dynamic_shape=vae_decoder_dynamic_shape,
401
+ use_fp16=args.use_fp16,
402
+ use_bf16=args.use_bf16,
403
+ device_id=args.device_id,
404
+ paddle_stream=paddle_stream,
405
+ ),
406
+ unet=create_paddle_inference_runtime(
407
+ use_trt=args.use_trt,
408
+ dynamic_shape=unet_dynamic_shape,
409
+ use_fp16=args.use_fp16,
410
+ use_bf16=args.use_bf16,
411
+ device_id=args.device_id,
412
+ paddle_stream=paddle_stream,
413
+ ),
414
+ )
415
+ pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained(
416
+ args.model_dir,
417
+ runtime_options=runtime_options,
418
+ )
419
+ pipe.set_progress_bar_config(disable=True)
420
+ pipe.change_scheduler(args.scheduler)
421
+ parse_prompt_type = args.parse_prompt_type
422
+ width = args.width
423
+ height = args.height
424
+ hr_resize_width = args.hr_resize_width
425
+ hr_resize_height = args.hr_resize_height
426
+
427
+ if args.infer_op == "all":
428
+ infer_op_list = ["zero_copy_infer", "raw"]
429
+ else:
430
+ infer_op_list = [args.infer_op]
431
+ if args.device == "kunlunxin_xpu" or args.backend == "paddle":
432
+ print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.")
433
+ infer_op_list = ["raw"]
434
+
435
+ for infer_op in infer_op_list:
436
+ infer_op_dict = {
437
+ "vae_encoder": infer_op,
438
+ "vae_decoder": infer_op,
439
+ "text_encoder": infer_op,
440
+ "unet": infer_op,
441
+ }
442
+ folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32"
443
+ os.makedirs(folder, exist_ok=True)
444
+ if args.task_name in ["text2img", "all"]:
445
+ # text2img
446
+ prompt = "a photo of an astronaut riding a horse on mars"
447
+ time_costs = []
448
+ # warmup
449
+ pipe.text2img(
450
+ prompt,
451
+ num_inference_steps=10,
452
+ height=height,
453
+ width=width,
454
+ parse_prompt_type=parse_prompt_type,
455
+ infer_op_dict=infer_op_dict,
456
+ )
457
+ print("==> Test text2img performance.")
458
+ for step in trange(args.benchmark_steps):
459
+ start = time.time()
460
+ paddle.seed(seed)
461
+ images = pipe.text2img(
462
+ prompt,
463
+ num_inference_steps=args.inference_steps,
464
+ height=height,
465
+ width=width,
466
+ parse_prompt_type=parse_prompt_type,
467
+ infer_op_dict=infer_op_dict,
468
+ ).images
469
+ latency = time.time() - start
470
+ time_costs += [latency]
471
+ # print(f"No {step:3d} time cost: {latency:2f} s")
472
+ print(
473
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
474
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
475
+ )
476
+ images[0].save(f"{folder}/text2img.png")
477
+
478
+ if args.task_name in ["img2img", "all"]:
479
+ # img2img
480
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
481
+ init_image = load_image(img_url)
482
+ prompt = "A fantasy landscape, trending on artstation"
483
+ time_costs = []
484
+ # warmup
485
+ pipe.img2img(
486
+ prompt,
487
+ image=init_image,
488
+ num_inference_steps=20,
489
+ height=height,
490
+ width=width,
491
+ parse_prompt_type=parse_prompt_type,
492
+ infer_op_dict=infer_op_dict,
493
+ )
494
+ print("==> Test img2img performance.")
495
+ for step in trange(args.benchmark_steps):
496
+ start = time.time()
497
+ paddle.seed(seed)
498
+ images = pipe.img2img(
499
+ prompt,
500
+ image=init_image,
501
+ num_inference_steps=args.inference_steps,
502
+ height=height,
503
+ width=width,
504
+ parse_prompt_type=parse_prompt_type,
505
+ infer_op_dict=infer_op_dict,
506
+ ).images
507
+ latency = time.time() - start
508
+ time_costs += [latency]
509
+ # print(f"No {step:3d} time cost: {latency:2f} s")
510
+ print(
511
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
512
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
513
+ )
514
+ images[0].save(f"{folder}/img2img.png")
515
+
516
+ if args.task_name in ["inpaint", "inpaint_legacy", "all"]:
517
+ img_url = (
518
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
519
+ )
520
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
521
+ init_image = load_image(img_url)
522
+ mask_image = load_image(mask_url)
523
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
524
+ time_costs = []
525
+ # warmup
526
+ if args.task_name in ["inpaint_legacy", "all"]:
527
+ call_fn = pipe.inpaint_legacy
528
+ task_name = "inpaint_legacy"
529
+ else:
530
+ call_fn = pipe.inpaint
531
+ task_name = "inpaint"
532
+ call_fn(
533
+ prompt,
534
+ image=init_image,
535
+ mask_image=mask_image,
536
+ num_inference_steps=20,
537
+ height=height,
538
+ width=width,
539
+ parse_prompt_type=parse_prompt_type,
540
+ infer_op_dict=infer_op_dict,
541
+ )
542
+ print(f"==> Test {task_name} performance.")
543
+ for step in trange(args.benchmark_steps):
544
+ start = time.time()
545
+ paddle.seed(seed)
546
+ images = call_fn(
547
+ prompt,
548
+ image=init_image,
549
+ mask_image=mask_image,
550
+ num_inference_steps=args.inference_steps,
551
+ height=height,
552
+ width=width,
553
+ parse_prompt_type=parse_prompt_type,
554
+ infer_op_dict=infer_op_dict,
555
+ ).images
556
+ latency = time.time() - start
557
+ time_costs += [latency]
558
+ # print(f"No {step:3d} time cost: {latency:2f} s")
559
+ print(
560
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
561
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
562
+ )
563
+
564
+ images[0].save(f"{folder}/{task_name}.png")
565
+
566
+ if args.task_name in ["hiresfix", "all"]:
567
+ hiresfix_pipe = DiffusionPipeline.from_pretrained(
568
+ args.model_dir,
569
+ vae_encoder=pipe.vae_encoder,
570
+ vae_decoder=pipe.vae_decoder,
571
+ text_encoder=pipe.text_encoder,
572
+ tokenizer=pipe.tokenizer,
573
+ unet=pipe.unet,
574
+ scheduler=pipe.scheduler,
575
+ safety_checker=pipe.safety_checker,
576
+ feature_extractor=pipe.feature_extractor,
577
+ requires_safety_checker=pipe.requires_safety_checker,
578
+ custom_pipeline="pipeline_fastdeploy_stable_diffusion_hires_fix",
579
+ )
580
+ # custom_pipeline
581
+ # https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py
582
+ hiresfix_pipe._progress_bar_config = pipe._progress_bar_config
583
+ # hiresfix
584
+ prompt = "a photo of an astronaut riding a horse on mars"
585
+ time_costs = []
586
+ # warmup
587
+ hiresfix_pipe(
588
+ prompt,
589
+ height=height,
590
+ width=width,
591
+ num_inference_steps=20,
592
+ hires_ratio=0.5,
593
+ hr_resize_width=hr_resize_width,
594
+ hr_resize_height=hr_resize_height,
595
+ enable_hr=True,
596
+ parse_prompt_type=parse_prompt_type,
597
+ infer_op_dict=infer_op_dict,
598
+ )
599
+ print("==> Test hiresfix performance.")
600
+ for step in trange(args.benchmark_steps):
601
+ start = time.time()
602
+ paddle.seed(seed)
603
+ images = hiresfix_pipe(
604
+ prompt,
605
+ height=height,
606
+ width=width,
607
+ num_inference_steps=args.inference_steps,
608
+ hires_ratio=0.5,
609
+ hr_resize_width=hr_resize_width,
610
+ hr_resize_height=hr_resize_height,
611
+ enable_hr=True,
612
+ infer_op_dict=infer_op_dict,
613
+ ).images
614
+ latency = time.time() - start
615
+ time_costs += [latency]
616
+ # print(f"No {step:3d} time cost: {latency:2f} s")
617
+ print(
618
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
619
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
620
+ )
621
+ images[0].save(f"{folder}/hiresfix.png")
622
+
623
+ if args.task_name in ["cycle_diffusion"]:
624
+ pipe.change_scheduler("ddim")
625
+ image_url = (
626
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png"
627
+ )
628
+ init_image = load_image(image_url)
629
+ source_prompt = "An astronaut riding a horse"
630
+ prompt = "An astronaut riding an elephant"
631
+ time_costs = []
632
+ # warmup
633
+ pipe.cycle_diffusion(
634
+ prompt=prompt,
635
+ source_prompt=source_prompt,
636
+ image=init_image,
637
+ num_inference_steps=10,
638
+ eta=0.1,
639
+ strength=0.8,
640
+ guidance_scale=2,
641
+ source_guidance_scale=1,
642
+ height=height,
643
+ width=width,
644
+ parse_prompt_type=parse_prompt_type,
645
+ infer_op_dict=infer_op_dict,
646
+ ).images[0]
647
+ print("==> Test cycle diffusion performance.")
648
+ for step in trange(args.benchmark_steps):
649
+ start = time.time()
650
+ paddle.seed(seed)
651
+ images = pipe.cycle_diffusion(
652
+ prompt=prompt,
653
+ source_prompt=source_prompt,
654
+ image=init_image,
655
+ num_inference_steps=args.inference_steps,
656
+ eta=0.1,
657
+ strength=0.8,
658
+ guidance_scale=2,
659
+ source_guidance_scale=1,
660
+ height=height,
661
+ width=width,
662
+ parse_prompt_type=parse_prompt_type,
663
+ infer_op_dict=infer_op_dict,
664
+ ).images
665
+ latency = time.time() - start
666
+ time_costs += [latency]
667
+ # print(f"No {step:3d} time cost: {latency:2f} s")
668
+ print(
669
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
670
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
671
+ )
672
+ images[0].save(f"{folder}/cycle_diffusion.png")
673
+
674
+ if args.task_name in ["mixture_tiling"]:
675
+ mixture_tiling_pipe = DiffusionPipeline.from_pretrained(
676
+ args.model_dir,
677
+ vae_encoder=pipe.vae_encoder,
678
+ vae_decoder=pipe.vae_decoder,
679
+ text_encoder=pipe.text_encoder,
680
+ tokenizer=pipe.tokenizer,
681
+ unet=pipe.unet,
682
+ scheduler=pipe.scheduler,
683
+ safety_checker=pipe.safety_checker,
684
+ feature_extractor=pipe.feature_extractor,
685
+ requires_safety_checker=pipe.requires_safety_checker,
686
+ custom_pipeline="pipeline_fastdeploy_stable_diffusion_mixture_tiling",
687
+ )
688
+ # custom_pipeline
689
+ mixture_tiling_pipe._progress_bar_config = pipe._progress_bar_config
690
+ # mixture_tiling
691
+ time_costs = []
692
+ # warmup
693
+ mixture_tiling_pipe(
694
+ prompt=[
695
+ [
696
+ "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
697
+ # "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
698
+ # "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
699
+ ]
700
+ ],
701
+ tile_height=512,
702
+ tile_width=512,
703
+ tile_row_overlap=0,
704
+ tile_col_overlap=0,
705
+ guidance_scale=8,
706
+ seed=7178915308,
707
+ num_inference_steps=50,
708
+ infer_op_dict=None,
709
+ )
710
+ print("==> Test mixture tiling.")
711
+ for step in trange(args.benchmark_steps):
712
+ start = time.time()
713
+ images = mixture_tiling_pipe(
714
+ prompt=[
715
+ [
716
+ "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
717
+ # "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
718
+ # "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
719
+ ]
720
+ ],
721
+ tile_height=512,
722
+ tile_width=512,
723
+ tile_row_overlap=0,
724
+ tile_col_overlap=0,
725
+ guidance_scale=8,
726
+ seed=7178915308,
727
+ num_inference_steps=50,
728
+ infer_op_dict=None,
729
+ )["images"]
730
+ latency = time.time() - start
731
+ time_costs += [latency]
732
+ # print(f"No {step:3d} time cost: {latency:2f} s")
733
+ print(
734
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
735
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
736
+ )
737
+ images[0].save(f"{folder}/mixture_tiling.png")
738
+
739
+
740
+ if __name__ == "__main__":
741
+ args = parse_arguments()
742
+ main(args)
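For reference, a hedged sketch of driving the benchmark above programmatically (the overrides are assumptions; the defaults come from parse_arguments defined in this file):

# Hedged sketch: reuse parse_arguments() so the defaults declared above apply, then
# override a few fields before calling main().
import sys

import infer

sys.argv = ["infer.py"]              # parse only the defaults declared above
args = infer.parse_arguments()
args.task_name = "text2img"          # benchmark the text2img path only
args.benchmark_steps = 10            # assumed: more repeats for a stabler latency estimate
infer.main(args)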
PaddleMIX/ppdiffusers/deploy-deprecated/infer_dygraph.py ADDED
@@ -0,0 +1,380 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+ import warnings
19
+
20
+ import numpy as np
21
+ import paddle
22
+ from paddlenlp.trainer.argparser import strtobool
23
+ from paddlenlp.utils.log import logger
24
+ from tqdm.auto import trange
25
+
26
+ from ppdiffusers import DiffusionPipeline
27
+ from ppdiffusers.utils import load_image
28
+
29
+ logger.set_level("WARNING")
30
+
31
+
32
+ def parse_arguments():
33
+
34
+ parser = argparse.ArgumentParser()
35
+ parser.add_argument(
36
+ "--model_dir",
37
+ default="runwayml/stable-diffusion-v1-5",
38
+ help="The model directory of diffusion_model.",
39
+ )
40
+ parser.add_argument(
41
+ "--inference_steps",
42
+ type=int,
43
+ default=50,
44
+ help="The number of unet inference steps.",
45
+ )
46
+ parser.add_argument(
47
+ "--benchmark_steps",
48
+ type=int,
49
+ default=1,
50
+ help="The number of performance benchmark steps.",
51
+ )
52
+ parser.add_argument(
53
+ "--task_name",
54
+ type=str,
55
+ default="text2img",
56
+ choices=[
57
+ "text2img",
58
+ "img2img",
59
+ "inpaint",
60
+ "inpaint_legacy",
61
+ "cycle_diffusion",
62
+ "hiresfix",
63
+ "all",
64
+ ],
65
+ help="The task can be one of [text2img, img2img, inpaint, inpaint_legacy, cycle_diffusion, hiresfix, all]. ",
66
+ )
67
+ parser.add_argument(
68
+ "--parse_prompt_type",
69
+ type=str,
70
+ default="lpw",
71
+ choices=[
72
+ "raw",
73
+ "lpw",
74
+ ],
75
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
76
+ )
77
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
78
+ parser.add_argument(
79
+ "--attention_type",
80
+ type=str,
81
+ default="raw",
82
+ choices=["raw", "cutlass", "flash", "all"],
83
+ help="attention_type.",
84
+ )
85
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
86
+ parser.add_argument(
87
+ "--scheduler",
88
+ type=str,
89
+ default="euler-ancestral",
90
+ choices=[
91
+ "pndm",
92
+ "lms",
93
+ "euler",
94
+ "euler-ancestral",
95
+ "dpm-multi",
96
+ "dpm-single",
97
+ "unipc-multi",
98
+ "ddim",
99
+ "ddpm",
100
+ "deis-multi",
101
+ "heun",
102
+ "kdpm2-ancestral",
103
+ "kdpm2",
104
+ ],
105
+ help="The scheduler type of stable diffusion.",
106
+ )
107
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
108
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
109
+ parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image")
110
+ parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image")
111
+ return parser.parse_args()
112
+
113
+
114
+ def main(args):
115
+ if args.device_id == -1:
116
+ paddle.set_device("cpu")
117
+ else:
118
+ paddle.set_device(f"gpu:{args.device_id}")
119
+
120
+ seed = 1024
121
+ paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32
122
+ print(
123
+ os.path.join(os.path.abspath(os.path.join(os.getcwd(), "..")), "examples/community/stable_diffusion_mega.py")
124
+ )
125
+ pipe = DiffusionPipeline.from_pretrained(
126
+ args.model_dir,
127
+ safety_checker=None,
128
+ feature_extractor=None,
129
+ requires_safety_checker=False,
130
+ paddle_dtype=paddle_dtype,
131
+ custom_pipeline=os.path.join(
132
+ os.path.abspath(os.path.join(os.getcwd(), "..")), "examples/community/stable_diffusion_mega.py"
133
+ ),
134
+ )
135
+ pipe.set_progress_bar_config(disable=True)
136
+ pipe.change_scheduler(args.scheduler)
137
+ parse_prompt_type = args.parse_prompt_type
138
+ if args.attention_type == "all":
139
+ args.attention_type = ["raw", "cutlass", "flash"]
140
+ else:
141
+ args.attention_type = [args.attention_type]
142
+
143
+ for attention_type in args.attention_type:
144
+ if attention_type == "raw":
145
+ pipe.disable_xformers_memory_efficient_attention()
146
+ else:
147
+ try:
148
+ pipe.enable_xformers_memory_efficient_attention(attention_type)
149
+ except Exception as e:
150
+ if attention_type == "flash":
151
+ warnings.warn(
152
+ "Attention type flash is not supported on your GPU! We need to use 3060、3070、3080、3090、4060、4070、4080、4090、A30、A100 etc."
153
+ )
154
+ continue
155
+ else:
156
+ raise ValueError(e)
157
+
158
+ if not args.use_fp16 and attention_type == "flash":
159
+ print("Flash attention is not supported dtype=float32! Please use float16 or bfloat16. We will skip this!")
160
+ continue
161
+ width = args.width
162
+ height = args.height
163
+ hr_resize_width = args.hr_resize_width
164
+ hr_resize_height = args.hr_resize_height
165
+ folder = f"attn_{attention_type}_fp16" if args.use_fp16 else f"attn_{attention_type}_fp32"
166
+ os.makedirs(folder, exist_ok=True)
167
+ if args.task_name in ["text2img", "all"]:
168
+ # text2img
169
+ prompt = "a photo of an astronaut riding a horse on mars"
170
+ time_costs = []
171
+ # warmup
172
+ pipe.text2img(
173
+ prompt,
174
+ num_inference_steps=10,
175
+ height=height,
176
+ width=width,
177
+ parse_prompt_type=parse_prompt_type,
178
+ )
179
+ print("==> Test text2img performance.")
180
+ paddle.seed(seed)
181
+ for step in trange(args.benchmark_steps):
182
+ start = time.time()
183
+ images = pipe.text2img(
184
+ prompt,
185
+ num_inference_steps=args.inference_steps,
186
+ height=height,
187
+ width=width,
188
+ parse_prompt_type=parse_prompt_type,
189
+ ).images
190
+ latency = time.time() - start
191
+ time_costs += [latency]
192
+ # print(f"No {step:3d} time cost: {latency:2f} s")
193
+ print(
194
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
195
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
196
+ )
197
+ images[0].save(f"{folder}/text2img.png")
198
+
199
+ if args.task_name in ["img2img", "all"]:
200
+ # img2img
201
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
202
+ init_image = load_image(img_url)
203
+ prompt = "A fantasy landscape, trending on artstation"
204
+ time_costs = []
205
+ # warmup
206
+ pipe.img2img(
207
+ prompt,
208
+ image=init_image,
209
+ num_inference_steps=20,
210
+ height=height,
211
+ width=width,
212
+ parse_prompt_type=parse_prompt_type,
213
+ )
214
+ print("==> Test img2img performance.")
215
+ for step in trange(args.benchmark_steps):
216
+ start = time.time()
217
+ paddle.seed(seed)
218
+ images = pipe.img2img(
219
+ prompt,
220
+ image=init_image,
221
+ num_inference_steps=args.inference_steps,
222
+ height=height,
223
+ width=width,
224
+ parse_prompt_type=parse_prompt_type,
225
+ ).images
226
+ latency = time.time() - start
227
+ time_costs += [latency]
228
+ # print(f"No {step:3d} time cost: {latency:2f} s")
229
+ print(
230
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
231
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
232
+ )
233
+ images[0].save(f"{folder}/img2img.png")
234
+
235
+ if args.task_name in ["inpaint", "inpaint_legacy", "all"]:
236
+ img_url = (
237
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
238
+ )
239
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
240
+ init_image = load_image(img_url)
241
+ mask_image = load_image(mask_url)
242
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
243
+ time_costs = []
244
+ # warmup
245
+ if args.task_name in ["inpaint_legacy", "all"]:
246
+ call_fn = pipe.inpaint_legacy
247
+ task_name = "inpaint_legacy"
248
+ else:
249
+ call_fn = pipe.inpaint
250
+ task_name = args.task_name
251
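+ # The UNet's input channel count determines which variant actually runs: a 4-channel UNet is the base SD model (legacy inpainting), while a 9-channel UNet is the dedicated inpainting model.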
+ if pipe.unet.config.in_channels == 4:
252
+ task_name = "inpaint_legacy"
253
+ elif pipe.unet.config.in_channels == 9:
254
+ task_name = "inpaint"
255
+
256
+ call_fn(
257
+ prompt,
258
+ image=init_image,
259
+ mask_image=mask_image,
260
+ num_inference_steps=20,
261
+ height=height,
262
+ width=width,
263
+ parse_prompt_type=parse_prompt_type,
264
+ )
265
+ print(f"==> Test {task_name} performance.")
266
+ for step in trange(args.benchmark_steps):
267
+ start = time.time()
268
+ paddle.seed(seed)
269
+ images = call_fn(
270
+ prompt,
271
+ image=init_image,
272
+ mask_image=mask_image,
273
+ num_inference_steps=args.inference_steps,
274
+ height=height,
275
+ width=width,
276
+ parse_prompt_type=parse_prompt_type,
277
+ ).images
278
+ latency = time.time() - start
279
+ time_costs += [latency]
280
+ # print(f"No {step:3d} time cost: {latency:2f} s")
281
+ print(
282
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
283
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
284
+ )
285
+
286
+ images[0].save(f"{folder}/{task_name}.png")
287
+
288
+ if args.task_name in ["cycle_diffusion", "all"]:
289
+ pipe.change_scheduler("ddim")
290
+ image_url = (
291
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png"
292
+ )
293
+ init_image = load_image(image_url)
294
+ source_prompt = "An astronaut riding a horse"
295
+ prompt = "An astronaut riding an elephant"
296
+ time_costs = []
297
+ # warmup
298
+ pipe.cycle_diffusion(
299
+ prompt=prompt,
300
+ source_prompt=source_prompt,
301
+ image=init_image,
302
+ num_inference_steps=10,
303
+ eta=0.1,
304
+ strength=0.8,
305
+ guidance_scale=2,
306
+ source_guidance_scale=1,
307
+ height=height,
308
+ width=width,
309
+ parse_prompt_type=parse_prompt_type,
310
+ ).images[0]
311
+ print("==> Test cycle diffusion performance.")
312
+ for step in trange(args.benchmark_steps):
313
+ start = time.time()
314
+ paddle.seed(seed)
315
+ images = pipe.cycle_diffusion(
316
+ prompt=prompt,
317
+ source_prompt=source_prompt,
318
+ image=init_image,
319
+ num_inference_steps=args.inference_steps,
320
+ eta=0.1,
321
+ strength=0.8,
322
+ guidance_scale=2,
323
+ source_guidance_scale=1,
324
+ height=height,
325
+ width=width,
326
+ parse_prompt_type=parse_prompt_type,
327
+ ).images
328
+ latency = time.time() - start
329
+ time_costs += [latency]
330
+ # print(f"No {step:3d} time cost: {latency:2f} s")
331
+ print(
332
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
333
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
334
+ )
335
+ images[0].save(f"{folder}/cycle_diffusion.png")
336
+
337
+ if args.task_name in ["hiresfix", "all"]:
338
+ # hiresfix
339
+ prompt = "a photo of an astronaut riding a horse on mars"
340
+ time_costs = []
341
+ # warmup
342
+ pipe.hires_fix(
343
+ prompt,
344
+ height=height,
345
+ width=width,
346
+ num_inference_steps=20,
347
+ hires_ratio=0.5,
348
+ hr_resize_width=hr_resize_width,
349
+ hr_resize_height=hr_resize_height,
350
+ enable_hr=True,
351
+ parse_prompt_type=parse_prompt_type,
352
+ )
353
+ print("==> Test hiresfix performance.")
354
+ for step in trange(args.benchmark_steps):
355
+ start = time.time()
356
+ paddle.seed(seed)
357
+ images = pipe.hires_fix(
358
+ prompt,
359
+ height=height,
360
+ width=width,
361
+ num_inference_steps=args.inference_steps,
362
+ hires_ratio=0.5,
363
+ hr_resize_width=hr_resize_width,
364
+ hr_resize_height=hr_resize_height,
365
+ enable_hr=True,
366
+ parse_prompt_type=parse_prompt_type,
367
+ ).images
368
+ latency = time.time() - start
369
+ time_costs += [latency]
370
+ # print(f"No {step:3d} time cost: {latency:2f} s")
371
+ print(
372
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
373
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
374
+ )
375
+ images[0].save(f"{folder}/hiresfix.png")
376
+
377
+
378
+ if __name__ == "__main__":
379
+ args = parse_arguments()
380
+ main(args)
PaddleMIX/ppdiffusers/deploy-deprecated/infer_dygraph_torch.py ADDED
@@ -0,0 +1,447 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import time
18
+
19
+ import torch
20
+
21
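+ # Stash torch's native scaled_dot_product_attention and remove it so diffusers does not pick the SDP attention path by default; it is restored below when attention_type == "sdp" is benchmarked.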
+ torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention
22
+ delattr(torch.nn.functional, "scaled_dot_product_attention")
23
+ import numpy as np
24
+ from diffusers import (
25
+ CycleDiffusionPipeline,
26
+ DDIMScheduler,
27
+ DDPMScheduler,
28
+ DEISMultistepScheduler,
29
+ DiffusionPipeline,
30
+ DPMSolverMultistepScheduler,
31
+ DPMSolverSinglestepScheduler,
32
+ EulerAncestralDiscreteScheduler,
33
+ EulerDiscreteScheduler,
34
+ HeunDiscreteScheduler,
35
+ KDPM2AncestralDiscreteScheduler,
36
+ KDPM2DiscreteScheduler,
37
+ LMSDiscreteScheduler,
38
+ PNDMScheduler,
39
+ UniPCMultistepScheduler,
40
+ )
41
+ from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0
42
+ from diffusers.utils import load_image
43
+ from tqdm.auto import trange
44
+
45
+
46
+ def strtobool(v):
47
+ if isinstance(v, bool):
48
+ return v
49
+ if v.lower() in ("yes", "true", "t", "y", "1"):
50
+ return True
51
+ elif v.lower() in ("no", "false", "f", "n", "0"):
52
+ return False
53
+ else:
54
+ raise ValueError(
55
+ f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
56
+ )
57
+
58
+
59
+ def change_scheduler(self, scheduler_type="ddim"):
60
+ self.original_scheduler_config = self.scheduler.config
61
+ scheduler_type = scheduler_type.lower()
62
+ if scheduler_type == "pndm":
63
+ scheduler = PNDMScheduler.from_config(self.original_scheduler_config, skip_prk_steps=True)
64
+ elif scheduler_type == "lms":
65
+ scheduler = LMSDiscreteScheduler.from_config(self.original_scheduler_config)
66
+ elif scheduler_type == "heun":
67
+ scheduler = HeunDiscreteScheduler.from_config(self.original_scheduler_config)
68
+ elif scheduler_type == "euler":
69
+ scheduler = EulerDiscreteScheduler.from_config(self.original_scheduler_config)
70
+ elif scheduler_type == "euler-ancestral":
71
+ scheduler = EulerAncestralDiscreteScheduler.from_config(self.original_scheduler_config)
72
+ elif scheduler_type == "dpm-multi":
73
+ scheduler = DPMSolverMultistepScheduler.from_config(self.original_scheduler_config)
74
+ elif scheduler_type == "dpm-single":
75
+ scheduler = DPMSolverSinglestepScheduler.from_config(self.original_scheduler_config)
76
+ elif scheduler_type == "kdpm2-ancestral":
77
+ scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.original_scheduler_config)
78
+ elif scheduler_type == "kdpm2":
79
+ scheduler = KDPM2DiscreteScheduler.from_config(self.original_scheduler_config)
80
+ elif scheduler_type == "unipc-multi":
81
+ scheduler = UniPCMultistepScheduler.from_config(self.original_scheduler_config)
82
+ elif scheduler_type == "ddim":
83
+ scheduler = DDIMScheduler.from_config(
84
+ self.original_scheduler_config,
85
+ steps_offset=1,
86
+ clip_sample=False,
87
+ set_alpha_to_one=False,
88
+ )
89
+ elif scheduler_type == "ddpm":
90
+ scheduler = DDPMScheduler.from_config(
91
+ self.original_scheduler_config,
92
+ )
93
+ elif scheduler_type == "deis-multi":
94
+ scheduler = DEISMultistepScheduler.from_config(
95
+ self.original_scheduler_config,
96
+ )
97
+ else:
98
+ raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
99
+ return scheduler
100
+
101
+
102
+ def parse_arguments():
103
+
104
+ parser = argparse.ArgumentParser()
105
+ parser.add_argument(
106
+ "--pretrained_model_name_or_path",
107
+ default="runwayml/stable-diffusion-v1-5",
108
+ help="The model directory of diffusion_model.",
109
+ )
110
+ parser.add_argument(
111
+ "--inference_steps",
112
+ type=int,
113
+ default=50,
114
+ help="The number of unet inference steps.",
115
+ )
116
+ parser.add_argument(
117
+ "--benchmark_steps",
118
+ type=int,
119
+ default=10,
120
+ help="The number of performance benchmark steps.",
121
+ )
122
+ parser.add_argument(
123
+ "--task_name",
124
+ type=str,
125
+ default="all",
126
+ choices=[
127
+ "text2img",
128
+ "img2img",
129
+ "inpaint",
130
+ "inpaint_legacy",
131
+ "cycle_diffusion",
132
+ "all",
133
+ ],
134
+ help="The task can be one of [text2img, img2img, inpaint, inpaint_legacy, cycle_diffusion, hiresfix, all]. ",
135
+ )
136
+ parser.add_argument(
137
+ "--parse_prompt_type",
138
+ type=str,
139
+ default="raw",
140
+ choices=[
141
+ "raw",
142
+ "lpw",
143
+ ],
144
+ help="The parse_prompt_type can be one of [raw, lpw]. ",
145
+ )
146
+ parser.add_argument(
147
+ "--channels_last",
148
+ type=strtobool,
149
+ default=False,
150
+ help="Wheter to use channels_last",
151
+ )
152
+ parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
153
+ parser.add_argument("--tf32", type=strtobool, default=True, help="tf32")
154
+ parser.add_argument("--compile", type=strtobool, default=False, help="compile")
155
+ parser.add_argument(
156
+ "--attention_type",
157
+ type=str,
158
+ default="sdp",
159
+ choices=[
160
+ "raw",
161
+ "sdp",
162
+ ],
163
+ help="attention_type.",
164
+ )
165
+ parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
166
+ parser.add_argument(
167
+ "--scheduler",
168
+ type=str,
169
+ default="euler-ancestral",
170
+ choices=[
171
+ "pndm",
172
+ "lms",
173
+ "euler",
174
+ "euler-ancestral",
175
+ "dpm-multi",
176
+ "dpm-single",
177
+ "unipc-multi",
178
+ "ddim",
179
+ "ddpm",
180
+ "deis-multi",
181
+ "heun",
182
+ "kdpm2-ancestral",
183
+ "kdpm2",
184
+ ],
185
+ help="The scheduler type of stable diffusion.",
186
+ )
187
+ parser.add_argument("--height", type=int, default=512, help="Height of input image")
188
+ parser.add_argument("--width", type=int, default=512, help="Width of input image")
189
+ return parser.parse_args()
190
+
191
+
192
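+ # Standalone copies of diffusers' attn_processors / set_attn_processor helpers, so the benchmark can swap attention processors on the UNet and VAE regardless of the installed diffusers version.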
+ def attn_processors(self):
193
+ processors = {}
194
+
195
+ def fn_recursive_add_processors(name: str, module, processors):
196
+ if hasattr(module, "set_processor"):
197
+ processors[f"{name}.processor"] = module.processor
198
+
199
+ for sub_name, child in module.named_children():
200
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
201
+
202
+ return processors
203
+
204
+ for name, module in self.named_children():
205
+ fn_recursive_add_processors(name, module, processors)
206
+
207
+ return processors
208
+
209
+
210
+ def set_attn_processor(self, processor):
211
+ count = len(attn_processors(self).keys())
212
+
213
+ if isinstance(processor, dict) and len(processor) != count:
214
+ raise ValueError(
215
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
216
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
217
+ )
218
+
219
+ def fn_recursive_attn_processor(name: str, module, processor):
220
+ if hasattr(module, "set_processor"):
221
+ if not isinstance(processor, dict):
222
+ module.set_processor(processor)
223
+ else:
224
+ module.set_processor(processor.pop(f"{name}.processor"))
225
+
226
+ for sub_name, child in module.named_children():
227
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
228
+
229
+ for name, module in self.named_children():
230
+ fn_recursive_attn_processor(name, module, processor)
231
+
232
+
233
+ def main(args):
234
+ if args.tf32:
235
+ torch.backends.cuda.matmul.allow_tf32 = True
236
+ else:
237
+ torch.backends.cuda.matmul.allow_tf32 = False
238
+
239
+ seed = 1024
240
+ torch_dtype = torch.float16 if args.use_fp16 else torch.float32
241
+ pipe = DiffusionPipeline.from_pretrained(
242
+ args.pretrained_model_name_or_path,
243
+ safety_checker=None,
244
+ feature_extractor=None,
245
+ requires_safety_checker=False,
246
+ torch_dtype=torch_dtype,
247
+ custom_pipeline="stable_diffusion_mega" if args.parse_prompt_type == "raw" else "lpw_stable_diffusion",
248
+ )
249
+ scheduler = change_scheduler(pipe, args.scheduler)
250
+ pipe.scheduler = scheduler
251
+ if args.device_id >= 0:
252
+ pipe.to(f"cuda:{args.device_id}")
253
+
254
+ if args.attention_type == "all":
255
+ args.attention_type = ["raw", "sdp"]
256
+ else:
257
+ args.attention_type = [args.attention_type]
258
+
259
+ for attention_type in args.attention_type:
260
+ attn_processor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0
261
+ if attention_type == "sdp":
262
+ torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_
263
+
264
+ set_attn_processor(pipe.unet, attn_processor_cls())
265
+ set_attn_processor(pipe.vae, attn_processor_cls())
266
+ if args.channels_last:
267
+ pipe.unet.to(memory_format=torch.channels_last)
268
+
269
+ if args.compile:
270
+ print("Run torch compile")
271
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
272
+
273
+ width = args.width
274
+ height = args.height
275
+ pipe.set_progress_bar_config(disable=True)
276
+
277
+ folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32"
278
+ os.makedirs(folder, exist_ok=True)
279
+ if args.task_name in ["text2img", "all"]:
280
+ # text2img
281
+ prompt = "a photo of an astronaut riding a horse on mars"
282
+ time_costs = []
283
+ # warmup
284
+ pipe.text2img(
285
+ prompt,
286
+ num_inference_steps=10,
287
+ height=height,
288
+ width=width,
289
+ )
290
+ print("==> Test text2img performance.")
291
+ torch.cuda.manual_seed(seed)
292
+ for step in trange(args.benchmark_steps):
293
+ start = time.time()
294
+ images = pipe.text2img(
295
+ prompt,
296
+ num_inference_steps=args.inference_steps,
297
+ height=height,
298
+ width=width,
299
+ ).images
300
+ latency = time.time() - start
301
+ time_costs += [latency]
302
+ # print(f"No {step:3d} time cost: {latency:2f} s")
303
+ print(
304
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
305
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
306
+ )
307
+ images[0].save(f"{folder}/text2img.png")
308
+
309
+ if args.task_name in ["img2img", "all"]:
310
+ # img2img
311
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
312
+ init_image = load_image(img_url).resize((width, height))
313
+ prompt = "A fantasy landscape, trending on artstation"
314
+ time_costs = []
315
+ # warmup
316
+ pipe.img2img(
317
+ prompt,
318
+ image=init_image,
319
+ num_inference_steps=20,
320
+ height=height,
321
+ width=width,
322
+ )
323
+ print("==> Test img2img performance.")
324
+ for step in trange(args.benchmark_steps):
325
+ start = time.time()
326
+ torch.cuda.manual_seed(seed)
327
+ images = pipe.img2img(
328
+ prompt,
329
+ image=init_image,
330
+ num_inference_steps=args.inference_steps,
331
+ height=height,
332
+ width=width,
333
+ ).images
334
+ latency = time.time() - start
335
+ time_costs += [latency]
336
+ # print(f"No {step:3d} time cost: {latency:2f} s")
337
+ print(
338
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
339
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
340
+ )
341
+ images[0].save(f"{folder}/img2img.png")
342
+
343
+ if args.task_name in ["inpaint", "inpaint_legacy", "all"]:
344
+ img_url = (
345
+ "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
346
+ )
347
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
348
+ init_image = load_image(img_url).resize((width, height))
349
+ mask_image = load_image(mask_url).resize((width, height))
350
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
351
+ time_costs = []
352
+ # warmup
353
+ if args.task_name in ["inpaint_legacy", "all"]:
354
+ call_fn = pipe.inpaint
355
+ task_name = "inpaint_legacy"
356
+ else:
357
+ call_fn = pipe.inpaint
358
+ task_name = args.task_name
359
+ if pipe.unet.config.in_channels == 4:
360
+ task_name = "inpaint_legacy"
361
+ elif pipe.unet.config.in_channels == 9:
362
+ task_name = "inpaint"
363
+
364
+ call_fn(
365
+ prompt,
366
+ image=init_image,
367
+ mask_image=mask_image,
368
+ num_inference_steps=20,
369
+ )
370
+ print(f"==> Test {task_name} performance.")
371
+ for step in trange(args.benchmark_steps):
372
+ start = time.time()
373
+ torch.cuda.manual_seed(seed)
374
+ images = call_fn(
375
+ prompt,
376
+ image=init_image,
377
+ mask_image=mask_image,
378
+ num_inference_steps=args.inference_steps,
379
+ ).images
380
+ latency = time.time() - start
381
+ time_costs += [latency]
382
+ # print(f"No {step:3d} time cost: {latency:2f} s")
383
+ print(
384
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
385
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
386
+ )
387
+
388
+ images[0].save(f"{folder}/{task_name}.png")
389
+
390
+ if args.task_name in ["cycle_diffusion", "all"]:
391
+ # needs a fix for diffusers==0.17.1, self.unet return_dict=False!
392
+ cycle_pipe = CycleDiffusionPipeline(
393
+ vae=pipe.vae,
394
+ text_encoder=pipe.text_encoder,
395
+ tokenizer=pipe.tokenizer,
396
+ unet=pipe.unet,
397
+ scheduler=scheduler,
398
+ safety_checker=None,
399
+ feature_extractor=None,
400
+ requires_safety_checker=False,
401
+ )
402
+ cycle_pipe.set_progress_bar_config(disable=True)
403
+ scheduler = change_scheduler(cycle_pipe, "ddim")
404
+ cycle_pipe.scheduler = scheduler
405
+ image_url = "ride_on_horse.png"
406
+ init_image = load_image(image_url).resize((width, height))
407
+ source_prompt = "An astronaut riding a horse"
408
+ prompt = "An astronaut riding an elephant"
409
+ time_costs = []
410
+ # warmup
411
+ cycle_pipe(
412
+ prompt=prompt,
413
+ source_prompt=source_prompt,
414
+ image=init_image,
415
+ num_inference_steps=10,
416
+ eta=0.1,
417
+ strength=0.8,
418
+ guidance_scale=2,
419
+ source_guidance_scale=1,
420
+ ).images[0]
421
+ print("==> Test cycle diffusion performance.")
422
+ for step in trange(args.benchmark_steps):
423
+ start = time.time()
424
+ torch.cuda.manual_seed(seed)
425
+ images = cycle_pipe(
426
+ prompt=prompt,
427
+ source_prompt=source_prompt,
428
+ image=init_image,
429
+ num_inference_steps=args.inference_steps,
430
+ eta=0.1,
431
+ strength=0.8,
432
+ guidance_scale=2,
433
+ source_guidance_scale=1,
434
+ ).images
435
+ latency = time.time() - start
436
+ time_costs += [latency]
437
+ # print(f"No {step:3d} time cost: {latency:2f} s")
438
+ print(
439
+ f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
440
+ f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
441
+ )
442
+ images[0].save(f"{folder}/cycle_diffusion.png")
443
+
444
+
445
+ if __name__ == "__main__":
446
+ args = parse_arguments()
447
+ main(args)
PaddleMIX/ppdiffusers/deploy-deprecated/requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ ppdiffusers>=0.16.3
2
+ ligo-segments
PaddleMIX/ppdiffusers/deploy/README.md ADDED
@@ -0,0 +1,65 @@
+ # PPDiffusers Inference Deployment
+
+ Built on Paddle Inference, PPDiffusers provides deployment solutions for the following key diffusion models:
+ - ControlNet
+ - IP-Adapter-SD15
+ - IP-Adapter-SDXL
+ - SD15
+ - SDXL
+
+
+ # V100 Performance Data
+ |Model|Paddle Deploy TensorRT / ips|Torch Dynamic / ips|
+ |-|-|-|
+ |IP-Adapter-SD15 text2img|18.30|18.18|
+ |IP-Adapter-SD15 img2img|18.11|17.87|
+ |IP-Adapter-SD15 inpaint|17.93|17.44|
+ |IP-Adapter-SDXL text2img|12.01|11.47|
+ |IP-Adapter-SDXL img2img|12.00|10.95|
+ |IP-Adapter-SDXL inpaint|11.67|10.79|
+ |SD15 text2img|19.68|18.27|
+ |SD15 img2img|19.68|17.90|
+ |SD15 inpaint|19.44|17.56|
+ |SDXL text2img|13.91|11.50|
+ |SDXL img2img|13.86|11.60|
+ |SDXL inpaint|13.45|11.28|
+
+ <!-- |SD15 text2img|11.87|6.68|6.32|
+ |SD15 img2img|14.47|8.09|7.63|
+ |SD15 inpaint|14.30|6.42|6.06| -->
+
+ > Note:
+ > Test environment/configuration: Paddle 3.0 beta, single V100 32G GPU, FP16.
+ > Inference parameters: Image Width = 512, Image Height = 512, Num Inference Steps = 50.
+
+ # A100 Performance Data
+ |Model|Paddle Deploy TensorRT / ips|Torch Dynamic / ips|
+ |-|-|-|
+ |IP-Adapter-SD15 text2img|38.52|32.75|
+ |IP-Adapter-SD15 img2img|37.91|32.50|
+ |IP-Adapter-SD15 inpaint|37.80|31.78|
+ |IP-Adapter-SDXL text2img|22.88|17.26|
+ |IP-Adapter-SDXL img2img|22.79|17.24|
+ |IP-Adapter-SDXL inpaint|22.30|17.06|
+ |SD15 text2img|47.22|33.74|
+ |SD15 img2img|46.59|32.96|
+ |SD15 inpaint|46.05|32.14|
+ |SDXL text2img|31.98|17.73|
+ |SDXL img2img|31.80|17.40|
+ |SDXL inpaint|30.58|16.98|
+
+ <!-- |SD15 text2img|26.37|10.49||
+ |SD15 img2img|30.81|12.70||
+ |SD15 inpaint|30.55|9.67|| -->
+
+ > Note: Test environment/configuration: Paddle 3.0 beta, single A100 80G GPU, FP16.
+ > Inference parameters: Image Width = 512, Image Height = 512, Num Inference Steps = 50.
+
+ <!-- |SDXL text2img||||
+ |SDXL img2img||||
+ |SDXL inpaint|||| -->
+
+ <!-- |-|-|-|-|
+ |ControlNet text2img|3.360597|||
+ |ControlNet img2img|3.360597|||
+ |ControlNet inpaint|3.360597||| -->
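+
+ Throughput figures like those above are typically collected with a warmup pass followed by timed runs. The snippet below is a minimal measurement sketch using the dynamic-graph `StableDiffusionPipeline`; the model id, image size and step count are placeholders, and the deployed Paddle Inference pipelines (e.g. `PaddleInferStableDiffusionPipeline`) are loaded from exported inference models instead, but can be timed the same way.
+
+ ```python
+ # Minimal latency-measurement sketch (assumptions: dynamic-graph pipeline, placeholder model id).
+ import time
+
+ import numpy as np
+ import paddle
+
+ from ppdiffusers import StableDiffusionPipeline
+
+ pipe = StableDiffusionPipeline.from_pretrained(
+     "runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16, safety_checker=None
+ )
+ pipe.set_progress_bar_config(disable=True)
+
+ prompt = "a photo of an astronaut riding a horse on mars"
+ pipe(prompt, num_inference_steps=10, height=512, width=512)  # warmup
+
+ latencies = []
+ for _ in range(10):
+     start = time.time()
+     pipe(prompt, num_inference_steps=50, height=512, width=512)
+     latencies.append(time.time() - start)
+ print(f"mean latency: {np.mean(latencies):.2f} s, p90: {np.percentile(latencies, 90):.2f} s")
+ ```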
PaddleMIX/ppdiffusers/ppdiffusers/__init__.py ADDED
@@ -0,0 +1,814 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from .patches import *
18
+ from .utils import (
19
+ PPDIFFUSERS_SLOW_IMPORT,
20
+ OptionalDependencyNotAvailable,
21
+ _LazyModule,
22
+ is_einops_available,
23
+ is_fastdeploy_available,
24
+ is_inflect_available,
25
+ is_k_diffusion_available,
26
+ is_k_diffusion_version,
27
+ is_librosa_available,
28
+ is_note_seq_available,
29
+ is_onnx_available,
30
+ is_paddle_available,
31
+ is_paddle_version,
32
+ is_paddlenlp_available,
33
+ is_paddlenlp_version,
34
+ is_paddlesde_available,
35
+ is_pp_invisible_watermark_available,
36
+ is_ppxformers_available,
37
+ is_scipy_available,
38
+ is_torch_available,
39
+ is_transformers_available,
40
+ is_unidecode_available,
41
+ logging,
42
+ )
43
+ from .version import VERSION as __version__
44
+
45
+ # Lazy Import based on
46
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/__init__.py
47
+
48
+ # When adding a new object to this init, please add it to `_import_structure`. The `_import_structure` is a dictionary submodule to list of object names,
49
+ # and is used to defer the actual importing for when the objects are requested.
50
+ # This way `import ppdiffusers` provides the names in the namespace without actually importing anything (and especially none of the backends).
51
+
52
+ _import_structure = {
53
+ "configuration_utils": ["ConfigMixin"],
54
+ "models": [],
55
+ "pipelines": [],
56
+ "schedulers": [],
57
+ "utils": [
58
+ "OptionalDependencyNotAvailable",
59
+ "is_inflect_available",
60
+ "is_pp_invisible_watermark_available",
61
+ "is_k_diffusion_available",
62
+ "is_k_diffusion_version",
63
+ "is_librosa_available",
64
+ "is_note_seq_available",
65
+ "is_onnx_available",
66
+ "is_fastdeploy_available",
67
+ "is_scipy_available",
68
+ "is_paddle_available",
69
+ "is_paddle_version",
70
+ "is_paddlesde_available",
71
+ "is_paddlenlp_available",
72
+ "is_paddlenlp_version",
73
+ "is_unidecode_available",
74
+ # NEW ADD
75
+ "is_ppxformers_available",
76
+ "is_einops_available",
77
+ "is_torch_available",
78
+ "is_transformers_available",
79
+ "logging",
80
+ ],
81
+ }
82
+
83
+ try:
84
+ if not is_fastdeploy_available():
85
+ raise OptionalDependencyNotAvailable()
86
+ except OptionalDependencyNotAvailable:
87
+ from .utils import dummy_fastdeploy_objects # noqa F403
88
+
89
+ _import_structure["utils.dummy_fastdeploy_objects"] = [
90
+ name for name in dir(dummy_fastdeploy_objects) if not name.startswith("_")
91
+ ]
92
+
93
+ else:
94
+ _import_structure["pipelines"].extend(
95
+ ["FastDeployRuntimeModel", "FastDeployDiffusionPipelineMixin", "FastDeployDiffusionXLPipelineMixin"]
96
+ )
97
+
98
+ try:
99
+ if not is_paddle_available():
100
+ raise OptionalDependencyNotAvailable()
101
+ except OptionalDependencyNotAvailable:
102
+ from .utils import dummy_paddle_objects # noqa F403
103
+
104
+ _import_structure["utils.dummy_paddle_objects"] = [
105
+ name for name in dir(dummy_paddle_objects) if not name.startswith("_")
106
+ ]
107
+
108
+ else:
109
+ _import_structure["models"].extend(
110
+ [
111
+ "AsymmetricAutoencoderKL",
112
+ "AutoencoderKL",
113
+ "AutoencoderKLCogVideoX",
114
+ "AutoencoderKLTemporalDecoder",
115
+ "AutoencoderTiny",
116
+ "CogVideoXTransformer3DModel",
117
+ "CogVideoXTransformer3DVCtrlModel",
118
+ "ConsistencyDecoderVAE",
119
+ "ControlNetModel",
120
+ "Kandinsky3UNet",
121
+ "ModelMixin",
122
+ "MotionAdapter",
123
+ "MultiAdapter",
124
+ "PriorTransformer",
125
+ "SD3Transformer2DModel",
126
+ "T2IAdapter",
127
+ "T5FilmDecoder",
128
+ "Transformer2DModel",
129
+ "UNet1DModel",
130
+ "UNet2DConditionModel",
131
+ "UNet2DModel",
132
+ "UNet3DConditionModel",
133
+ "UNetMotionModel",
134
+ "UNetSpatioTemporalConditionModel",
135
+ "VQModel",
136
+ "UViTT2IModel",
137
+ "DiTLLaMA2DModel",
138
+ "DiTLLaMAT2IModel",
139
+ # new add
140
+ "LVDMAutoencoderKL",
141
+ "LVDMUNet3DModel",
142
+ "PaddleInferRuntimeModel",
143
+ # new add
144
+ "AutoencoderKL_imgtovideo",
145
+ "GaussianDiffusion",
146
+ "GaussianDiffusion_SDEdit",
147
+ "STUNetModel",
148
+ "Vid2VidSTUNet",
149
+ # new add
150
+ "SD3ControlNetModel",
151
+ "SD3MultiControlNetModel",
152
+ # new add
153
+ "VCtrlModel",
154
+ ]
155
+ )
156
+
157
+ _import_structure["optimization"] = [
158
+ "get_constant_schedule",
159
+ "get_constant_schedule_with_warmup",
160
+ "get_cosine_schedule_with_warmup",
161
+ "get_cosine_with_hard_restarts_schedule_with_warmup",
162
+ "get_linear_schedule_with_warmup",
163
+ "get_polynomial_decay_schedule_with_warmup",
164
+ "get_scheduler",
165
+ ]
166
+ _import_structure["pipelines"].extend(
167
+ [
168
+ "AudioPipelineOutput",
169
+ "AutoPipelineForImage2Image",
170
+ "AutoPipelineForInpainting",
171
+ "AutoPipelineForText2Image",
172
+ "ConsistencyModelPipeline",
173
+ "CogVideoXVCtrlPipeline",
174
+ "CogVideoXVCtrlImageToVideoPipeline",
175
+ "DanceDiffusionPipeline",
176
+ "DDIMPipeline",
177
+ "DDPMPipeline",
178
+ "DiffusionPipeline",
179
+ "DiTPipeline",
180
+ "ImagePipelineOutput",
181
+ "KarrasVePipeline",
182
+ "LDMPipeline",
183
+ "LDMSuperResolutionPipeline",
184
+ "PNDMPipeline",
185
+ "RePaintPipeline",
186
+ "ScoreSdeVePipeline",
187
+ ]
188
+ )
189
+ _import_structure["schedulers"].extend(
190
+ [
191
+ "CMStochasticIterativeScheduler",
192
+ "CogVideoXDDIMScheduler",
193
+ "CogVideoXDPMScheduler",
194
+ "DDIMInverseScheduler",
195
+ "DDIMParallelScheduler",
196
+ "DDIMScheduler",
197
+ "DDPMParallelScheduler",
198
+ "DDPMScheduler",
199
+ "DDPMWuerstchenScheduler",
200
+ "DEISMultistepScheduler",
201
+ "DPMSolverMultistepInverseScheduler",
202
+ "DPMSolverMultistepScheduler",
203
+ "DPMSolverSinglestepScheduler",
204
+ "EulerAncestralDiscreteScheduler",
205
+ "EulerDiscreteScheduler",
206
+ "FlowMatchEulerDiscreteScheduler",
207
+ "HeunDiscreteScheduler",
208
+ "IPNDMScheduler",
209
+ "KarrasVeScheduler",
210
+ "ScoreSdeVpScheduler", # new add
211
+ "PreconfigEulerAncestralDiscreteScheduler", # new add
212
+ "PreconfigLMSDiscreteScheduler", # new add
213
+ "KDPM2AncestralDiscreteScheduler",
214
+ "KDPM2DiscreteScheduler",
215
+ "LCMScheduler",
216
+ "PNDMScheduler",
217
+ "RePaintScheduler",
218
+ "SchedulerMixin",
219
+ "ScoreSdeVeScheduler",
220
+ "UnCLIPScheduler",
221
+ "UniPCMultistepScheduler",
222
+ "VQDiffusionScheduler",
223
+ "EDMDPMSolverMultistepScheduler",
224
+ "EDMEulerScheduler",
225
+ ]
226
+ )
227
+ _import_structure["training_utils"] = ["EMAModel"]
228
+
229
+ try:
230
+ if not (is_paddle_available() and is_scipy_available()):
231
+ raise OptionalDependencyNotAvailable()
232
+ except OptionalDependencyNotAvailable:
233
+ from .utils import dummy_paddle_and_scipy_objects # noqa F403
234
+
235
+ _import_structure["utils.dummy_paddle_and_scipy_objects"] = [
236
+ name for name in dir(dummy_paddle_and_scipy_objects) if not name.startswith("_")
237
+ ]
238
+
239
+ else:
240
+ _import_structure["schedulers"].extend(["LMSDiscreteScheduler"])
241
+
242
+ try:
243
+ if not (is_paddle_available() and is_paddlesde_available()):
244
+ raise OptionalDependencyNotAvailable()
245
+ except OptionalDependencyNotAvailable:
246
+ from .utils import dummy_paddle_and_paddlesde_objects # noqa F403
247
+
248
+ _import_structure["utils.dummy_paddle_and_paddlesde_objects"] = [
249
+ name for name in dir(dummy_paddle_and_paddlesde_objects) if not name.startswith("_")
250
+ ]
251
+
252
+ else:
253
+ _import_structure["schedulers"].extend(["DPMSolverSDEScheduler"])
254
+
255
+ try:
256
+ if not (is_paddle_available() and is_paddlenlp_available()):
257
+ raise OptionalDependencyNotAvailable()
258
+ except OptionalDependencyNotAvailable:
259
+ from .utils import dummy_paddle_and_paddlenlp_objects # noqa F403
260
+
261
+ _import_structure["utils.dummy_paddle_and_paddlenlp_objects"] = [
262
+ name for name in dir(dummy_paddle_and_paddlenlp_objects) if not name.startswith("_")
263
+ ]
264
+
265
+ else:
266
+ _import_structure["pipelines"].extend(
267
+ [
268
+ "AltDiffusionImg2ImgPipeline",
269
+ "AltDiffusionPipeline",
270
+ "AnimateDiffPipeline",
271
+ "AudioLDM2Pipeline",
272
+ "AudioLDM2ProjectionModel",
273
+ "AudioLDM2UNet2DConditionModel",
274
+ "AudioLDMPipeline",
275
+ "BlipDiffusionControlNetPipeline",
276
+ "BlipDiffusionPipeline",
277
+ "CLIPImageProjection",
278
+ "CogVideoXPipeline",
279
+ "CycleDiffusionPipeline",
280
+ "IFImg2ImgPipeline",
281
+ "IFImg2ImgSuperResolutionPipeline",
282
+ "IFInpaintingPipeline",
283
+ "IFInpaintingSuperResolutionPipeline",
284
+ "IFPipeline",
285
+ "IFSuperResolutionPipeline",
286
+ "ImageTextPipelineOutput",
287
+ "Kandinsky3Img2ImgPipeline",
288
+ "Kandinsky3Pipeline",
289
+ "KandinskyCombinedPipeline",
290
+ "KandinskyImg2ImgCombinedPipeline",
291
+ "KandinskyImg2ImgPipeline",
292
+ "KandinskyInpaintCombinedPipeline",
293
+ "KandinskyInpaintPipeline",
294
+ "KandinskyPipeline",
295
+ "KandinskyPriorPipeline",
296
+ "KandinskyV22CombinedPipeline",
297
+ "KandinskyV22ControlnetImg2ImgPipeline",
298
+ "KandinskyV22ControlnetPipeline",
299
+ "KandinskyV22Img2ImgCombinedPipeline",
300
+ "KandinskyV22Img2ImgPipeline",
301
+ "KandinskyV22InpaintCombinedPipeline",
302
+ "KandinskyV22InpaintPipeline",
303
+ "KandinskyV22Pipeline",
304
+ "KandinskyV22PriorEmb2EmbPipeline",
305
+ "KandinskyV22PriorPipeline",
306
+ "LatentConsistencyModelImg2ImgPipeline",
307
+ "LatentConsistencyModelPipeline",
308
+ "LDMTextToImagePipeline",
309
+ "LDMTextToImageUViTPipeline",
310
+ "LDMTextToImageLargeDiTPipeline",
311
+ "MusicLDMPipeline",
312
+ "PaintByExamplePipeline",
313
+ "PixArtAlphaPipeline",
314
+ "SemanticStableDiffusionPipeline",
315
+ "ShapEImg2ImgPipeline",
316
+ "ShapEPipeline",
317
+ "StableDiffusion3ControlNetInpaintingPipeline",
318
+ "StableDiffusion3ControlNetPipeline",
319
+ "StableDiffusion3Img2ImgPipeline",
320
+ "StableDiffusion3Pipeline",
321
+ "StableDiffusionAdapterPipeline",
322
+ "StableDiffusionAttendAndExcitePipeline",
323
+ "StableDiffusionControlNetImg2ImgPipeline",
324
+ "StableDiffusionControlNetInpaintPipeline",
325
+ "StableDiffusionControlNetPipeline",
326
+ "StableDiffusionDepth2ImgPipeline",
327
+ "StableDiffusionDiffEditPipeline",
328
+ "StableDiffusionGLIGENPipeline",
329
+ "StableDiffusionGLIGENTextImagePipeline",
330
+ "StableDiffusionImageVariationPipeline",
331
+ "StableDiffusionImg2ImgPipeline",
332
+ "StableDiffusionInpaintPipeline",
333
+ "StableDiffusionInpaintPipelineLegacy",
334
+ "StableDiffusionInstructPix2PixPipeline",
335
+ "StableDiffusionLatentUpscalePipeline",
336
+ "StableDiffusionLDM3DPipeline",
337
+ "StableDiffusionModelEditingPipeline",
338
+ "StableDiffusionPanoramaPipeline",
339
+ "StableDiffusionParadigmsPipeline",
340
+ "StableDiffusionPipeline",
341
+ "StableDiffusionPipelineSafe",
342
+ "StableDiffusionPix2PixZeroPipeline",
343
+ "StableDiffusionSAGPipeline",
344
+ "StableDiffusionUpscalePipeline",
345
+ "StableDiffusionXLAdapterPipeline",
346
+ "StableDiffusionXLControlNetImg2ImgPipeline",
347
+ "StableDiffusionXLControlNetInpaintPipeline",
348
+ "StableDiffusionXLControlNetPipeline",
349
+ "StableDiffusionXLImg2ImgPipeline",
350
+ "StableDiffusionXLInpaintPipeline",
351
+ "StableDiffusionXLInstructPix2PixPipeline",
352
+ "StableDiffusionXLPipeline",
353
+ "StableUnCLIPImg2ImgPipeline",
354
+ "StableUnCLIPPipeline",
355
+ "StableDiffusionSafetyChecker",
356
+ "StableVideoDiffusionPipeline",
357
+ "TextToVideoSDPipeline",
358
+ "TextToVideoZeroPipeline",
359
+ "TextToVideoZeroSDXLPipeline",
360
+ "UnCLIPImageVariationPipeline",
361
+ "UnCLIPPipeline",
362
+ "UniDiffuserModel",
363
+ "UniDiffuserPipeline",
364
+ "UniDiffuserTextDecoder",
365
+ "VersatileDiffusionDualGuidedPipeline",
366
+ "VersatileDiffusionImageVariationPipeline",
367
+ "VersatileDiffusionPipeline",
368
+ "VersatileDiffusionTextToImagePipeline",
369
+ "VideoToVideoSDPipeline",
370
+ "VQDiffusionPipeline",
371
+ "WuerstchenCombinedPipeline",
372
+ "WuerstchenDecoderPipeline",
373
+ "WuerstchenPriorPipeline",
374
+ # new add
375
+ "LVDMTextToVideoPipeline",
376
+ "LVDMUncondPipeline",
377
+ "PaddleInferCycleDiffusionPipeline",
378
+ "PaddleInferStableDiffusionImg2ImgPipeline",
379
+ "PaddleInferStableDiffusionInpaintPipeline",
380
+ "PaddleInferStableDiffusionInpaintPipelineLegacy",
381
+ "PaddleInferStableDiffusionMegaPipeline",
382
+ "PaddleInferStableDiffusionPipeline",
383
+ "PaddleInferStableDiffusionUpscalePipeline",
384
+ "PaddleInferStableDiffusionXLPipeline",
385
+ "PaddleInferStableDiffusionXLImg2ImgPipeline",
386
+ "PaddleInferStableDiffusionXLInpaintPipeline",
387
+ "PaddleInferStableDiffusionXLInstructPix2PixPipeline",
388
+ "PaddleInferStableDiffusionXLMegaPipeline",
389
+ "PaddleInferStableDiffusionControlNetPipeline",
390
+ "PaddleInferStableVideoDiffusionPipeline",
391
+ # new add
392
+ "ImgToVideoSDPipeline",
393
+ "VideoToVideoModelscopePipeline",
394
+ ]
395
+ )
396
+
397
+ try:
398
+ if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()):
399
+ raise OptionalDependencyNotAvailable()
400
+ except OptionalDependencyNotAvailable:
401
+ from .utils import dummy_paddle_and_paddlenlp_and_k_diffusion_objects # noqa F403
402
+
403
+ _import_structure["utils.dummy_paddle_and_paddlenlp_and_k_diffusion_objects"] = [
404
+ name for name in dir(dummy_paddle_and_paddlenlp_and_k_diffusion_objects) if not name.startswith("_")
405
+ ]
406
+
407
+ else:
408
+ _import_structure["pipelines"].extend(["StableDiffusionKDiffusionPipeline"])
409
+
410
+ try:
411
+ if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()):
412
+ raise OptionalDependencyNotAvailable()
413
+ except OptionalDependencyNotAvailable:
414
+ from .utils import dummy_paddle_and_paddlenlp_and_fastdeploy_objects # noqa F403
415
+
416
+ _import_structure["utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects"] = [
417
+ name for name in dir(dummy_paddle_and_paddlenlp_and_fastdeploy_objects) if not name.startswith("_")
418
+ ]
419
+
420
+ else:
421
+ _import_structure["pipelines"].extend(
422
+ [
423
+ "FastDeployStableDiffusionImg2ImgPipeline",
424
+ "FastDeployStableDiffusionInpaintPipeline",
425
+ "FastDeployStableDiffusionInpaintPipelineLegacy",
426
+ "FastDeployStableDiffusionPipeline",
427
+ "FastDeployStableDiffusionMegaPipeline",
428
+ "FastDeployCycleDiffusionPipeline",
429
+ "FastDeployStableDiffusionControlNetPipeline",
430
+ "FastDeployStableDiffusionUpscalePipeline",
431
+ ]
432
+ )
433
+
434
+ try:
435
+ if not (is_paddle_available() and is_librosa_available()):
436
+ raise OptionalDependencyNotAvailable()
437
+ except OptionalDependencyNotAvailable:
438
+ from .utils import dummy_paddle_and_librosa_objects # noqa F403
439
+
440
+ _import_structure["utils.dummy_paddle_and_librosa_objects"] = [
441
+ name for name in dir(dummy_paddle_and_librosa_objects) if not name.startswith("_")
442
+ ]
443
+
444
+ else:
445
+ _import_structure["pipelines"].extend(["AudioDiffusionPipeline", "Mel"])
446
+
447
+ try:
448
+ if not (is_paddlenlp_available() and is_paddle_available() and is_note_seq_available()):
449
+ raise OptionalDependencyNotAvailable()
450
+ except OptionalDependencyNotAvailable:
451
+ from .utils import dummy_paddle_and_paddlenlp_and_note_seq_objects # noqa F403
452
+
453
+ _import_structure["utils.dummy_paddle_and_paddlenlp_and_note_seq_objects"] = [
454
+ name for name in dir(dummy_paddle_and_paddlenlp_and_note_seq_objects) if not name.startswith("_")
455
+ ]
456
+
457
+
458
+ else:
459
+ _import_structure["pipelines"].extend(["SpectrogramDiffusionPipeline"])
460
+
461
+ try:
462
+ if not (is_note_seq_available()):
463
+ raise OptionalDependencyNotAvailable()
464
+ except OptionalDependencyNotAvailable:
465
+ from .utils import dummy_note_seq_objects # noqa F403
466
+
467
+ _import_structure["utils.dummy_note_seq_objects"] = [
468
+ name for name in dir(dummy_note_seq_objects) if not name.startswith("_")
469
+ ]
470
+
471
+
472
+ else:
473
+ _import_structure["pipelines"].extend(["MidiProcessor"])
474
+
475
+ if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
476
+ from .configuration_utils import ConfigMixin
477
+
478
+ try:
479
+ if not is_fastdeploy_available():
480
+ raise OptionalDependencyNotAvailable()
481
+ except OptionalDependencyNotAvailable:
482
+ from .utils.dummy_fastdeploy_objects import * # noqa F403
483
+ else:
484
+ from .pipelines import (
485
+ FastDeployDiffusionPipelineMixin,
486
+ FastDeployDiffusionXLPipelineMixin,
487
+ FastDeployRuntimeModel,
488
+ )
489
+
490
+ try:
491
+ if not is_paddle_available():
492
+ raise OptionalDependencyNotAvailable()
493
+ except OptionalDependencyNotAvailable:
494
+ from .utils.dummy_paddle_objects import * # noqa F403
495
+ else:
496
+ from .models import ( # new add
497
+ AsymmetricAutoencoderKL,
498
+ AutoencoderKL,
499
+ AutoencoderKL_imgtovideo,
500
+ AutoencoderKLCogVideoX,
501
+ AutoencoderKLTemporalDecoder,
502
+ AutoencoderTiny,
503
+ CogVideoXTransformer3DModel,
504
+ CogVideoXTransformer3DVCtrlModel,
505
+ ConsistencyDecoderVAE,
506
+ ControlNetModel,
507
+ DiTLLaMA2DModel,
508
+ DiTLLaMAT2IModel,
509
+ GaussianDiffusion,
510
+ GaussianDiffusion_SDEdit,
511
+ Kandinsky3UNet,
512
+ LVDMAutoencoderKL,
513
+ LVDMUNet3DModel,
514
+ ModelMixin,
515
+ MotionAdapter,
516
+ MultiAdapter,
517
+ PaddleInferRuntimeModel,
518
+ PriorTransformer,
519
+ SD3ControlNetModel,
520
+ SD3MultiControlNetModel,
521
+ SD3Transformer2DModel,
522
+ STUNetModel,
523
+ T2IAdapter,
524
+ T5FilmDecoder,
525
+ Transformer2DModel,
526
+ UNet1DModel,
527
+ UNet2DConditionModel,
528
+ UNet2DModel,
529
+ UNet3DConditionModel,
530
+ UNetMotionModel,
531
+ UNetSpatioTemporalConditionModel,
532
+ UViTT2IModel,
533
+ VCtrlModel,
534
+ Vid2VidSTUNet,
535
+ VQModel,
536
+ )
537
+ from .optimization import (
538
+ get_constant_schedule,
539
+ get_constant_schedule_with_warmup,
540
+ get_cosine_schedule_with_warmup,
541
+ get_cosine_with_hard_restarts_schedule_with_warmup,
542
+ get_linear_schedule_with_warmup,
543
+ get_polynomial_decay_schedule_with_warmup,
544
+ get_scheduler,
545
+ )
546
+ from .pipelines import ( # new add
547
+ AudioPipelineOutput,
548
+ AutoPipelineForImage2Image,
549
+ AutoPipelineForInpainting,
550
+ AutoPipelineForText2Image,
551
+ BlipDiffusionControlNetPipeline,
552
+ BlipDiffusionPipeline,
553
+ CogVideoXVCtrlImageToVideoPipeline,
554
+ CogVideoXVCtrlPipeline,
555
+ ConsistencyModelPipeline,
556
+ DanceDiffusionPipeline,
557
+ DDIMPipeline,
558
+ DDPMPipeline,
559
+ DiffusionPipeline,
560
+ DiTPipeline,
561
+ ImagePipelineOutput,
562
+ ImgToVideoSDPipeline,
563
+ KarrasVePipeline,
564
+ LDMPipeline,
565
+ LDMSuperResolutionPipeline,
566
+ PNDMPipeline,
567
+ RePaintPipeline,
568
+ ScoreSdeVePipeline,
569
+ VideoToVideoModelscopePipeline,
570
+ )
571
+ from .schedulers import (
572
+ CMStochasticIterativeScheduler,
573
+ CogVideoXDDIMScheduler,
574
+ CogVideoXDPMScheduler,
575
+ DDIMInverseScheduler,
576
+ DDIMParallelScheduler,
577
+ DDIMScheduler,
578
+ DDPMParallelScheduler,
579
+ DDPMScheduler,
580
+ DDPMWuerstchenScheduler,
581
+ DEISMultistepScheduler,
582
+ DPMSolverMultistepInverseScheduler,
583
+ DPMSolverMultistepScheduler,
584
+ DPMSolverSinglestepScheduler,
585
+ EDMDPMSolverMultistepScheduler,
586
+ EDMEulerScheduler,
587
+ EulerAncestralDiscreteScheduler,
588
+ EulerDiscreteScheduler,
589
+ FlowMatchEulerDiscreteScheduler,
590
+ HeunDiscreteScheduler,
591
+ IPNDMScheduler,
592
+ KarrasVeScheduler,
593
+ KDPM2AncestralDiscreteScheduler,
594
+ KDPM2DiscreteScheduler,
595
+ LCMScheduler,
596
+ PNDMScheduler,
597
+ PreconfigEulerAncestralDiscreteScheduler,
598
+ PreconfigLMSDiscreteScheduler,
599
+ RePaintScheduler,
600
+ SchedulerMixin,
601
+ ScoreSdeVeScheduler,
602
+ ScoreSdeVpScheduler,
603
+ UnCLIPScheduler,
604
+ UniPCMultistepScheduler,
605
+ VQDiffusionScheduler,
606
+ )
607
+ from .training_utils import EMAModel
608
+
609
+ try:
610
+ if not (is_paddle_available() and is_scipy_available()):
611
+ raise OptionalDependencyNotAvailable()
612
+ except OptionalDependencyNotAvailable:
613
+ from .utils.dummy_paddle_and_scipy_objects import * # noqa F403
614
+ else:
615
+ from .schedulers import LMSDiscreteScheduler
616
+
617
+ try:
618
+ if not (is_paddle_available() and is_paddlesde_available()):
619
+ raise OptionalDependencyNotAvailable()
620
+ except OptionalDependencyNotAvailable:
621
+ from .utils.dummy_paddle_and_paddlesde_objects import * # noqa F403
622
+ else:
623
+ from .schedulers import DPMSolverSDEScheduler
624
+
625
+ try:
626
+ if not (is_paddle_available() and is_paddlenlp_available()):
627
+ raise OptionalDependencyNotAvailable()
628
+ except OptionalDependencyNotAvailable:
629
+ from .utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403
630
+ else:
631
+ from .pipelines import ( # new add
632
+ AltDiffusionImg2ImgPipeline,
633
+ AltDiffusionPipeline,
634
+ AnimateDiffPipeline,
635
+ AudioLDM2Pipeline,
636
+ AudioLDM2ProjectionModel,
637
+ AudioLDM2UNet2DConditionModel,
638
+ AudioLDMPipeline,
639
+ CLIPImageProjection,
640
+ CogVideoXPipeline,
641
+ CycleDiffusionPipeline,
642
+ IFImg2ImgPipeline,
643
+ IFImg2ImgSuperResolutionPipeline,
644
+ IFInpaintingPipeline,
645
+ IFInpaintingSuperResolutionPipeline,
646
+ IFPipeline,
647
+ IFSuperResolutionPipeline,
648
+ ImageTextPipelineOutput,
649
+ Kandinsky3Img2ImgPipeline,
650
+ Kandinsky3Pipeline,
651
+ KandinskyCombinedPipeline,
652
+ KandinskyImg2ImgCombinedPipeline,
653
+ KandinskyImg2ImgPipeline,
654
+ KandinskyInpaintCombinedPipeline,
655
+ KandinskyInpaintPipeline,
656
+ KandinskyPipeline,
657
+ KandinskyPriorPipeline,
658
+ KandinskyV22CombinedPipeline,
659
+ KandinskyV22ControlnetImg2ImgPipeline,
660
+ KandinskyV22ControlnetPipeline,
661
+ KandinskyV22Img2ImgCombinedPipeline,
662
+ KandinskyV22Img2ImgPipeline,
663
+ KandinskyV22InpaintCombinedPipeline,
664
+ KandinskyV22InpaintPipeline,
665
+ KandinskyV22Pipeline,
666
+ KandinskyV22PriorEmb2EmbPipeline,
667
+ KandinskyV22PriorPipeline,
668
+ LatentConsistencyModelImg2ImgPipeline,
669
+ LatentConsistencyModelPipeline,
670
+ LDMTextToImageLargeDiTPipeline,
671
+ LDMTextToImagePipeline,
672
+ LDMTextToImageUViTPipeline,
673
+ LVDMTextToVideoPipeline,
674
+ LVDMUncondPipeline,
675
+ MusicLDMPipeline,
676
+ PaddleInferCycleDiffusionPipeline,
677
+ PaddleInferStableDiffusionControlNetPipeline,
678
+ PaddleInferStableDiffusionImg2ImgPipeline,
679
+ PaddleInferStableDiffusionInpaintPipeline,
680
+ PaddleInferStableDiffusionInpaintPipelineLegacy,
681
+ PaddleInferStableDiffusionMegaPipeline,
682
+ PaddleInferStableDiffusionPipeline,
683
+ PaddleInferStableDiffusionXLImg2ImgPipeline,
684
+ PaddleInferStableDiffusionXLInpaintPipeline,
685
+ PaddleInferStableDiffusionXLInstructPix2PixPipeline,
686
+ PaddleInferStableDiffusionXLMegaPipeline,
687
+ PaddleInferStableDiffusionXLPipeline,
688
+ PaddleInferStableVideoDiffusionPipeline,
689
+ PaintByExamplePipeline,
690
+ PixArtAlphaPipeline,
691
+ SemanticStableDiffusionPipeline,
692
+ ShapEImg2ImgPipeline,
693
+ ShapEPipeline,
694
+ StableDiffusion3ControlNetPipeline,
695
+ StableDiffusion3Img2ImgPipeline,
696
+ StableDiffusion3Pipeline,
697
+ StableDiffusionAdapterPipeline,
698
+ StableDiffusionAttendAndExcitePipeline,
699
+ StableDiffusionControlNetImg2ImgPipeline,
700
+ StableDiffusionControlNetInpaintPipeline,
701
+ StableDiffusionControlNetPipeline,
702
+ StableDiffusionDepth2ImgPipeline,
703
+ StableDiffusionDiffEditPipeline,
704
+ StableDiffusionGLIGENPipeline,
705
+ StableDiffusionGLIGENTextImagePipeline,
706
+ StableDiffusionImageVariationPipeline,
707
+ StableDiffusionImg2ImgPipeline,
708
+ StableDiffusionInpaintPipeline,
709
+ StableDiffusionInpaintPipelineLegacy,
710
+ StableDiffusionInstructPix2PixPipeline,
711
+ StableDiffusionLatentUpscalePipeline,
712
+ StableDiffusionLDM3DPipeline,
713
+ StableDiffusionModelEditingPipeline,
714
+ StableDiffusionPanoramaPipeline,
715
+ StableDiffusionParadigmsPipeline,
716
+ StableDiffusionPipeline,
717
+ StableDiffusionPipelineSafe,
718
+ StableDiffusionPix2PixZeroPipeline,
719
+ StableDiffusionSafetyChecker,
720
+ StableDiffusionSAGPipeline,
721
+ StableDiffusionUpscalePipeline,
722
+ StableDiffusionXLAdapterPipeline,
723
+ StableDiffusionXLControlNetImg2ImgPipeline,
724
+ StableDiffusionXLControlNetInpaintPipeline,
725
+ StableDiffusionXLControlNetPipeline,
726
+ StableDiffusionXLImg2ImgPipeline,
727
+ StableDiffusionXLInpaintPipeline,
728
+ StableDiffusionXLInstructPix2PixPipeline,
729
+ StableDiffusionXLPipeline,
730
+ StableUnCLIPImg2ImgPipeline,
731
+ StableUnCLIPPipeline,
732
+ StableVideoDiffusionPipeline,
733
+ TextToVideoSDPipeline,
734
+ TextToVideoZeroPipeline,
735
+ TextToVideoZeroSDXLPipeline,
736
+ UnCLIPImageVariationPipeline,
737
+ UnCLIPPipeline,
738
+ UniDiffuserModel,
739
+ UniDiffuserPipeline,
740
+ UniDiffuserTextDecoder,
741
+ VersatileDiffusionDualGuidedPipeline,
742
+ VersatileDiffusionImageVariationPipeline,
743
+ VersatileDiffusionPipeline,
744
+ VersatileDiffusionTextToImagePipeline,
745
+ VideoToVideoSDPipeline,
746
+ VQDiffusionPipeline,
747
+ WuerstchenCombinedPipeline,
748
+ WuerstchenDecoderPipeline,
749
+ WuerstchenPriorPipeline,
750
+ )
751
+
752
+ try:
753
+ if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()):
754
+ raise OptionalDependencyNotAvailable()
755
+ except OptionalDependencyNotAvailable:
756
+ from .utils.dummy_paddle_and_paddlenlp_and_k_diffusion_objects import * # noqa F403
757
+ else:
758
+ from .pipelines import StableDiffusionKDiffusionPipeline
759
+
760
+ try:
761
+ if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()):
762
+ raise OptionalDependencyNotAvailable()
763
+ except OptionalDependencyNotAvailable:
764
+ from .utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403
765
+ else:
766
+ from .pipelines import (
767
+ FastDeployCycleDiffusionPipeline,
768
+ FastDeployStableDiffusionControlNetPipeline,
769
+ FastDeployStableDiffusionImg2ImgPipeline,
770
+ FastDeployStableDiffusionInpaintPipeline,
771
+ FastDeployStableDiffusionInpaintPipelineLegacy,
772
+ FastDeployStableDiffusionMegaPipeline,
773
+ FastDeployStableDiffusionPipeline,
774
+ FastDeployStableDiffusionUpscalePipeline,
775
+ FastDeployStableDiffusionXLImg2ImgPipeline,
776
+ FastDeployStableDiffusionXLInpaintPipeline,
777
+ FastDeployStableDiffusionXLInstructPix2PixPipeline,
778
+ FastDeployStableDiffusionXLPipeline,
779
+ )
780
+
781
+ try:
782
+ if not (is_paddle_available() and is_librosa_available()):
783
+ raise OptionalDependencyNotAvailable()
784
+ except OptionalDependencyNotAvailable:
785
+ from .utils.dummy_paddle_and_librosa_objects import * # noqa F403
786
+ else:
787
+ from .pipelines import AudioDiffusionPipeline, Mel
788
+
789
+ try:
790
+ if not (is_paddlenlp_available() and is_paddle_available() and is_note_seq_available()):
791
+ raise OptionalDependencyNotAvailable()
792
+ except OptionalDependencyNotAvailable:
793
+ from .utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * # noqa F403
794
+ else:
795
+ from .pipelines import SpectrogramDiffusionPipeline
796
+
797
+ try:
798
+ if not (is_note_seq_available()):
799
+ raise OptionalDependencyNotAvailable()
800
+ except OptionalDependencyNotAvailable:
801
+ from .utils.dummy_note_seq_objects import * # noqa F403
802
+ else:
803
+ from .pipelines import MidiProcessor
804
+
805
+ else:
806
+ import sys
807
+
808
+ sys.modules[__name__] = _LazyModule(
809
+ __name__,
810
+ globals()["__file__"],
811
+ _import_structure,
812
+ module_spec=__spec__,
813
+ extra_objects={"__version__": __version__},
814
+ )
PaddleMIX/ppdiffusers/ppdiffusers/accelerate/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "0.25.0"
16
+
17
+ from .accelerator import Accelerator
18
+ from .state import PartialState
19
+ from .utils import (
20
+ AutocastKwargs,
21
+ DistributedDataParallelKwargs,
22
+ DistributedType,
23
+ GradScalerKwargs,
24
+ InitProcessGroupKwargs,
25
+ find_executable_batch_size,
26
+ is_rich_available,
27
+ )
28
+
29
+ if is_rich_available():
30
+ from .utils import rich
PaddleMIX/ppdiffusers/ppdiffusers/accelerate/logging.py ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import functools
16
+ import logging
17
+ import os
18
+
19
+ from .state import PartialState
20
+
21
+
22
+ class MultiProcessAdapter(logging.LoggerAdapter):
23
+ """
24
+ An adapter to assist with logging in multiprocess.
25
+
26
+ `log` takes in an additional `main_process_only` kwarg, which dictates whether it should be called on all processes
27
+ or only the main executed one. Default is `main_process_only=True`.
28
+
29
+ Does not require an `Accelerator` object to be created first.
30
+ """
31
+
32
+ @staticmethod
33
+ def _should_log(main_process_only):
34
+ "Check if log should be performed"
35
+ state = PartialState()
36
+ return not main_process_only or (main_process_only and state.is_main_process)
37
+
38
+ def log(self, level, msg, *args, **kwargs):
39
+ """
40
+ Delegates logger call after checking if we should log.
41
+
42
+ Accepts a new kwarg of `main_process_only`, which will dictate whether it will be logged across all processes
43
+ or only the main executed one. Default is `True` if not passed
44
+
45
+ Also accepts "in_order", which if `True` makes the processes log one by one, in order. This is much easier to
46
+ read, but comes at the cost of sometimes needing to wait for the other processes. Default is `False` to not
47
+ break with the previous behavior.
48
+
49
+ `in_order` is ignored if `main_process_only` is passed.
50
+ """
51
+ if PartialState._shared_state == {}:
52
+ raise RuntimeError(
53
+ "You must initialize the accelerate state by calling either `PartialState()` or `Accelerator()` before using the logging utility."
54
+ )
55
+ main_process_only = kwargs.pop("main_process_only", True)
56
+ in_order = kwargs.pop("in_order", False)
57
+
58
+ if self.isEnabledFor(level):
59
+ if self._should_log(main_process_only):
60
+ msg, kwargs = self.process(msg, kwargs)
61
+ self.logger.log(level, msg, *args, **kwargs)
62
+
63
+ elif in_order:
64
+ state = PartialState()
65
+ for i in range(state.num_processes):
66
+ if i == state.process_index:
67
+ msg, kwargs = self.process(msg, kwargs)
68
+ self.logger.log(level, msg, *args, **kwargs)
69
+ state.wait_for_everyone()
70
+
71
+ @functools.lru_cache(None)
72
+ def warning_once(self, *args, **kwargs):
73
+ """
74
+ This method is identical to `logger.warning()`, but will emit the warning with the same message only once
75
+
76
+ Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the
77
+ cache. The assumption here is that all warning messages are unique across the code. If they aren't, we would need to
78
+ switch to another type of cache that includes the caller frame information in the hashing function.
79
+ """
80
+ self.warning(*args, **kwargs)
81
+
82
+
83
+ def get_logger(name: str, log_level: str = None):
84
+ """
85
+ Returns a `logging.Logger` for `name` that can handle multiprocessing.
86
+
87
+ If a log should be called on all processes, pass `main_process_only=False` If a log should be called on all
88
+ processes and in order, also pass `in_order=True`
89
+
90
+ Args:
91
+ name (`str`):
92
+ The name for the logger, such as `__file__`
93
+ log_level (`str`, *optional*):
94
+ The log level to use. If not passed, defaults to the `ACCELERATE_LOG_LEVEL` environment variable; if that is also unset, the logger's level is left unchanged.
95
+
96
+ Example:
97
+
98
+ ```python
99
+ >>> from accelerate.logging import get_logger
100
+ >>> from accelerate import Accelerator
101
+
102
+ >>> logger = get_logger(__name__)
103
+
104
+ >>> accelerator = Accelerator()
105
+ >>> logger.info("My log", main_process_only=False)
106
+ >>> logger.debug("My log", main_process_only=True)
107
+
108
+ >>> logger = get_logger(__name__, log_level="DEBUG")
109
+ >>> logger.info("My log")
110
+ >>> logger.debug("My second log")
111
+
112
+ >>> array = ["a", "b", "c", "d"]
113
+ >>> letter_at_rank = array[accelerator.process_index]
114
+ >>> logger.info(letter_at_rank, in_order=True)
115
+ ```
116
+ """
117
+ if log_level is None:
118
+ log_level = os.environ.get("ACCELERATE_LOG_LEVEL", None)
119
+ logger = logging.getLogger(name)
120
+ if log_level is not None:
121
+ logger.setLevel(log_level.upper())
122
+ logger.root.setLevel(log_level.upper())
123
+ return MultiProcessAdapter(logger, {})
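A short usage sketch for the adapter above. The accelerate state must be initialized (for example by constructing `Accelerator()`) before the first log call, otherwise `log` raises the `RuntimeError` defined earlier in this file:

from ppdiffusers.accelerate import Accelerator
from ppdiffusers.accelerate.logging import get_logger

accelerator = Accelerator()                      # initializes PartialState
logger = get_logger(__name__, log_level="INFO")

logger.info("main process only")                 # main_process_only defaults to True
logger.info("every process", main_process_only=False)
logger.warning_once("emitted a single time, even inside a training loop")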
PaddleMIX/ppdiffusers/ppdiffusers/accelerate/optimizer.py ADDED
@@ -0,0 +1,180 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import warnings
17
+
18
+ import paddle
19
+ import paddle.optimizer
20
+
21
+ from .state import AcceleratorState, GradientState
22
+ from .utils import honor_type
23
+
24
+
25
+ def move_to_device(state, device):
26
+ if isinstance(state, (list, tuple)):
27
+ return honor_type(state, (move_to_device(t, device) for t in state))
28
+ elif isinstance(state, dict):
29
+ return type(state)({k: move_to_device(v, device) for k, v in state.items()})
30
+ elif isinstance(state, paddle.Tensor):
31
+ return state.to(device)
32
+ return state
33
+
34
+
35
+ class AcceleratedOptimizer(paddle.optimizer.Optimizer):
36
+ """
37
+ Internal wrapper around a Paddle optimizer.
38
+
39
+ Conditionally will perform `step` and `zero_grad` if gradients should be synchronized when performing gradient
40
+ accumulation.
41
+
42
+ Args:
43
+ optimizer (`paddle.optimizer.Optimizer`):
44
+ The optimizer to wrap.
45
+ device_placement (`bool`, *optional*, defaults to `True`):
46
+ Whether or not the optimizer should handle device placement. If so, it will place the state dictionary of
47
+ `optimizer` on the right device.
48
+ scaler (`paddle.amp.GradScaler`, *optional*):
49
+ The scaler to use in the step function if training with mixed precision.
50
+ """
51
+
52
+ def __init__(self, optimizer, device_placement=True, scaler=None):
53
+ self.optimizer = optimizer
54
+ self.scaler = scaler
55
+ self.accelerator_state = AcceleratorState()
56
+ self.gradient_state = GradientState()
57
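+ # NOTE: device placement is unconditionally disabled in this Paddle port; the "device_placement" argument is effectively ignored.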
+ device_placement = False
58
+ self.device_placement = device_placement
59
+ self._is_overflow = False
60
+
61
+ if self.scaler is not None:
62
+ self._accelerate_step_called = False
63
+ self._optimizer_original_step_method = self.optimizer.step
64
+ self._optimizer_patched_step_method = patch_optimizer_step(self, self.optimizer.step)
65
+
66
+ # Handle device placement
67
+ if device_placement:
68
+ state_dict = self.optimizer.state_dict()
69
+ self.optimizer.set_state_dict(state_dict)
70
+
71
+ @property
72
+ def state(self):
73
+ return self.optimizer.state
74
+
75
+ @state.setter
76
+ def state(self, state):
77
+ self.optimizer.state = state
78
+
79
+ @property
80
+ def param_groups(self):
81
+ return self.optimizer._param_groups
82
+
83
+ @param_groups.setter
84
+ def param_groups(self, param_groups):
85
+ self.optimizer._param_groups = param_groups
86
+
87
+ @property
88
+ def defaults(self):
89
+ return self.optimizer.defaults
90
+
91
+ @defaults.setter
92
+ def defaults(self, defaults):
93
+ self.optimizer.defaults = defaults
94
+
95
+ def add_param_group(self, param_group):
96
+ self.optimizer.add_param_group(param_group)
97
+
98
+ def load_state_dict(self, state_dict):
99
+ self.optimizer.set_state_dict(state_dict)
100
+
101
+ set_state_dict = load_state_dict
102
+
103
+ def state_dict(self):
104
+ return self.optimizer.state_dict()
105
+
106
+ def zero_grad(self, set_to_zero=None):
107
+ if self.gradient_state.sync_gradients:
108
+ accept_arg = "set_to_zero" in inspect.signature(self.optimizer.clear_grad).parameters
109
+ if accept_arg:
110
+ if set_to_zero is None:
111
+ set_to_zero = True
112
+ self.optimizer.clear_grad(set_to_zero=set_to_zero)
113
+ else:
114
+ if set_to_zero is not None:
115
+ raise ValueError("`set_to_zero` for Optimizer.clear_grad` is not supported by this optimizer.")
116
+ self.optimizer.clear_grad()
117
+
118
+ clear_grad = zero_grad
119
+
120
+ def step(self, closure=None):
121
+ if self.gradient_state.sync_gradients:
122
+ if self.scaler is not None:
123
+ self.optimizer.step = self._optimizer_patched_step_method
124
+
125
+ self.scaler.step(self.optimizer)
126
+ self.scaler.update()
127
+
128
+ if not self._accelerate_step_called:
129
+ # If the optimizer step was skipped, gradient overflow was detected.
130
+ self._is_overflow = True
131
+ else:
132
+ self._is_overflow = False
133
+ # Reset the step method to the original one
134
+ self.optimizer.step = self._optimizer_original_step_method
135
+ # Reset the indicator
136
+ self._accelerate_step_called = False
137
+ else:
138
+ self.optimizer.step()
139
+
140
+ def _switch_parameters(self, parameters_map):
141
+ for param_group in self.param_groups:
142
+ param_group["params"] = [parameters_map.get(p, p) for p in param_group["params"]]
143
+
144
+ @property
145
+ def is_overflow(self):
146
+ """Whether or not the optimizer step was done, or skipped because of gradient overflow."""
147
+ warnings.warn(
148
+ "The `is_overflow` property is deprecated and will be removed in version 1.0 of Accelerate use "
149
+ "`optimizer.step_was_skipped` instead.",
150
+ FutureWarning,
151
+ )
152
+ return self._is_overflow
153
+
154
+ @property
155
+ def step_was_skipped(self):
156
+ """Whether or not the optimizer step was skipped."""
157
+ return self._is_overflow
158
+
159
+ def __getstate__(self):
160
+ _ignored_keys = [
161
+ "_accelerate_step_called",
162
+ "_optimizer_original_step_method",
163
+ "_optimizer_patched_step_method",
164
+ ]
165
+ return {k: v for k, v in self.__dict__.items() if k not in _ignored_keys}
166
+
167
+ def __setstate__(self, state):
168
+ self.__dict__.update(state)
169
+ if self.scaler is not None:
170
+ self._accelerate_step_called = False
171
+ self._optimizer_original_step_method = self.optimizer.step
172
+ self._optimizer_patched_step_method = patch_optimizer_step(self, self.optimizer.step)
173
+
174
+
175
+ def patch_optimizer_step(accelerated_optimizer: AcceleratedOptimizer, method):
176
+ def patched_step(*args, **kwargs):
177
+ accelerated_optimizer._accelerate_step_called = True
178
+ return method(*args, **kwargs)
179
+
180
+ return patched_step
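A hedged end-to-end sketch of the wrapper above with mixed precision, assuming a GPU environment for AMP and that the vendored `Accelerator` can be constructed directly to set up `AcceleratorState`/`GradientState`:

import paddle
from ppdiffusers.accelerate import Accelerator
from ppdiffusers.accelerate.optimizer import AcceleratedOptimizer

accelerator = Accelerator()                                   # sets up the shared state singletons
model = paddle.nn.Linear(8, 8)
scaler = paddle.amp.GradScaler()
opt = AcceleratedOptimizer(paddle.optimizer.AdamW(parameters=model.parameters()), scaler=scaler)

loss = model(paddle.randn([4, 8])).mean()
scaler.scale(loss).backward()
opt.step()                                                    # routes through scaler.step / scaler.update
if opt.step_was_skipped:                                      # True when the scaler detected an overflow
    print("gradient overflow detected; optimizer step skipped")
opt.zero_grad()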
PaddleMIX/ppdiffusers/ppdiffusers/accelerate/scheduler.py ADDED
@@ -0,0 +1,96 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # We ignore warnings about stepping the scheduler since we step it ourselves during gradient accumulation
16
+
17
+
18
+ from .state import AcceleratorState, GradientState
19
+
20
+
21
+ class AcceleratedScheduler:
22
+ """
23
+ A wrapper around a learning rate scheduler that will only step when the optimizer(s) have a training step. Useful
24
+ to avoid making a scheduler step too fast when gradients overflowed and there was no training step (in mixed
25
+ precision training)
26
+
27
+ When performing gradient accumulation, scheduler lengths should not be changed accordingly; Accelerate will always
28
+ step the scheduler to account for it.
29
+
30
+ Args:
31
+ scheduler (`paddle.optimizer.lr.LRScheduler`):
32
+ The scheduler to wrap.
33
+ optimizers (one or a list of `paddle.optimizer.Optimizer`):
34
+ The optimizers used.
35
+ step_with_optimizer (`bool`, *optional*, defaults to `True`):
36
+ Whether or not the scheduler should be stepped at each optimizer step.
37
+ split_batches (`bool`, *optional*, defaults to `False`):
38
+ Whether or not the dataloaders split one batch across the different processes (so batch size is the same
39
+ regardless of the number of processes) or create batches on each process (so batch size is the original
40
+ batch size multiplied by the number of processes).
41
+ """
42
+
43
+ def __init__(self, scheduler, optimizers, step_with_optimizer: bool = True, split_batches: bool = False):
44
+ self.scheduler = scheduler
45
+ self.optimizers = optimizers if isinstance(optimizers, (list, tuple)) else [optimizers]
46
+ self.split_batches = split_batches
47
+ self.step_with_optimizer = step_with_optimizer
48
+ self.gradient_state = GradientState()
49
+
50
+ def step(self, *args, **kwargs):
51
+ if not self.step_with_optimizer:
52
+ # No link between scheduler and optimizer -> just step
53
+ self.scheduler.step(*args, **kwargs)
54
+ return
55
+
56
+ # Otherwise, first make sure the optimizer was stepped.
57
+ if not self.gradient_state.sync_gradients:
58
+ if self.gradient_state.adjust_scheduler:
59
+ self.scheduler._step_count += 1
60
+ return
61
+
62
+ for opt in self.optimizers:
63
+ if opt.step_was_skipped:
64
+ return
65
+ if self.split_batches:
66
+ # Split batches -> the training dataloader batch size is not changed so one step per training step
67
+ self.scheduler.step(*args, **kwargs)
68
+ else:
69
+ # Otherwise the training dataloader batch size was multiplied by `num_processes`, so we need to do
70
+ # num_processes steps per training step
71
+ num_processes = AcceleratorState().num_processes
72
+ for _ in range(num_processes):
73
+ # Special case when using OneCycle and `drop_last` was not used
74
+ if hasattr(self.scheduler, "total_steps"):
75
+ if self.scheduler._step_count <= self.scheduler.total_steps:
76
+ self.scheduler.step(*args, **kwargs)
77
+ else:
78
+ self.scheduler.step(*args, **kwargs)
79
+
80
+ # Passthroughs
81
+ def get_last_lr(self):
82
+ return self.scheduler.get_lr()
83
+
84
+ def state_dict(self):
85
+ return self.scheduler.state_dict()
86
+
87
+ def load_state_dict(self, state_dict):
88
+ self.scheduler.set_state_dict(state_dict)
89
+
90
+ set_state_dict = load_state_dict
91
+
92
+ def get_lr(self):
93
+ return self.scheduler.get_lr()
94
+
95
+ def print_lr(self, *args, **kwargs):
96
+ return self.scheduler.print_lr(*args, **kwargs)
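A hedged sketch pairing the scheduler wrapper with the optimizer wrapper from the previous file; `scheduler.step()` becomes a no-op whenever the optimizer step was skipped (for example after an AMP overflow). Constructing `Accelerator()` first is assumed to initialize the shared state:

import paddle
from ppdiffusers.accelerate import Accelerator
from ppdiffusers.accelerate.optimizer import AcceleratedOptimizer
from ppdiffusers.accelerate.scheduler import AcceleratedScheduler

accelerator = Accelerator()
model = paddle.nn.Linear(8, 8)
lr_sched = paddle.optimizer.lr.StepDecay(learning_rate=1e-4, step_size=10)
optimizer = AcceleratedOptimizer(paddle.optimizer.AdamW(learning_rate=lr_sched, parameters=model.parameters()))
scheduler = AcceleratedScheduler(lr_sched, optimizer, step_with_optimizer=True, split_batches=True)

model(paddle.randn([4, 8])).mean().backward()
optimizer.step()
scheduler.step()                 # silently skipped if optimizer.step_was_skipped is True
print(scheduler.get_lr())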
PaddleMIX/ppdiffusers/ppdiffusers/accelerate/tracking.py ADDED
@@ -0,0 +1,1103 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Expectation:
16
+ # Provide a project dir name, then each type of logger gets stored in project/{`logging_dir`}
17
+
18
+ import json
19
+ import os
20
+ import time
21
+ from functools import wraps
22
+ from typing import Any, Dict, List, Optional, Union
23
+
24
+ import yaml
25
+
26
+ from .logging import get_logger
27
+ from .state import PartialState
28
+ from .utils import (
29
+ LoggerType,
30
+ is_aim_available,
31
+ is_clearml_available,
32
+ is_comet_ml_available,
33
+ is_dvclive_available,
34
+ is_mlflow_available,
35
+ is_tensorboard_available,
36
+ is_visualdl_available,
37
+ is_wandb_available,
38
+ listify,
39
+ )
40
+
41
+ _available_trackers = []
42
+
43
+ if is_tensorboard_available():
44
+ _available_trackers.append(LoggerType.TENSORBOARD)
45
+
46
+ if is_wandb_available():
47
+ _available_trackers.append(LoggerType.WANDB)
48
+
49
+ if is_comet_ml_available():
50
+ _available_trackers.append(LoggerType.COMETML)
51
+
52
+ if is_aim_available():
53
+ _available_trackers.append(LoggerType.AIM)
54
+
55
+ if is_mlflow_available():
56
+ _available_trackers.append(LoggerType.MLFLOW)
57
+
58
+ if is_clearml_available():
59
+ _available_trackers.append(LoggerType.CLEARML)
60
+
61
+ if is_dvclive_available():
62
+ _available_trackers.append(LoggerType.DVCLIVE)
63
+
64
+ if is_visualdl_available():
65
+ _available_trackers.append(LoggerType.VISUALDL)
66
+
67
+ logger = get_logger(__name__)
68
+
69
+
70
+ def on_main_process(function):
71
+ """
72
+ Decorator to selectively run the decorated function on the main process only based on the `main_process_only`
73
+ attribute in a class.
74
+
75
+ Checks at function execution rather than initialization time, not triggering the initialization of the
76
+ `PartialState`.
77
+ """
78
+
79
+ @wraps(function)
80
+ def execute_on_main_process(self, *args, **kwargs):
81
+ if getattr(self, "main_process_only", False):
82
+ return PartialState().on_main_process(function)(self, *args, **kwargs)
83
+ else:
84
+ return function(self, *args, **kwargs)
85
+
86
+ return execute_on_main_process
87
+
88
+
89
+ def get_available_trackers():
90
+ "Returns a list of all supported available trackers in the system"
91
+ return _available_trackers
92
+
93
+
94
+ class GeneralTracker:
95
+ """
96
+ A base Tracker class to be used for all logging integration implementations.
97
+
98
+ Each function should take in `**kwargs` that will automatically be passed in from a base dictionary provided to
99
+ [`Accelerator`].
100
+
101
+ Should implement `name`, `requires_logging_directory`, and `tracker` properties such that:
102
+
103
+ `name` (`str`): String representation of the tracker class name, such as "TensorBoard" `requires_logging_directory`
104
+ (`bool`): Whether the logger requires a directory to store its logs. `tracker` (`object`): Should return internal
105
+ tracking mechanism used by a tracker class (such as the `run` for wandb)
106
+
107
+ Implementations can also include a `main_process_only` (`bool`) attribute to toggle whether relevant logging, init, and
108
+ other functions should occur on the main process or across all processes (by default will use `True`)
109
+ """
110
+
111
+ main_process_only = True
112
+
113
+ def __init__(self, _blank=False):
114
+ if not _blank:
115
+ err = ""
116
+ if not hasattr(self, "name"):
117
+ err += "`name`"
118
+ if not hasattr(self, "requires_logging_directory"):
119
+ if len(err) > 0:
120
+ err += ", "
121
+ err += "`requires_logging_directory`"
122
+
123
+ # as tracker is a @property that relies on post-init
124
+ if "tracker" not in dir(self):
125
+ if len(err) > 0:
126
+ err += ", "
127
+ err += "`tracker`"
128
+ if len(err) > 0:
129
+ raise NotImplementedError(
130
+ f"The implementation for this tracker class is missing the following "
131
+ f"required attributes. Please define them in the class definition: "
132
+ f"{err}"
133
+ )
134
+
135
+ def store_init_configuration(self, values: dict):
136
+ """
137
+ Logs `values` as hyperparameters for the run. Implementations should use the experiment configuration
138
+ functionality of a tracking API.
139
+
140
+ Args:
141
+ values (Dictionary `str` to `bool`, `str`, `float` or `int`):
142
+ Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
143
+ `str`, `float`, `int`, or `None`.
144
+ """
145
+ pass
146
+
147
+ def log(self, values: dict, step: Optional[int], **kwargs):
148
+ """
149
+ Logs `values` to the current run. Base `log` implementations of a tracking API should go in here, along with
150
+ special behavior for the `step` parameter.
151
+
152
+ Args:
153
+ values (Dictionary `str` to `str`, `float`, or `int`):
154
+ Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
155
+ step (`int`, *optional*):
156
+ The run step. If included, the log will be affiliated with this step.
157
+ """
158
+ pass
159
+
160
+ def finish(self):
161
+ """
162
+ Should run any finalizing functions within the tracking API. If the API should not have one, just don't
163
+ overwrite that method.
164
+ """
165
+ pass
166
+
167
+
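A minimal custom tracker sketch based on the contract described in the `GeneralTracker` docstring above; the JSON-lines format and class name are arbitrary choices for illustration:

import json
import os
from ppdiffusers.accelerate.tracking import GeneralTracker

class JSONLinesTracker(GeneralTracker):
    name = "jsonl"
    requires_logging_directory = True

    def __init__(self, run_name: str, logging_dir: str):
        super().__init__()
        os.makedirs(logging_dir, exist_ok=True)
        self._path = os.path.join(logging_dir, f"{run_name}.jsonl")

    @property
    def tracker(self):
        return self._path

    def store_init_configuration(self, values: dict):
        self.log({"hparams": values}, step=None)

    def log(self, values: dict, step=None, **kwargs):
        with open(self._path, "a") as f:
            f.write(json.dumps({"step": step, **values}) + "\n")

An instance of such a tracker can be passed in `log_with` alongside the built-in tracker names, since `filter_trackers` below accepts `GeneralTracker` subclasses directly.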
168
+ class TensorBoardTracker(GeneralTracker):
169
+ """
170
+ A `Tracker` class that supports `tensorboard`. Should be initialized at the start of your script.
171
+
172
+ Args:
173
+ run_name (`str`):
174
+ The name of the experiment run
175
+ logging_dir (`str`, `os.PathLike`):
176
+ Location for TensorBoard logs to be stored.
177
+ kwargs:
178
+ Additional key word arguments passed along to the `tensorboard.SummaryWriter.__init__` method.
179
+ """
180
+
181
+ name = "tensorboard"
182
+ requires_logging_directory = True
183
+
184
+ @on_main_process
185
+ def __init__(self, run_name: str, logging_dir: Union[str, os.PathLike], **kwargs):
186
+ try:
187
+ from torch.utils import tensorboard
188
+ except ModuleNotFoundError:
189
+ import tensorboardX as tensorboard
190
+ super().__init__()
191
+ self.run_name = run_name
192
+ self.logging_dir = os.path.join(logging_dir, run_name)
193
+ self.writer = tensorboard.SummaryWriter(self.logging_dir, **kwargs)
194
+ logger.debug(f"Initialized TensorBoard project {self.run_name} logging to {self.logging_dir}")
195
+ logger.debug(
196
+ "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
197
+ )
198
+
199
+ @property
200
+ def tracker(self):
201
+ return self.writer
202
+
203
+ @on_main_process
204
+ def store_init_configuration(self, values: dict):
205
+ """
206
+ Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. Stores the
207
+ hyperparameters in a yaml file for future use.
208
+
209
+ Args:
210
+ values (Dictionary `str` to `bool`, `str`, `float` or `int`):
211
+ Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
212
+ `str`, `float`, `int`, or `None`.
213
+ """
214
+ self.writer.add_hparams(values, metric_dict={})
215
+ self.writer.flush()
216
+ project_run_name = time.time()
217
+ dir_name = os.path.join(self.logging_dir, str(project_run_name))
218
+ os.makedirs(dir_name, exist_ok=True)
219
+ with open(os.path.join(dir_name, "hparams.yml"), "w") as outfile:
220
+ try:
221
+ yaml.dump(values, outfile)
222
+ except yaml.representer.RepresenterError:
223
+ logger.error("Serialization to store hyperparameters failed")
224
+ raise
225
+ logger.debug("Stored initial configuration hyperparameters to TensorBoard and hparams yaml file")
226
+
227
+ @on_main_process
228
+ def log(self, values: dict, step: Optional[int] = None, **kwargs):
229
+ """
230
+ Logs `values` to the current run.
231
+
232
+ Args:
233
+ values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
234
+ Values to be logged as key-value pairs. The values need to have type `str`, `float`, `int` or `dict` of
235
+ `str` to `float`/`int`.
236
+ step (`int`, *optional*):
237
+ The run step. If included, the log will be affiliated with this step.
238
+ kwargs:
239
+ Additional key word arguments passed along to either `SummaryWriter.add_scalar`,
240
+ `SummaryWriter.add_text`, or `SummaryWriter.add_scalars` method based on the contents of `values`.
241
+ """
242
+ values = listify(values)
243
+ for k, v in values.items():
244
+ if isinstance(v, (int, float)):
245
+ self.writer.add_scalar(k, v, global_step=step, **kwargs)
246
+ elif isinstance(v, str):
247
+ self.writer.add_text(k, v, global_step=step, **kwargs)
248
+ elif isinstance(v, dict):
249
+ self.writer.add_scalars(k, v, global_step=step, **kwargs)
250
+ self.writer.flush()
251
+ logger.debug("Successfully logged to TensorBoard")
252
+
253
+ @on_main_process
254
+ def log_images(self, values: dict, step: Optional[int], **kwargs):
255
+ """
256
+ Logs `images` to the current run.
257
+
258
+ Args:
259
+ values (Dictionary `str` to `List` of `np.ndarray` or `PIL.Image`):
260
+ Values to be logged as key-value pairs. The values need to have type `List` of `np.ndarray` or `PIL.Image`.
261
+ step (`int`, *optional*):
262
+ The run step. If included, the log will be affiliated with this step.
263
+ kwargs:
264
+ Additional key word arguments passed along to the `SummaryWriter.add_images` method.
265
+ """
266
+ for k, v in values.items():
267
+ self.writer.add_images(k, v, global_step=step, **kwargs)
268
+ logger.debug("Successfully logged images to TensorBoard")
269
+
270
+ @on_main_process
271
+ def finish(self):
272
+ """
273
+ Closes `TensorBoard` writer
274
+ """
275
+ self.writer.close()
276
+ logger.debug("TensorBoard writer closed")
277
+
278
+
279
+ class VisualdlTracker(GeneralTracker):
280
+ """
281
+ A `Tracker` class that supports `visualdl`. Should be initialized at the start of your script.
282
+
283
+ Args:
284
+ run_name (`str`):
285
+ The name of the experiment run
286
+ logging_dir (`str`, `os.PathLike`):
287
+ Location for Visualdl logs to be stored.
288
+ kwargs:
289
+ Additional key word arguments passed along to the `visualdl.LogWriter.__init__` method.
290
+ """
291
+
292
+ name = "visualdl"
293
+ requires_logging_directory = True
294
+
295
+ @on_main_process
296
+ def __init__(self, run_name: str, logging_dir: Union[str, os.PathLike], **kwargs):
297
+ super().__init__()
298
+ from visualdl import LogWriter
299
+
300
+ self.run_name = run_name
301
+ self.logging_dir = os.path.join(logging_dir, run_name)
302
+ self.writer = LogWriter(self.logging_dir, **kwargs)
303
+ logger.debug(f"Initialized VisualDL project {self.run_name} logging to {self.logging_dir}")
304
+ logger.debug(
305
+ "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
306
+ )
307
+
308
+ @property
309
+ def tracker(self):
310
+ return self.writer
311
+
312
+ @on_main_process
313
+ def store_init_configuration(self, values: dict):
314
+ """
315
+ Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. Stores the
316
+ hyperparameters in a yaml file for future use.
317
+
318
+ Args:
319
+ values (Dictionary `str` to `bool`, `str`, `float` or `int`):
320
+ Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
321
+ `str`, `float`, `int`, or `None`.
322
+ """
323
+ self.writer.add_hparams(hparams_dict=values, metrics_list=[])
324
+ self.writer.flush()
325
+ project_run_name = time.time()
326
+ dir_name = os.path.join(self.logging_dir, str(project_run_name))
327
+ os.makedirs(dir_name, exist_ok=True)
328
+ with open(os.path.join(dir_name, "hparams.yml"), "w") as outfile:
329
+ try:
330
+ yaml.dump(values, outfile)
331
+ except yaml.representer.RepresenterError:
332
+ logger.error("Serialization to store hyperparameters failed")
333
+ raise
334
+ logger.debug("Stored initial configuration hyperparameters to VisualDL and hparams yaml file")
335
+
336
+ @on_main_process
337
+ def log(self, values: dict, step: Optional[int] = None, **kwargs):
338
+ """
339
+ Logs `values` to the current run.
340
+
341
+ Args:
342
+ values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
343
+ Values to be logged as key-value pairs. The values need to have type `str`, `float`, `int` or `dict` of
344
+ `str` to `float`/`int`.
345
+ step (`int`, *optional*):
346
+ The run step. If included, the log will be affiliated with this step.
347
+ kwargs:
348
+ Additional key word arguments passed along to either `LogWriter.add_scalar`,
349
+ `LogWriter.add_text`, or `LogWriter.add_scalars` method based on the contents of `values`.
350
+ """
351
+ values = listify(values)
352
+ for k, v in values.items():
353
+ if isinstance(v, (int, float)):
354
+ self.writer.add_scalar(k, v, step=step, **kwargs)
355
+ elif isinstance(v, str):
356
+ self.writer.add_text(k, v, step=step, **kwargs)
357
+ elif isinstance(v, dict):
358
+ self.writer.add_scalars(k, v, step=step, **kwargs)
359
+ self.writer.flush()
360
+ logger.debug("Successfully logged to Visualdl")
361
+
362
+ @on_main_process
363
+ def log_images(self, values: dict, step: Optional[int], **kwargs):
364
+ """
365
+ Logs `images` to the current run.
366
+
367
+ Args:
368
+ values (Dictionary `str` to `List` of `np.ndarray` or `PIL.Image`):
369
+ Values to be logged as key-value pairs. The values need to have type `List` of `np.ndarray` or `PIL.Image`.
370
+ step (`int`, *optional*):
371
+ The run step. If included, the log will be affiliated with this step.
372
+ kwargs:
373
+ Additional key word arguments passed along to the `LogWriter.add_image` method.
374
+ """
375
+ for k, v in values.items():
376
+ self.writer.add_image(k, v, step=step, **kwargs)
377
+ logger.debug("Successfully logged images to Visualdl")
378
+
379
+ @on_main_process
380
+ def finish(self):
381
+ """
382
+ Closes `VisualDL` writer
383
+ """
384
+ self.writer.close()
385
+ logger.debug("VisualDL writer closed")
386
+
387
+
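A hedged sketch of driving the VisualDL tracker above through the higher-level tracking API, assuming the vendored `Accelerator` exposes the same `init_trackers`/`log`/`end_training` methods as upstream accelerate:

from ppdiffusers.accelerate import Accelerator

accelerator = Accelerator(log_with="visualdl", project_dir="./vdl_logs")
accelerator.init_trackers("my_run", config={"lr": 1e-4, "batch_size": 16})
for step in range(3):
    accelerator.log({"train_loss": 1.0 / (step + 1)}, step=step)
accelerator.end_training()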
388
+ class WandBTracker(GeneralTracker):
389
+ """
390
+ A `Tracker` class that supports `wandb`. Should be initialized at the start of your script.
391
+
392
+ Args:
393
+ run_name (`str`):
394
+ The name of the experiment run.
395
+ kwargs:
396
+ Additional key word arguments passed along to the `wandb.init` method.
397
+ """
398
+
399
+ name = "wandb"
400
+ requires_logging_directory = False
401
+ main_process_only = False
402
+
403
+ @on_main_process
404
+ def __init__(self, run_name: str, **kwargs):
405
+ super().__init__()
406
+ self.run_name = run_name
407
+
408
+ import wandb
409
+
410
+ self.run = wandb.init(project=self.run_name, **kwargs)
411
+ logger.debug(f"Initialized WandB project {self.run_name}")
412
+ logger.debug(
413
+ "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
414
+ )
415
+
416
+ @property
417
+ def tracker(self):
418
+ return self.run
419
+
420
+ @on_main_process
421
+ def store_init_configuration(self, values: dict):
422
+ """
423
+ Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
424
+
425
+ Args:
426
+ values (Dictionary `str` to `bool`, `str`, `float` or `int`):
427
+ Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
428
+ `str`, `float`, `int`, or `None`.
429
+ """
430
+ import wandb
431
+
432
+ wandb.config.update(values, allow_val_change=True)
433
+ logger.debug("Stored initial configuration hyperparameters to WandB")
434
+
435
+ @on_main_process
436
+ def log(self, values: dict, step: Optional[int] = None, **kwargs):
437
+ """
438
+ Logs `values` to the current run.
439
+
440
+ Args:
441
+ values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
442
+ Values to be logged as key-value pairs. The values need to have type `str`, `float`, `int` or `dict` of
443
+ `str` to `float`/`int`.
444
+ step (`int`, *optional*):
445
+ The run step. If included, the log will be affiliated with this step.
446
+ kwargs:
447
+ Additional key word arguments passed along to the `wandb.log` method.
448
+ """
449
+ self.run.log(values, step=step, **kwargs)
450
+ logger.debug("Successfully logged to WandB")
451
+
452
+ @on_main_process
453
+ def log_images(self, values: dict, step: Optional[int] = None, **kwargs):
454
+ """
455
+ Logs `images` to the current run.
456
+
457
+ Args:
458
+ values (Dictionary `str` to `List` of `np.ndarray` or `PIL.Image`):
459
+ Values to be logged as key-value pairs. The values need to have type `List` of `np.ndarray` or `PIL.Image`.
460
+ step (`int`, *optional*):
461
+ The run step. If included, the log will be affiliated with this step.
462
+ kwargs:
463
+ Additional key word arguments passed along to the `wandb.log` method.
464
+ """
465
+ import wandb
466
+
467
+ for k, v in values.items():
468
+ self.log({k: [wandb.Image(image) for image in v]}, step=step, **kwargs)
469
+ logger.debug("Successfully logged images to WandB")
470
+
471
+ @on_main_process
472
+ def log_table(
473
+ self,
474
+ table_name: str,
475
+ columns: List[str] = None,
476
+ data: List[List[Any]] = None,
477
+ dataframe: Any = None,
478
+ step: Optional[int] = None,
479
+ **kwargs,
480
+ ):
481
+ """
482
+ Log a Table containing any object type (text, image, audio, video, molecule, html, etc). Can be defined either
483
+ with `columns` and `data` or with `dataframe`.
484
+
485
+ Args:
486
+ table_name (`str`):
487
+ The name to give to the logged table on the wandb workspace
488
+ columns (list of `str`, *optional*):
489
+ The name of the columns on the table
490
+ data (List of List of Any data type, *optional*):
491
+ The data to be logged in the table
492
+ dataframe (Any data type, *optional*):
493
+ The data to be logged in the table
494
+ step (`int`, *optional*):
495
+ The run step. If included, the log will be affiliated with this step.
496
+ """
497
+ import wandb
498
+
499
+ values = {table_name: wandb.Table(columns=columns, data=data, dataframe=dataframe)}
500
+ self.log(values, step=step, **kwargs)
501
+
502
+ @on_main_process
503
+ def finish(self):
504
+ """
505
+ Closes `wandb` writer
506
+ """
507
+ self.run.finish()
508
+ logger.debug("WandB run closed")
509
+
510
+
511
+ class CometMLTracker(GeneralTracker):
512
+ """
513
+ A `Tracker` class that supports `comet_ml`. Should be initialized at the start of your script.
514
+
515
+ API keys must be stored in a Comet config file.
516
+
517
+ Args:
518
+ run_name (`str`):
519
+ The name of the experiment run.
520
+ kwargs:
521
+ Additional key word arguments passed along to the `Experiment.__init__` method.
522
+ """
523
+
524
+ name = "comet_ml"
525
+ requires_logging_directory = False
526
+
527
+ @on_main_process
528
+ def __init__(self, run_name: str, **kwargs):
529
+ super().__init__()
530
+ self.run_name = run_name
531
+
532
+ from comet_ml import Experiment
533
+
534
+ self.writer = Experiment(project_name=run_name, **kwargs)
535
+ logger.debug(f"Initialized CometML project {self.run_name}")
536
+ logger.debug(
537
+ "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
538
+ )
539
+
540
+ @property
541
+ def tracker(self):
542
+ return self.writer
543
+
544
+ @on_main_process
545
+ def store_init_configuration(self, values: dict):
546
+ """
547
+ Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
548
+
549
+ Args:
550
+ values (Dictionary `str` to `bool`, `str`, `float` or `int`):
551
+ Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
552
+ `str`, `float`, `int`, or `None`.
553
+ """
554
+ self.writer.log_parameters(values)
555
+ logger.debug("Stored initial configuration hyperparameters to CometML")
556
+
557
+ @on_main_process
558
+ def log(self, values: dict, step: Optional[int] = None, **kwargs):
559
+ """
560
+ Logs `values` to the current run.
561
+
562
+ Args:
563
+ values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
564
+ Values to be logged as key-value pairs. The values need to have type `str`, `float`, `int` or `dict` of
565
+ `str` to `float`/`int`.
566
+ step (`int`, *optional*):
567
+ The run step. If included, the log will be affiliated with this step.
568
+ kwargs:
569
+ Additional key word arguments passed along to either `Experiment.log_metric`, `Experiment.log_other`,
570
+ or `Experiment.log_metrics` method based on the contents of `values`.
571
+ """
572
+ if step is not None:
573
+ self.writer.set_step(step)
574
+ for k, v in values.items():
575
+ if isinstance(v, (int, float)):
576
+ self.writer.log_metric(k, v, step=step, **kwargs)
577
+ elif isinstance(v, str):
578
+ self.writer.log_other(k, v, **kwargs)
579
+ elif isinstance(v, dict):
580
+ self.writer.log_metrics(v, step=step, **kwargs)
581
+ logger.debug("Successfully logged to CometML")
582
+
583
+ @on_main_process
584
+ def finish(self):
585
+ """
586
+ Closes `comet-ml` writer
587
+ """
588
+ self.writer.end()
589
+ logger.debug("CometML run closed")
590
+
591
+
592
+ class AimTracker(GeneralTracker):
593
+ """
594
+ A `Tracker` class that supports `aim`. Should be initialized at the start of your script.
595
+
596
+ Args:
597
+ run_name (`str`):
598
+ The name of the experiment run.
599
+ kwargs:
600
+ Additional key word arguments passed along to the `Run.__init__` method.
601
+ """
602
+
603
+ name = "aim"
604
+ requires_logging_directory = True
605
+
606
+ @on_main_process
607
+ def __init__(self, run_name: str, logging_dir: Optional[Union[str, os.PathLike]] = ".", **kwargs):
608
+ self.run_name = run_name
609
+
610
+ from aim import Run
611
+
612
+ self.writer = Run(repo=logging_dir, **kwargs)
613
+ self.writer.name = self.run_name
614
+ logger.debug(f"Initialized Aim project {self.run_name}")
615
+ logger.debug(
616
+ "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
617
+ )
618
+
619
+ @property
620
+ def tracker(self):
621
+ return self.writer
622
+
623
+ @on_main_process
624
+ def store_init_configuration(self, values: dict):
625
+ """
626
+ Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
627
+
628
+ Args:
629
+ values (`dict`):
630
+ Values to be stored as initial hyperparameters as key-value pairs.
631
+ """
632
+ self.writer["hparams"] = values
633
+
634
+ @on_main_process
635
+ def log(self, values: dict, step: Optional[int], **kwargs):
636
+ """
637
+ Logs `values` to the current run.
638
+
639
+ Args:
640
+ values (`dict`):
641
+ Values to be logged as key-value pairs.
642
+ step (`int`, *optional*):
643
+ The run step. If included, the log will be affiliated with this step.
644
+ kwargs:
645
+ Additional key word arguments passed along to the `Run.track` method.
646
+ """
647
+ # Note: replace this with the dictionary support when merged
648
+ for key, value in values.items():
649
+ self.writer.track(value, name=key, step=step, **kwargs)
650
+
651
+ @on_main_process
652
+ def finish(self):
653
+ """
654
+ Closes `aim` writer
655
+ """
656
+ self.writer.close()
657
+
658
+
659
+ class MLflowTracker(GeneralTracker):
660
+ """
661
+ A `Tracker` class that supports `mlflow`. Should be initialized at the start of your script.
662
+
663
+ Args:
664
+ experiment_name (`str`, *optional*):
665
+ Name of the experiment. Environment variable MLFLOW_EXPERIMENT_NAME has priority over this argument.
666
+ logging_dir (`str` or `os.PathLike`, defaults to `"."`):
667
+ Location for mlflow logs to be stored.
668
+ run_id (`str`, *optional*):
669
+ If specified, get the run with the specified UUID and log parameters and metrics under that run. The run’s
670
+ end time is unset and its status is set to running, but the run’s other attributes (source_version,
671
+ source_type, etc.) are not changed. Environment variable MLFLOW_RUN_ID has priority over this argument.
672
+ tags (`Dict[str, str]`, *optional*):
673
+ An optional `dict` of `str` keys and values, or a `str` dump from a `dict`, to set as tags on the run. If a
674
+ run is being resumed, these tags are set on the resumed run. If a new run is being created, these tags are
675
+ set on the new run. Environment variable MLFLOW_TAGS has priority over this argument.
676
+ nested_run (`bool`, *optional*, defaults to `False`):
677
+ Controls whether run is nested in parent run. True creates a nested run. Environment variable
678
+ MLFLOW_NESTED_RUN has priority over this argument.
679
+ run_name (`str`, *optional*):
680
+ Name of new run (stored as a mlflow.runName tag). Used only when `run_id` is unspecified.
681
+ description (`str`, *optional*):
682
+ An optional string that populates the description box of the run. If a run is being resumed, the
683
+ description is set on the resumed run. If a new run is being created, the description is set on the new
684
+ run.
685
+ """
686
+
687
+ name = "mlflow"
688
+ requires_logging_directory = False
689
+
690
+ @on_main_process
691
+ def __init__(
692
+ self,
693
+ experiment_name: str = None,
694
+ logging_dir: Optional[Union[str, os.PathLike]] = None,
695
+ run_id: Optional[str] = None,
696
+ tags: Optional[Union[Dict[str, Any], str]] = None,
697
+ nested_run: Optional[bool] = False,
698
+ run_name: Optional[str] = None,
699
+ description: Optional[str] = None,
700
+ ):
701
+ experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", experiment_name)
702
+ run_id = os.getenv("MLFLOW_RUN_ID", run_id)
703
+ tags = os.getenv("MLFLOW_TAGS", tags)
704
+ if isinstance(tags, str):
705
+ tags = json.loads(tags)
706
+
707
+ nested_run = os.getenv("MLFLOW_NESTED_RUN", nested_run)
708
+
709
+ import mlflow
710
+
711
+ exps = mlflow.search_experiments(filter_string=f"name = '{experiment_name}'")
712
+ if len(exps) > 0:
713
+ if len(exps) > 1:
714
+ logger.warning("Multiple experiments with the same name found. Using first one.")
715
+ experiment_id = exps[0].experiment_id
716
+ else:
717
+ experiment_id = mlflow.create_experiment(
718
+ name=experiment_name,
719
+ artifact_location=logging_dir,
720
+ tags=tags,
721
+ )
722
+
723
+ self.active_run = mlflow.start_run(
724
+ run_id=run_id,
725
+ experiment_id=experiment_id,
726
+ run_name=run_name,
727
+ nested=nested_run,
728
+ tags=tags,
729
+ description=description,
730
+ )
731
+
732
+ logger.debug(f"Initialized mlflow experiment {experiment_name}")
733
+ logger.debug(
734
+ "Make sure to log any initial configurations with `self.store_init_configuration` before training!"
735
+ )
736
+
737
+ @property
738
+ def tracker(self):
739
+ return self.active_run
740
+
741
+ @on_main_process
742
+ def store_init_configuration(self, values: dict):
743
+ """
744
+ Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
745
+
746
+ Args:
747
+ values (`dict`):
748
+ Values to be stored as initial hyperparameters as key-value pairs.
749
+ """
750
+ import mlflow
751
+
752
+ for name, value in list(values.items()):
753
+ # internally, all values are converted to str in MLflow
754
+ if len(str(value)) > mlflow.utils.validation.MAX_PARAM_VAL_LENGTH:
755
+ logger.warning_once(
756
+ f'Accelerate is attempting to log a value of "{value}" for key "{name}" as a parameter. MLflow\'s'
757
+ f" log_param() only accepts values no longer than {mlflow.utils.validation.MAX_PARAM_VAL_LENGTH} characters so we dropped this attribute."
758
+ )
759
+ del values[name]
760
+
761
+ values_list = list(values.items())
762
+
763
+ # MLflow cannot log more than 100 values in one go, so we have to split it
764
+ for i in range(0, len(values_list), mlflow.utils.validation.MAX_PARAMS_TAGS_PER_BATCH):
765
+ mlflow.log_params(dict(values_list[i : i + mlflow.utils.validation.MAX_PARAMS_TAGS_PER_BATCH]))
766
+
767
+ logger.debug("Stored initial configuration hyperparameters to MLflow")
768
+
769
+ @on_main_process
770
+ def log(self, values: dict, step: Optional[int]):
771
+ """
772
+ Logs `values` to the current run.
773
+
774
+ Args:
775
+ values (`dict`):
776
+ Values to be logged as key-value pairs.
777
+ step (`int`, *optional*):
778
+ The run step. If included, the log will be affiliated with this step.
779
+ """
780
+ metrics = {}
781
+ for k, v in values.items():
782
+ if isinstance(v, (int, float)):
783
+ metrics[k] = v
784
+ else:
785
+ logger.warning_once(
786
+ f'MLflowTracker is attempting to log a value of "{v}" of type {type(v)} for key "{k}" as a metric. '
787
+ "MLflow's log_metric() only accepts float and int types so we dropped this attribute."
788
+ )
789
+ import mlflow
790
+
791
+ mlflow.log_metrics(metrics, step=step)
792
+ logger.debug("Successfully logged to mlflow")
793
+
794
+ @on_main_process
795
+ def finish(self):
796
+ """
797
+ End the active MLflow run.
798
+ """
799
+ import mlflow
800
+
801
+ mlflow.end_run()
802
+
803
+
804
+ class ClearMLTracker(GeneralTracker):
805
+ """
806
+ A `Tracker` class that supports `clearml`. Should be initialized at the start of your script.
807
+
808
+ Args:
809
+ run_name (`str`, *optional*):
810
+ Name of the experiment. Environment variables `CLEARML_PROJECT` and `CLEARML_TASK` have priority over this
811
+ argument.
812
+ kwargs:
813
+ Kwargs passed along to the `Task.__init__` method.
814
+ """
815
+
816
+ name = "clearml"
817
+ requires_logging_directory = False
818
+
819
+ @on_main_process
820
+ def __init__(self, run_name: str = None, **kwargs):
821
+ from clearml import Task
822
+
823
+ current_task = Task.current_task()
824
+ self._initialized_externally = False
825
+ if current_task:
826
+ self._initialized_externally = True
827
+ self.task = current_task
828
+ return
829
+
830
+ kwargs.setdefault("project_name", os.environ.get("CLEARML_PROJECT", run_name))
831
+ kwargs.setdefault("task_name", os.environ.get("CLEARML_TASK", run_name))
832
+ self.task = Task.init(**kwargs)
833
+
834
+ @property
835
+ def tracker(self):
836
+ return self.task
837
+
838
+ @on_main_process
839
+ def store_init_configuration(self, values: dict):
840
+ """
841
+ Connect configuration dictionary to the Task object. Should be run at the beginning of your experiment.
842
+
843
+ Args:
844
+ values (`dict`):
845
+ Values to be stored as initial hyperparameters as key-value pairs.
846
+ """
847
+ return self.task.connect_configuration(values)
848
+
849
+ @on_main_process
850
+ def log(self, values: Dict[str, Union[int, float]], step: Optional[int] = None, **kwargs):
851
+ """
852
+ Logs `values` dictionary to the current run. The dictionary keys must be strings. The dictionary values must be
853
+ ints or floats
854
+
855
+ Args:
856
+ values (`Dict[str, Union[int, float]]`):
857
+ Values to be logged as key-value pairs. If the key starts with 'eval_'/'test_'/'train_', the value will
858
+ be reported under the 'eval'/'test'/'train' series and the respective prefix will be removed.
859
+ Otherwise, the value will be reported under the 'train' series, and no prefix will be removed.
860
+ step (`int`, *optional*):
861
+ If specified, the values will be reported as scalars, with the iteration number equal to `step`.
862
+ Otherwise they will be reported as single values.
863
+ kwargs:
864
+ Additional key word arguments passed along to the `clearml.Logger.report_single_value` or
865
+ `clearml.Logger.report_scalar` methods.
866
+ """
867
+ clearml_logger = self.task.get_logger()
868
+ for k, v in values.items():
869
+ if not isinstance(v, (int, float)):
870
+ logger.warning_once(
871
+ "Accelerator is attempting to log a value of "
872
+ f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
873
+ "This invocation of ClearML logger's report_scalar() "
874
+ "is incorrect so we dropped this attribute."
875
+ )
876
+ continue
877
+ if step is None:
878
+ clearml_logger.report_single_value(name=k, value=v, **kwargs)
879
+ continue
880
+ title, series = ClearMLTracker._get_title_series(k)
881
+ clearml_logger.report_scalar(title=title, series=series, value=v, iteration=step, **kwargs)
882
+
883
+ @on_main_process
884
+ def log_images(self, values: dict, step: Optional[int] = None, **kwargs):
885
+ """
886
+ Logs `images` to the current run.
887
+
888
+ Args:
889
+ values (`Dict[str, List[Union[np.ndarray, PIL.Image]]]`):
890
+ Values to be logged as key-value pairs. The values need to have type `List` of `np.ndarray` or `PIL.Image`.
891
+ step (`int`, *optional*):
892
+ The run step. If included, the log will be affiliated with this step.
893
+ kwargs:
894
+ Additional key word arguments passed along to the `clearml.Logger.report_image` method.
895
+ """
896
+ clearml_logger = self.task.get_logger()
897
+ for k, v in values.items():
898
+ title, series = ClearMLTracker._get_title_series(k)
899
+ clearml_logger.report_image(title=title, series=series, iteration=step, image=v, **kwargs)
900
+
901
+ @on_main_process
902
+ def log_table(
903
+ self,
904
+ table_name: str,
905
+ columns: List[str] = None,
906
+ data: List[List[Any]] = None,
907
+ dataframe: Any = None,
908
+ step: Optional[int] = None,
909
+ **kwargs,
910
+ ):
911
+ """
912
+ Log a Table to the task. Can be defined either with `columns` and `data` or with `dataframe`.
913
+
914
+ Args:
915
+ table_name (`str`):
916
+ The name of the table
917
+ columns (list of `str`, *optional*):
918
+ The name of the columns on the table
919
+ data (List of List of Any data type, *optional*):
920
+ The data to be logged in the table. If `columns` is not specified, then the first entry in data will be
921
+ the name of the columns of the table
922
+ dataframe (Any data type, *optional*):
923
+ The data to be logged in the table
924
+ step (`int`, *optional*):
925
+ The run step. If included, the log will be affiliated with this step.
926
+ kwargs:
927
+ Additional key word arguments passed along to the `clearml.Logger.report_table` method.
928
+ """
929
+ to_report = dataframe
930
+ if dataframe is None:
931
+ if data is None:
932
+ raise ValueError(
933
+ "`ClearMLTracker.log_table` requires that `data` be supplied if `dataframe` is `None`"
934
+ )
935
+ to_report = [columns] + data if columns else data
936
+ title, series = ClearMLTracker._get_title_series(table_name)
937
+ self.task.get_logger().report_table(title=title, series=series, table_plot=to_report, iteration=step, **kwargs)
938
+
939
+ @on_main_process
940
+ def finish(self):
941
+ """
942
+ Close the ClearML task. If the task was initialized externally (e.g. by manually calling `Task.init`), this
943
+ function is a noop
944
+ """
945
+ if self.task and not self._initialized_externally:
946
+ self.task.close()
947
+
948
+ @staticmethod
949
+ def _get_title_series(name):
950
+ for prefix in ["eval", "test", "train"]:
951
+ if name.startswith(prefix + "_"):
952
+ return name[len(prefix) + 1 :], prefix
953
+ return name, "train"
954
+
955
+
956
+ class DVCLiveTracker(GeneralTracker):
957
+ """
958
+ A `Tracker` class that supports `dvclive`. Should be initialized at the start of your script.
959
+
960
+ Args:
961
+ run_name (`str`, *optional*):
962
+ Ignored for dvclive. See `kwargs` instead.
963
+ kwargs:
964
+ Additional key word arguments passed along to [`dvclive.Live()`](https://dvc.org/doc/dvclive/live).
965
+
966
+ Example:
967
+
968
+ ```py
969
+ from accelerate import Accelerator
970
+
971
+ accelerator = Accelerator(log_with="dvclive")
972
+ accelerator.init_trackers(project_name="my_project", init_kwargs={"dvclive": {"dir": "my_directory"}})
973
+ ```
974
+ """
975
+
976
+ name = "dvclive"
977
+ requires_logging_directory = False
978
+
979
+ @on_main_process
980
+ def __init__(self, run_name: Optional[str] = None, live: Optional[Any] = None, **kwargs):
981
+ from dvclive import Live
982
+
983
+ super().__init__()
984
+ self.live = live if live is not None else Live(**kwargs)
985
+
986
+ @property
987
+ def tracker(self):
988
+ return self.live
989
+
990
+ @on_main_process
991
+ def store_init_configuration(self, values: dict):
992
+ """
993
+ Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. Stores the
994
+ hyperparameters in a yaml file for future use.
995
+
996
+ Args:
997
+ values (Dictionary `str` to `bool`, `str`, `float`, `int`, or a List or Dict of those types):
998
+ Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
999
+ `str`, `float`, or `int`.
1000
+ """
1001
+ self.live.log_params(values)
1002
+
1003
+ @on_main_process
1004
+ def log(self, values: dict, step: Optional[int] = None, **kwargs):
1005
+ """
1006
+ Logs `values` to the current run.
1007
+
1008
+ Args:
1009
+ values (Dictionary `str` to `str`, `float`, or `int`):
1010
+ Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
1011
+ step (`int`, *optional*):
1012
+ The run step. If included, the log will be affiliated with this step.
1013
+ kwargs:
1014
+ Additional key word arguments passed along to `dvclive.Live.log_metric()`.
1015
+ """
1016
+ from dvclive.plots import Metric
1017
+
1018
+ if step is not None:
1019
+ self.live.step = step
1020
+ for k, v in values.items():
1021
+ if Metric.could_log(v):
1022
+ self.live.log_metric(k, v, **kwargs)
1023
+ else:
1024
+ logger.warning_once(
1025
+ "Accelerator attempted to log a value of "
1026
+ f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
1027
+ "This invocation of DVCLive's Live.log_metric() "
1028
+ "is incorrect so we dropped this attribute."
1029
+ )
1030
+
1031
+ @on_main_process
1032
+ def finish(self):
1033
+ """
1034
+ Closes `dvclive.Live()`.
1035
+ """
1036
+ self.live.end()
1037
+
1038
+
1039
+ LOGGER_TYPE_TO_CLASS = {
1040
+ "aim": AimTracker,
1041
+ "comet_ml": CometMLTracker,
1042
+ "mlflow": MLflowTracker,
1043
+ "tensorboard": TensorBoardTracker,
1044
+ "wandb": WandBTracker,
1045
+ "clearml": ClearMLTracker,
1046
+ "dvclive": DVCLiveTracker,
1047
+ "visualdl": VisualdlTracker,
1048
+ }
1049
+
1050
+
1051
+ def filter_trackers(
1052
+ log_with: List[Union[str, LoggerType, GeneralTracker]], logging_dir: Union[str, os.PathLike] = None
1053
+ ):
1054
+ """
1055
+ Takes in a list of potential tracker types and checks that:
1056
+ - The tracker wanted is available in that environment
1057
+ - Filters out repeats of tracker types
1058
+ - If `all` is in `log_with`, will return all trackers in the environment
1059
+ - If a tracker requires a `logging_dir`, ensures that `logging_dir` is not `None`
1060
+
1061
+ Args:
1062
+ log_with (list of `str`, [`~utils.LoggerType`] or [`~tracking.GeneralTracker`], *optional*):
1063
+ A list of loggers to be setup for experiment tracking. Should be one or several of:
1064
+
1065
+ - `"all"`
1066
+ - `"tensorboard"`
1067
+ - `"wandb"`
1068
+ - `"comet_ml"`
1069
+ - `"mlflow"`
1070
+ - `"dvclive"`
1071
+ - `"visualdl"`
1072
+ If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can
1073
+ also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
1074
+ logging_dir (`str`, `os.PathLike`, *optional*):
1075
+ A path to a directory for storing logs of locally-compatible loggers.
1076
+ """
1077
+ loggers = []
1078
+ if log_with is not None:
1079
+ if not isinstance(log_with, (list, tuple)):
1080
+ log_with = [log_with]
1081
+ if "all" in log_with or LoggerType.ALL in log_with:
1082
+ loggers = [o for o in log_with if issubclass(type(o), GeneralTracker)] + get_available_trackers()
1083
+ else:
1084
+ for log_type in log_with:
1085
+ if log_type not in LoggerType and not issubclass(type(log_type), GeneralTracker):
1086
+ raise ValueError(f"Unsupported logging capability: {log_type}. Choose between {LoggerType.list()}")
1087
+ if issubclass(type(log_type), GeneralTracker):
1088
+ loggers.append(log_type)
1089
+ else:
1090
+ log_type = LoggerType(log_type)
1091
+ if log_type not in loggers:
1092
+ if log_type in get_available_trackers():
1093
+ tracker_init = LOGGER_TYPE_TO_CLASS[str(log_type)]
1094
+ if getattr(tracker_init, "requires_logging_directory"):
1095
+ if logging_dir is None:
1096
+ raise ValueError(
1097
+ f"Logging with `{log_type}` requires a `logging_dir` to be passed in."
1098
+ )
1099
+ loggers.append(log_type)
1100
+ else:
1101
+ logger.debug(f"Tried adding logger {log_type}, but package is unavailable in the system.")
1102
+
1103
+ return loggers
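`filter_trackers` only resolves names to `LoggerType` members; instantiating the matching tracker class from `LOGGER_TYPE_TO_CLASS` is left to the caller. A minimal usage sketch, assuming the module is importable as `ppdiffusers.accelerate.tracking` and that the corresponding tracking packages are installed:

```python
import os

from ppdiffusers.accelerate.tracking import LOGGER_TYPE_TO_CLASS, filter_trackers

# Directory-based trackers such as "tensorboard" and "visualdl" require a logging_dir.
logging_dir = os.path.join("output", "logs")

# Trackers whose packages are missing are silently dropped; unknown names raise a ValueError.
resolved = filter_trackers(["tensorboard", "visualdl", "wandb"], logging_dir=logging_dir)
print([LOGGER_TYPE_TO_CLASS[str(logger_type)].__name__ for logger_type in resolved])
```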
PaddleMIX/ppdiffusers/ppdiffusers/callbacks.py ADDED
@@ -0,0 +1,156 @@
1
+ from typing import Any, Dict, List
2
+
3
+ from .configuration_utils import ConfigMixin, register_to_config
4
+ from .utils import CONFIG_NAME
5
+
6
+
7
+ class PipelineCallback(ConfigMixin):
8
+ """
9
+ Base class for all the official callbacks used in a pipeline. This class provides a structure for implementing
10
+ custom callbacks and ensures that all callbacks have a consistent interface.
11
+
12
+ Please implement the following:
13
+ `tensor_inputs`: This should return a list of tensor inputs specific to your callback. You will only be able to
14
+ include variables listed in the
15
+ `._callback_tensor_inputs` attribute of your pipeline class.
16
+ `callback_fn`: This method defines the core functionality of your callback.
17
+ """
18
+
19
+ config_name = CONFIG_NAME
20
+
21
+ @register_to_config
22
+ def __init__(self, cutoff_step_ratio=1.0, cutoff_step_index=None):
23
+ super().__init__()
24
+
25
+ if (cutoff_step_ratio is None and cutoff_step_index is None) or (
26
+ cutoff_step_ratio is not None and cutoff_step_index is not None
27
+ ):
28
+ raise ValueError("Either cutoff_step_ratio or cutoff_step_index should be provided, not both or none.")
29
+
30
+ if cutoff_step_ratio is not None and (
31
+ not isinstance(cutoff_step_ratio, float) or not (0.0 <= cutoff_step_ratio <= 1.0)
32
+ ):
33
+ raise ValueError("cutoff_step_ratio must be a float between 0.0 and 1.0.")
34
+
35
+ @property
36
+ def tensor_inputs(self) -> List[str]:
37
+ raise NotImplementedError(f"You need to set the attribute `tensor_inputs` for {self.__class__}")
38
+
39
+ def callback_fn(self, pipeline, step_index, timesteps, callback_kwargs) -> Dict[str, Any]:
40
+ raise NotImplementedError(f"You need to implement the method `callback_fn` for {self.__class__}")
41
+
42
+ def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
43
+ return self.callback_fn(pipeline, step_index, timestep, callback_kwargs)
44
+
45
+
46
+ class MultiPipelineCallbacks:
47
+ """
48
+ This class is designed to handle multiple pipeline callbacks. It accepts a list of PipelineCallback objects and
49
+ provides a unified interface for calling all of them.
50
+ """
51
+
52
+ def __init__(self, callbacks: List[PipelineCallback]):
53
+ self.callbacks = callbacks
54
+
55
+ @property
56
+ def tensor_inputs(self) -> List[str]:
57
+ return [input for callback in self.callbacks for input in callback.tensor_inputs]
58
+
59
+ def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
60
+ """
61
+ Calls all the callbacks in order with the given arguments and returns the final callback_kwargs.
62
+ """
63
+ for callback in self.callbacks:
64
+ callback_kwargs = callback(pipeline, step_index, timestep, callback_kwargs)
65
+
66
+ return callback_kwargs
67
+
68
+
69
+ class SDCFGCutoffCallback(PipelineCallback):
70
+ """
71
+ Callback function for Stable Diffusion Pipelines. After a certain number of steps (set by `cutoff_step_ratio` or
72
+ `cutoff_step_index`), this callback will disable the CFG.
73
+
74
+ Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 after the cutoff step.
75
+ """
76
+
77
+ tensor_inputs = ["prompt_embeds"]
78
+
79
+ def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
80
+ cutoff_step_ratio = self.config.cutoff_step_ratio
81
+ cutoff_step_index = self.config.cutoff_step_index
82
+
83
+ # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
84
+ cutoff_step = (
85
+ cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio)
86
+ )
87
+
88
+ if step_index == cutoff_step:
89
+ prompt_embeds = callback_kwargs[self.tensor_inputs[0]]
90
+ prompt_embeds = prompt_embeds[-1:] # "-1" denotes the embeddings for conditional text tokens.
91
+
92
+ pipeline._guidance_scale = 0.0
93
+
94
+ callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
95
+ return callback_kwargs
96
+
97
+
98
+ class SDXLCFGCutoffCallback(PipelineCallback):
99
+ """
100
+ Callback function for Stable Diffusion XL Pipelines. After a certain number of steps (set by `cutoff_step_ratio` or
101
+ `cutoff_step_index`), this callback will disable the CFG.
102
+
103
+ Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 after the cutoff step.
104
+ """
105
+
106
+ tensor_inputs = ["prompt_embeds", "add_text_embeds", "add_time_ids"]
107
+
108
+ def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
109
+ cutoff_step_ratio = self.config.cutoff_step_ratio
110
+ cutoff_step_index = self.config.cutoff_step_index
111
+
112
+ # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
113
+ cutoff_step = (
114
+ cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio)
115
+ )
116
+
117
+ if step_index == cutoff_step:
118
+ prompt_embeds = callback_kwargs[self.tensor_inputs[0]]
119
+ prompt_embeds = prompt_embeds[-1:] # "-1" denotes the embeddings for conditional text tokens.
120
+
121
+ add_text_embeds = callback_kwargs[self.tensor_inputs[1]]
122
+ add_text_embeds = add_text_embeds[-1:] # "-1" denotes the embeddings for conditional pooled text tokens
123
+
124
+ add_time_ids = callback_kwargs[self.tensor_inputs[2]]
125
+ add_time_ids = add_time_ids[-1:] # "-1" denotes the embeddings for conditional added time vector
126
+
127
+ pipeline._guidance_scale = 0.0
128
+
129
+ callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
130
+ callback_kwargs[self.tensor_inputs[1]] = add_text_embeds
131
+ callback_kwargs[self.tensor_inputs[2]] = add_time_ids
132
+ return callback_kwargs
133
+
134
+
135
+ class IPAdapterScaleCutoffCallback(PipelineCallback):
136
+ """
137
+ Callback function for any pipeline that inherits from `IPAdapterMixin`. After a certain number of steps (set by
138
+ `cutoff_step_ratio` or `cutoff_step_index`), this callback will set the IP Adapter scale to `0.0`.
139
+
140
+ Note: This callback mutates the IP Adapter attention processors by setting the scale to 0.0 after the cutoff step.
141
+ """
142
+
143
+ tensor_inputs = []
144
+
145
+ def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
146
+ cutoff_step_ratio = self.config.cutoff_step_ratio
147
+ cutoff_step_index = self.config.cutoff_step_index
148
+
149
+ # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
150
+ cutoff_step = (
151
+ cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio)
152
+ )
153
+
154
+ if step_index == cutoff_step:
155
+ pipeline.set_ip_adapter_scale(0.0)
156
+ return callback_kwargs
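A minimal sketch of how these callbacks are wired into a pipeline call. The `callback_on_step_end` and `callback_on_step_end_tensor_inputs` arguments and the model id follow the upstream diffusers convention and are assumptions here; any ppdiffusers pipeline that exposes them should work the same way:

```python
from ppdiffusers import StableDiffusionPipeline
from ppdiffusers.callbacks import MultiPipelineCallbacks, SDCFGCutoffCallback

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# Disable classifier-free guidance after 40% of the denoising steps.
callbacks = MultiPipelineCallbacks([SDCFGCutoffCallback(cutoff_step_ratio=0.4)])

image = pipe(
    "a photo of an astronaut riding a horse",
    num_inference_steps=30,
    callback_on_step_end=callbacks,
    callback_on_step_end_tensor_inputs=callbacks.tensor_inputs,
).images[0]
```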
PaddleMIX/ppdiffusers/ppdiffusers/configuration_utils.py ADDED
@@ -0,0 +1,695 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ ConfigMixin base class and utilities."""
17
+ import functools
18
+ import importlib
19
+ import inspect
20
+ import json
21
+ import os
22
+ import re
23
+ from collections import OrderedDict
24
+ from pathlib import PosixPath
25
+ from typing import Any, Dict, Tuple, Union
26
+
27
+ from .utils.download_utils import SaveToAistudioMixin
28
+ from .utils.hub_utils import PushToHubMixin
29
+
30
+ try:
31
+ from omegaconf.listconfig import ListConfig
32
+
33
+ _omegaconf_available = True
34
+ except:
35
+ _omegaconf_available = False
36
+ import numpy as np
37
+ from huggingface_hub import create_repo
38
+
39
+ from .utils import (
40
+ DIFFUSERS_CACHE,
41
+ FROM_AISTUDIO,
42
+ FROM_HF_HUB,
43
+ PPDIFFUSERS_CACHE,
44
+ DummyObject,
45
+ bos_aistudio_hf_download,
46
+ deprecate,
47
+ extract_commit_hash,
48
+ http_user_agent,
49
+ logging,
50
+ )
51
+ from .version import VERSION as __version__
52
+
53
+ logger = logging.get_logger(__name__)
54
+
55
+ _re_configuration_file = re.compile(r"config\.(.*)\.json")
56
+
57
+
58
+ class FrozenDict(OrderedDict):
59
+ def __init__(self, *args, **kwargs):
60
+ super().__init__(*args, **kwargs)
61
+
62
+ for key, value in self.items():
63
+ setattr(self, key, value)
64
+
65
+ self.__frozen = True
66
+
67
+ def __delitem__(self, *args, **kwargs):
68
+ raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
69
+
70
+ def setdefault(self, *args, **kwargs):
71
+ raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
72
+
73
+ def pop(self, *args, **kwargs):
74
+ raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
75
+
76
+ def update(self, *args, **kwargs):
77
+ raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
78
+
79
+ def __setattr__(self, name, value):
80
+ if hasattr(self, "__frozen") and self.__frozen:
81
+ raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
82
+ super().__setattr__(name, value)
83
+
84
+ def __setitem__(self, name, value):
85
+ if hasattr(self, "__frozen") and self.__frozen:
86
+ raise Exception(f"You cannot use ``__setitem__`` on a {self.__class__.__name__} instance.")
87
+ super().__setitem__(name, value)
88
+
89
+
90
+ class ConfigMixin(PushToHubMixin, SaveToAistudioMixin):
91
+ r"""
92
+ Base class for all configuration classes. All configuration parameters are stored under `self.config`. Also
93
+ provides the [`~ConfigMixin.from_config`] and [`~ConfigMixin.save_config`] methods for loading, downloading, and
94
+ saving classes that inherit from [`ConfigMixin`].
95
+
96
+ Class attributes:
97
+ - **config_name** (`str`) -- A filename under which the config should be stored when calling
98
+ [`~ConfigMixin.save_config`] (should be overridden by parent class).
99
+ - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
100
+ overridden by subclass).
101
+ - **has_compatibles** (`bool`) -- Whether the class has compatible classes (should be overridden by subclass).
102
+ - **_deprecated_kwargs** (`List[str]`) -- Keyword arguments that are deprecated. Note that the `init` function
103
+ should only have a `kwargs` argument if at least one argument is deprecated (should be overridden by
104
+ subclass).
105
+ """
106
+
107
+ config_name = None
108
+ ignore_for_config = []
109
+ has_compatibles = False
110
+
111
+ _deprecated_kwargs = []
112
+
113
+ def register_to_config(self, **kwargs):
114
+ if self.config_name is None:
115
+ raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`")
116
+ # Special case for `kwargs` used in deprecation warning added to schedulers
117
+ # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument,
118
+ # or solve in a more general way.
119
+ kwargs.pop("kwargs", None)
120
+
121
+ if not hasattr(self, "_internal_dict"):
122
+ internal_dict = kwargs
123
+ else:
124
+ previous_dict = dict(self._internal_dict)
125
+ internal_dict = {**self._internal_dict, **kwargs}
126
+ logger.debug(f"Updating config from {previous_dict} to {internal_dict}")
127
+
128
+ self._internal_dict = FrozenDict(internal_dict)
129
+
130
+ def __getattr__(self, name: str) -> Any:
131
+ """The only reason we overwrite `getattr` here is to gracefully deprecate accessing
132
+ config attributes directly. See https://github.com/huggingface/diffusers/pull/3129
133
+
134
+ This function is mostly copied from PyTorch's __getattr__ overwrite:
135
+ https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
136
+ """
137
+
138
+ is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
139
+ is_attribute = name in self.__dict__
140
+
141
+ if is_in_config and not is_attribute:
142
+ deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'scheduler.config.{name}'."
143
+ deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)
144
+ return self._internal_dict[name]
145
+
146
+ raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
147
+
148
+ def save_config(
149
+ self,
150
+ save_directory: Union[str, os.PathLike],
151
+ push_to_hub: bool = False,
152
+ save_to_aistudio: bool = False,
153
+ to_diffusers: bool = False,
154
+ **kwargs,
155
+ ):
156
+ """
157
+ Save a configuration object to the directory specified in `save_directory` so that it can be reloaded using the
158
+ [`~ConfigMixin.from_config`] class method.
159
+
160
+ Args:
161
+ save_directory (`str` or `os.PathLike`):
162
+ Directory where the configuration JSON file is saved (will be created if it does not exist).
163
+ push_to_hub (`bool`, *optional*, defaults to `False`):
164
+ Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
165
+ repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
166
+ namespace).
167
+ kwargs (`Dict[str, Any]`, *optional*):
168
+ Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
169
+ """
170
+ if os.path.isfile(save_directory):
171
+ raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
172
+
173
+ os.makedirs(save_directory, exist_ok=True)
174
+
175
+ # If we save using the predefined names, we can load using `from_config`
176
+ output_config_file = os.path.join(save_directory, self.config_name)
177
+
178
+ self.to_json_file(output_config_file, to_diffusers=to_diffusers)
179
+ logger.info(f"Configuration saved in {output_config_file}")
180
+
181
+ commit_message = kwargs.pop("commit_message", None)
182
+ create_pr = kwargs.pop("create_pr", False)
183
+ token = kwargs.pop("token", None)
184
+ token_kwargs = {}
185
+ if token is not None:
186
+ token_kwargs["token"] = token
187
+ private = kwargs.pop("private", False)
188
+ exist_ok = kwargs.pop("exist_ok", True)
189
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
190
+ license = kwargs.pop("license", "creativeml-openrail-m")
191
+
192
+ if save_to_aistudio:
193
+ from aistudio_sdk.hub import create_repo as aistudio_create_repo
194
+
195
+ assert "/" in repo_id, "Please specify the repo id in format of `user_id/repo_name`"
196
+ res = aistudio_create_repo(repo_id=repo_id, private=private, license=license, **token_kwargs)
197
+ if "error_code" in res:
198
+ if res["error_code"] == 10003 and exist_ok:
199
+ logger.info(
200
+ f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False"
201
+ )
202
+ else:
203
+ logger.error(
204
+ f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}"
205
+ )
206
+ else:
207
+ logger.info(f"Successfully created repo {repo_id}")
208
+ self._upload_folder_aistudio(
209
+ save_directory,
210
+ repo_id,
211
+ commit_message=commit_message,
212
+ **token_kwargs,
213
+ )
214
+
215
+ if push_to_hub:
216
+ repo_id = create_repo(repo_id, exist_ok=exist_ok, private=private, **token_kwargs).repo_id
217
+ self._upload_folder(
218
+ save_directory,
219
+ repo_id,
220
+ commit_message=commit_message,
221
+ create_pr=create_pr,
222
+ **token_kwargs,
223
+ )
224
+
225
+ @classmethod
226
+ def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_unused_kwargs=False, **kwargs):
227
+ r"""
228
+ Instantiate a Python class from a config dictionary.
229
+
230
+ Parameters:
231
+ config (`Dict[str, Any]`):
232
+ A config dictionary from which the Python class is instantiated. Make sure to only load configuration
233
+ files of compatible classes.
234
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
235
+ Whether kwargs that are not consumed by the Python class should be returned or not.
236
+ kwargs (remaining dictionary of keyword arguments, *optional*):
237
+ Can be used to update the configuration object (after it is loaded) and initiate the Python class.
238
+ `**kwargs` are passed directly to the underlying scheduler/model's `__init__` method and eventually
239
+ overwrite the same named arguments in `config`.
240
+
241
+ Returns:
242
+ [`ModelMixin`] or [`SchedulerMixin`]:
243
+ A model or scheduler object instantiated from a config dictionary.
244
+
245
+ Examples:
246
+
247
+ ```python
248
+ >>> from ppdiffusers import DDPMScheduler, DDIMScheduler, PNDMScheduler
249
+
250
+ >>> # Download scheduler from huggingface.co and cache.
251
+ >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cifar10-32")
252
+
253
+ >>> # Instantiate DDIM scheduler class with same config as DDPM
254
+ >>> scheduler = DDIMScheduler.from_config(scheduler.config)
255
+
256
+ >>> # Instantiate PNDM scheduler class with same config as DDPM
257
+ >>> scheduler = PNDMScheduler.from_config(scheduler.config)
258
+ ```
259
+ """
260
+ # <===== TO BE REMOVED WITH DEPRECATION
261
+ # TODO(Patrick) - make sure to remove the following lines when config=="model_path" is deprecated
262
+ if "pretrained_model_name_or_path" in kwargs:
263
+ config = kwargs.pop("pretrained_model_name_or_path")
264
+
265
+ if config is None:
266
+ raise ValueError("Please make sure to provide a config as the first positional argument.")
267
+ # ======>
268
+
269
+ if not isinstance(config, dict):
270
+ deprecation_message = "It is deprecated to pass a pretrained model name or path to `from_config`."
271
+ if "Scheduler" in cls.__name__:
272
+ deprecation_message += (
273
+ f"If you were trying to load a scheduler, please use {cls}.from_pretrained(...) instead."
274
+ " Otherwise, please make sure to pass a configuration dictionary instead. This functionality will"
275
+ " be removed in v1.0.0."
276
+ )
277
+ elif "Model" in cls.__name__:
278
+ deprecation_message += (
279
+ f"If you were trying to load a model, please use {cls}.load_config(...) followed by"
280
+ f" {cls}.from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary"
281
+ " instead. This functionality will be removed in v1.0.0."
282
+ )
283
+ deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
284
+ config, kwargs = cls.load_config(pretrained_model_name_or_path=config, return_unused_kwargs=True, **kwargs)
285
+
286
+ init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, **kwargs)
287
+
288
+ # Allow dtype to be specified on initialization
289
+ if "dtype" in unused_kwargs:
290
+ init_dict["dtype"] = unused_kwargs.pop("dtype")
291
+
292
+ # add possible deprecated kwargs
293
+ for deprecated_kwarg in cls._deprecated_kwargs:
294
+ if deprecated_kwarg in unused_kwargs:
295
+ init_dict[deprecated_kwarg] = unused_kwargs.pop(deprecated_kwarg)
296
+
297
+ # Return model and optionally state and/or unused_kwargs
298
+ model = cls(**init_dict)
299
+
300
+ # make sure to also save config parameters that might be used for compatible classes
301
+ model.register_to_config(**hidden_dict)
302
+
303
+ # add hidden kwargs of compatible classes to unused_kwargs
304
+ unused_kwargs = {**unused_kwargs, **hidden_dict}
305
+
306
+ if return_unused_kwargs:
307
+ return (model, unused_kwargs)
308
+ else:
309
+ return model
310
+
311
+ @classmethod
312
+ def get_config_dict(cls, *args, **kwargs):
313
+ deprecation_message = (
314
+ f" The function get_config_dict is deprecated. Please use {cls}.load_config instead. This function will be"
315
+ " removed in version v1.0.0"
316
+ )
317
+ deprecate("get_config_dict", "1.0.0", deprecation_message, standard_warn=False)
318
+ return cls.load_config(*args, **kwargs)
319
+
320
+ @classmethod
321
+ def load_config(
322
+ cls,
323
+ pretrained_model_name_or_path: Union[str, os.PathLike],
324
+ return_unused_kwargs=False,
325
+ return_commit_hash=False,
326
+ **kwargs,
327
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
328
+ r"""
329
+ Load a model or scheduler configuration.
330
+
331
+ Parameters:
332
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
333
+ Can be either:
334
+
335
+ - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
336
+ the Hub.
337
+ - A path to a *directory* (for example `./my_model_directory`) containing model weights saved with
338
+ [`~ConfigMixin.save_config`].
339
+
340
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
341
+ Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
342
+ is not used.
343
+ force_download (`bool`, *optional*, defaults to `False`):
344
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
345
+ cached versions if they exist.
346
+ resume_download (`bool`, *optional*, defaults to `False`):
347
+ Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
348
+ incompletely downloaded files are deleted.
349
+ proxies (`Dict[str, str]`, *optional*):
350
+ A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
351
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
352
+ output_loading_info(`bool`, *optional*, defaults to `False`):
353
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
354
+ local_files_only (`bool`, *optional*, defaults to `False`):
355
+ Whether to only load local model weights and configuration files or not. If set to `True`, the model
356
+ won't be downloaded from the Hub.
357
+ use_auth_token (`str` or *bool*, *optional*):
358
+ The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
359
+ `diffusers-cli login` (stored in `~/.huggingface`) is used.
360
+ revision (`str`, *optional*, defaults to `"main"`):
361
+ The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
362
+ allowed by Git.
363
+ subfolder (`str`, *optional*, defaults to `""`):
364
+ The subfolder location of a model file within a larger model repository on the Hub or locally.
365
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
366
+ Whether unused keyword arguments of the config are returned.
367
+ return_commit_hash (`bool`, *optional*, defaults to `False`):
368
+ Whether the `commit_hash` of the loaded configuration is returned.
369
+
370
+ Returns:
371
+ `dict`:
372
+ A dictionary of all the parameters stored in a JSON configuration file.
373
+
374
+ """
375
+ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
376
+ from_aistudio = kwargs.pop("from_aistudio", FROM_AISTUDIO)
377
+ cache_dir = kwargs.pop("cache_dir", None)
378
+ if cache_dir is None:
379
+ if from_aistudio:
380
+ cache_dir = PPDIFFUSERS_CACHE
381
+ elif from_hf_hub:
382
+ cache_dir = DIFFUSERS_CACHE
383
+ else:
384
+ cache_dir = PPDIFFUSERS_CACHE
385
+
386
+ force_download = kwargs.pop("force_download", False)
387
+ resume_download = kwargs.pop("resume_download", False)
388
+ proxies = kwargs.pop("proxies", None)
389
+ use_auth_token = kwargs.pop("use_auth_token", None)
390
+ local_files_only = kwargs.pop("local_files_only", False)
391
+ revision = kwargs.pop("revision", None)
392
+ _ = kwargs.pop("mirror", None)
393
+ subfolder = kwargs.pop("subfolder", "")
394
+ if subfolder is None:
395
+ subfolder = ""
396
+ user_agent = kwargs.pop("user_agent", {})
397
+
398
+ user_agent = {**user_agent, "file_type": "config"}
399
+ user_agent = http_user_agent(user_agent)
400
+
401
+ # new add return_config_file
402
+ return_config_file = kwargs.pop("return_config_file", False)
403
+
404
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
405
+
406
+ if cls.config_name is None:
407
+ raise ValueError(
408
+ "`self.config_name` is not defined. Note that one should not load a config from "
409
+ "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`"
410
+ )
411
+
412
+ if os.path.isfile(pretrained_model_name_or_path):
413
+ config_file = pretrained_model_name_or_path
414
+ elif os.path.isdir(pretrained_model_name_or_path):
415
+ if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)):
416
+ # Load from a PyTorch checkpoint
417
+ config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
418
+ elif subfolder is not None and os.path.isfile(
419
+ os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
420
+ ):
421
+ config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
422
+ else:
423
+ raise EnvironmentError(
424
+ f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
425
+ )
426
+ else:
427
+ config_file = bos_aistudio_hf_download(
428
+ pretrained_model_name_or_path,
429
+ cls.config_name,
430
+ cache_dir=cache_dir,
431
+ force_download=force_download,
432
+ proxies=proxies,
433
+ resume_download=resume_download,
434
+ local_files_only=local_files_only,
435
+ use_auth_token=use_auth_token,
436
+ user_agent=user_agent,
437
+ subfolder=subfolder,
438
+ revision=revision,
439
+ from_hf_hub=from_hf_hub,
440
+ from_aistudio=from_aistudio,
441
+ )
442
+
443
+ try:
444
+ # Load config dict
445
+ config_dict = cls._dict_from_json_file(config_file)
446
+
447
+ commit_hash = extract_commit_hash(config_file)
448
+ except (json.JSONDecodeError, UnicodeDecodeError):
449
+ raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
450
+
451
+ if not (return_unused_kwargs or return_commit_hash or return_config_file):
452
+ return config_dict
453
+
454
+ outputs = (config_dict,)
455
+
456
+ if return_unused_kwargs:
457
+ outputs += (kwargs,)
458
+
459
+ if return_commit_hash:
460
+ outputs += (commit_hash,)
461
+
462
+ if return_config_file:
463
+ outputs += (config_file,)
464
+
465
+ return outputs
466
+
467
+ @staticmethod
468
+ def _get_init_keys(cls):
469
+ return set(dict(inspect.signature(cls.__init__).parameters).keys())
470
+
471
+ @classmethod
472
+ def extract_init_dict(cls, config_dict, **kwargs):
473
+ # Skip keys that were not present in the original config, so default __init__ values were used
474
+ used_defaults = config_dict.get("_use_default_values", [])
475
+ config_dict = {k: v for k, v in config_dict.items() if k not in used_defaults and k != "_use_default_values"}
476
+
477
+ # 0. Copy origin config dict
478
+ original_dict = dict(config_dict.items())
479
+
480
+ # 1. Retrieve expected config attributes from __init__ signature
481
+ expected_keys = cls._get_init_keys(cls)
482
+ expected_keys.remove("self")
483
+ # remove general kwargs if present in dict
484
+ if "kwargs" in expected_keys:
485
+ expected_keys.remove("kwargs")
486
+
487
+ # 2. Remove attributes that cannot be expected from expected config attributes
488
+ # remove keys to be ignored
489
+ if len(cls.ignore_for_config) > 0:
490
+ expected_keys = expected_keys - set(cls.ignore_for_config)
491
+
492
+ # load ppdiffusers library to import compatible and original scheduler
493
+ ppdiffusers_library = importlib.import_module(__name__.split(".")[0])
494
+
495
+ if cls.has_compatibles:
496
+ compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)]
497
+ else:
498
+ compatible_classes = []
499
+
500
+ expected_keys_comp_cls = set()
501
+ for c in compatible_classes:
502
+ expected_keys_c = cls._get_init_keys(c)
503
+ expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)
504
+ expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)
505
+ config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}
506
+
507
+ # remove attributes from orig class that cannot be expected
508
+ orig_cls_name = config_dict.pop("_class_name", cls.__name__)
509
+ if (
510
+ isinstance(orig_cls_name, str)
511
+ and orig_cls_name != cls.__name__
512
+ and hasattr(ppdiffusers_library, orig_cls_name)
513
+ ):
514
+ orig_cls = getattr(ppdiffusers_library, orig_cls_name)
515
+ unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys
516
+ config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}
517
+ elif not isinstance(orig_cls_name, str) and not isinstance(orig_cls_name, (list, tuple)):
518
+ raise ValueError(
519
+ "Make sure that the `_class_name` is of type string or list of string (for custom pipelines)."
520
+ )
521
+
522
+ # remove private attributes
523
+ config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
524
+
525
+ # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments
526
+ init_dict = {}
527
+ for key in expected_keys:
528
+ # if config param is passed to kwarg and is present in config dict
529
+ # it should overwrite existing config dict key
530
+ if key in kwargs and key in config_dict:
531
+ config_dict[key] = kwargs.pop(key)
532
+
533
+ if key in kwargs:
534
+ # overwrite key
535
+ init_dict[key] = kwargs.pop(key)
536
+ elif key in config_dict:
537
+ # use value from config dict
538
+ init_dict[key] = config_dict.pop(key)
539
+
540
+ # 4. Give nice warning if unexpected values have been passed
541
+ if len(config_dict) > 0:
542
+ logger.warning(
543
+ f"The config attributes {config_dict} were passed to {cls.__name__}, "
544
+ "but are not expected and will be ignored. Please verify your "
545
+ f"{cls.config_name} configuration file."
546
+ )
547
+
548
+ # 5. Give nice info if config attributes are initialized to default because they have not been passed
549
+ passed_keys = set(init_dict.keys())
550
+ if len(expected_keys - passed_keys) > 0:
551
+ logger.info(
552
+ f"{expected_keys - passed_keys} was not found in config. Values will be initialized to default values."
553
+ )
554
+
555
+ # 6. Define unused keyword arguments
556
+ unused_kwargs = {**config_dict, **kwargs}
557
+
558
+ # 7. Define "hidden" config parameters that were saved for compatible classes
559
+ hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict}
560
+
561
+ return init_dict, unused_kwargs, hidden_config_dict
562
+
563
+ @classmethod
564
+ def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
565
+ with open(json_file, "r", encoding="utf-8") as reader:
566
+ text = reader.read()
567
+ data = json.loads(text)
568
+ if "_diffusers_version" in data and "_ppdiffusers_version" not in data:
569
+ data["_ppdiffusers_version"] = data.pop("_diffusers_version", __version__)
570
+ if "_diffusers_version" not in data and "_ppdiffusers_version" not in data:
571
+ data["_ppdiffusers_version"] = __version__
572
+
573
+ # remove Onnx and Flax prefix
574
+ _class_name = data.get("_class_name", None)
575
+ if _class_name is not None:
576
+ if _class_name.startswith("Flax"):
577
+ data["_class_name"] = _class_name[4:]
578
+ elif _class_name.startswith("Onnx"):
579
+ data["_class_name"] = "FastDeploy" + _class_name[4:]
580
+
581
+ return data
582
+
583
+ def __repr__(self):
584
+ return f"{self.__class__.__name__} {self.to_json_string()}"
585
+
586
+ @property
587
+ def config(self) -> Dict[str, Any]:
588
+ """
589
+ Returns the config of the class as a frozen dictionary
590
+
591
+ Returns:
592
+ `Dict[str, Any]`: Config of the class.
593
+ """
594
+ return self._internal_dict
595
+
596
+ def to_json_string(self, to_diffusers=False) -> str:
597
+ """
598
+ Serializes the configuration instance to a JSON string.
599
+
600
+ Returns:
601
+ `str`:
602
+ String containing all the attributes that make up the configuration instance in JSON format.
603
+ """
604
+ config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {}
605
+ config_dict["_class_name"] = self.__class__.__name__
606
+ # json
607
+ if to_diffusers:
608
+ config_dict["_diffusers_version"] = __version__
609
+ else:
610
+ config_dict["_ppdiffusers_version"] = __version__
611
+
612
+ def to_json_saveable(value):
613
+ if isinstance(value, np.ndarray):
614
+ value = value.tolist()
615
+ elif isinstance(value, PosixPath):
616
+ value = str(value)
617
+ elif _omegaconf_available and isinstance(value, ListConfig):
618
+ value = list(value)
619
+ return value
620
+
621
+ config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()}
622
+ if to_diffusers:
623
+ config_dict.pop("_ppdiffusers_version", None)
624
+ else:
625
+ config_dict.pop("_diffusers_version", None)
626
+ # Don't save "_ignore_files" or "_use_default_values"
627
+ config_dict.pop("_ignore_files", None)
628
+ config_dict.pop("_use_default_values", None)
629
+
630
+ json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
631
+ if to_diffusers:
632
+ json_string = json_string.replace('"ppdiffusers"', '"diffusers"').replace(
633
+ '"ppdiffusers.transformers"', '"transformers"'
634
+ )
635
+ return json_string
636
+
637
+ def to_json_file(self, json_file_path: Union[str, os.PathLike], to_diffusers=False):
638
+ """
639
+ Save the configuration instance's parameters to a JSON file.
640
+ Args:
641
+ json_file_path (`str` or `os.PathLike`):
642
+ Path to the JSON file to save a configuration instance's parameters.
643
+ """
644
+ with open(json_file_path, "w", encoding="utf-8") as writer:
645
+ writer.write(self.to_json_string(to_diffusers=to_diffusers))
646
+
647
+
648
+ def register_to_config(init):
649
+ r"""
650
+ Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are
651
+ automatically sent to `self.register_to_config`. To ignore a specific argument accepted by the init but that
652
+ shouldn't be registered in the config, use the `ignore_for_config` class variable
653
+
654
+ Warning: Once decorated, all private arguments (beginning with an underscore) are trashed and not sent to the init!
655
+ """
656
+
657
+ @functools.wraps(init)
658
+ def inner_init(self, *args, **kwargs):
659
+ # Ignore private kwargs in the init.
660
+ init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
661
+ config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")}
662
+ if not isinstance(self, ConfigMixin):
663
+ raise RuntimeError(
664
+ f"`@register_to_config` was applied to {self.__class__.__name__} init method, but this class does "
665
+ "not inherit from `ConfigMixin`."
666
+ )
667
+
668
+ ignore = getattr(self, "ignore_for_config", [])
669
+ # Get positional arguments aligned with kwargs
670
+ new_kwargs = {}
671
+ signature = inspect.signature(init)
672
+ parameters = {
673
+ name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore
674
+ }
675
+ for arg, name in zip(args, parameters.keys()):
676
+ new_kwargs[name] = arg
677
+
678
+ # Then add all kwargs
679
+ new_kwargs.update(
680
+ {
681
+ k: init_kwargs.get(k, default)
682
+ for k, default in parameters.items()
683
+ if k not in ignore and k not in new_kwargs
684
+ }
685
+ )
686
+
687
+ # Take note of the parameters that were not present in the loaded config
688
+ if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
689
+ new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
690
+
691
+ new_kwargs = {**config_init_kwargs, **new_kwargs}
692
+ getattr(self, "register_to_config")(**new_kwargs)
693
+ init(self, *args, **init_kwargs)
694
+
695
+ return inner_init
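A minimal sketch of a class opting into this config machinery; `MyScheduler`, its defaults, and the output directory are hypothetical:

```python
from ppdiffusers.configuration_utils import ConfigMixin, register_to_config


class MyScheduler(ConfigMixin):
    config_name = "scheduler_config.json"

    @register_to_config
    def __init__(self, num_train_timesteps: int = 1000, beta_start: float = 0.0001):
        super().__init__()


scheduler = MyScheduler(num_train_timesteps=500)
print(scheduler.config.num_train_timesteps)  # 500, stored in a FrozenDict
scheduler.save_config("./my_scheduler")      # writes ./my_scheduler/scheduler_config.json
reloaded = MyScheduler.from_config(MyScheduler.load_config("./my_scheduler"))
```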
PaddleMIX/ppdiffusers/ppdiffusers/image_processor.py ADDED
@@ -0,0 +1,671 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import warnings
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import paddle
20
+ import PIL.Image
21
+ from PIL import Image
22
+
23
+ from .configuration_utils import ConfigMixin, register_to_config
24
+ from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate
25
+
26
+ PipelineImageInput = Union[
27
+ PIL.Image.Image,
28
+ np.ndarray,
29
+ paddle.Tensor,
30
+ List[PIL.Image.Image],
31
+ List[np.ndarray],
32
+ List[paddle.Tensor],
33
+ ]
34
+
35
+ PipelineDepthInput = Union[
36
+ PIL.Image.Image,
37
+ np.ndarray,
38
+ paddle.Tensor,
39
+ List[PIL.Image.Image],
40
+ List[np.ndarray],
41
+ List[paddle.Tensor],
42
+ ]
43
+
44
+
45
+ class VaeImageProcessor(ConfigMixin):
46
+ """
47
+ Image processor for VAE.
48
+
49
+ Args:
50
+ do_resize (`bool`, *optional*, defaults to `True`):
51
+ Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
52
+ `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
53
+ vae_scale_factor (`int`, *optional*, defaults to `8`):
54
+ VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
55
+ resample (`str`, *optional*, defaults to `lanczos`):
56
+ Resampling filter to use when resizing the image.
57
+ do_normalize (`bool`, *optional*, defaults to `True`):
58
+ Whether to normalize the image to [-1,1].
59
+ do_binarize (`bool`, *optional*, defaults to `False`):
60
+ Whether to binarize the image to 0/1.
61
+ do_convert_rgb (`bool`, *optional*, defaults to `False`):
62
+ Whether to convert the images to RGB format.
63
+ do_convert_grayscale (`bool`, *optional*, defaults to `False`):
64
+ Whether to convert the images to grayscale format.
65
+ """
66
+
67
+ config_name = CONFIG_NAME
68
+
69
+ @register_to_config
70
+ def __init__(
71
+ self,
72
+ do_resize: bool = True,
73
+ vae_scale_factor: int = 8,
74
+ vae_latent_channels: int = 4,
75
+ resample: str = "lanczos",
76
+ do_normalize: bool = True,
77
+ do_binarize: bool = False,
78
+ do_convert_rgb: bool = False,
79
+ do_convert_grayscale: bool = False,
80
+ ):
81
+ super().__init__()
82
+ if do_convert_rgb and do_convert_grayscale:
83
+ raise ValueError(
84
+ "`do_convert_rgb` and `do_convert_grayscale` cannot both be set to `True`."
85
+ " If you intended to convert the image into RGB format, please set `do_convert_grayscale = False`."
86
+ " If you intended to convert the image into grayscale format, please set `do_convert_rgb = False`."
87
+ )
88
+ self.config.do_convert_rgb = False
89
+
90
+ @staticmethod
91
+ def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
92
+ """
93
+ Convert a numpy image or a batch of images to a PIL image.
94
+ """
95
+ if images.ndim == 3:
96
+ images = images[None, ...]
97
+ images = (images * 255).round().astype("uint8")
98
+ if images.shape[-1] == 1:
99
+ # special case for grayscale (single channel) images
100
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
101
+ else:
102
+ pil_images = [Image.fromarray(image) for image in images]
103
+
104
+ return pil_images
105
+
106
+ @staticmethod
107
+ def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
108
+ """
109
+ Convert a PIL image or a list of PIL images to NumPy arrays.
110
+ """
111
+ if not isinstance(images, list):
112
+ images = [images]
113
+ images = [np.array(image).astype(np.float32) / 255.0 for image in images]
114
+ images = np.stack(images, axis=0)
115
+
116
+ return images
117
+
118
+ @staticmethod
119
+ def numpy_to_pd(images: np.ndarray) -> paddle.Tensor:
120
+ """
121
+ Convert a NumPy image to a Paddle tensor.
122
+ """
123
+ if images.ndim == 3:
124
+ images = images[..., None]
125
+
126
+ images = paddle.to_tensor(images.transpose(0, 3, 1, 2))
127
+ return images
128
+
129
+ @staticmethod
130
+ def pd_to_numpy(images: paddle.Tensor) -> np.ndarray:
131
+ """
132
+ Convert a Paddle tensor to a NumPy image.
133
+ """
134
+ images = images.cast("float32").cpu().transpose([0, 2, 3, 1]).numpy()
135
+ return images
136
+
137
+ @staticmethod
138
+ def normalize(images: Union[np.ndarray, paddle.Tensor]) -> Union[np.ndarray, paddle.Tensor]:
139
+ """
140
+ Normalize an image array to [-1,1].
141
+ """
142
+ return 2.0 * images - 1.0
143
+
144
+ @staticmethod
145
+ def denormalize(images: Union[np.ndarray, paddle.Tensor]) -> Union[np.ndarray, paddle.Tensor]:
146
+ """
147
+ Denormalize an image array to [0,1].
148
+ """
149
+ return (images / 2 + 0.5).clip(0, 1)
150
+
151
+ @staticmethod
152
+ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
153
+ """
154
+ Converts a PIL image to RGB format.
155
+ """
156
+ image = image.convert("RGB")
157
+
158
+ return image
159
+
160
+ @staticmethod
161
+ def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image:
162
+ """
163
+ Converts a PIL image to grayscale format.
164
+ """
165
+ image = image.convert("L")
166
+
167
+ return image
168
+
169
+ def get_default_height_width(
170
+ self,
171
+ image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor],
172
+ height: Optional[int] = None,
173
+ width: Optional[int] = None,
174
+ ) -> Tuple[int, int]:
175
+ """
176
+ This function returns the height and width that are downscaled to the next integer multiple of
177
+ `vae_scale_factor`.
178
+
179
+ Args:
180
+ image(`PIL.Image.Image`, `np.ndarray` or `paddle.Tensor`):
181
+ The image input, which can be a PIL image, NumPy array or Paddle tensor. If it is a NumPy array, it should
182
+ have shape `[batch, height, width]` or `[batch, height, width, channel]`; if it is a Paddle tensor, it should
183
+ have shape `[batch, channel, height, width]`.
184
+ height (`int`, *optional*, defaults to `None`):
185
+ The height of the preprocessed image. If `None`, the height of the `image` input is used.
186
+ width (`int`, *optional*, defaults to `None`):
187
+ The width of the preprocessed image. If `None`, the width of the `image` input is used.
188
+ """
189
+
190
+ if height is None:
191
+ if isinstance(image, PIL.Image.Image):
192
+ height = image.height
193
+ elif isinstance(image, paddle.Tensor):
194
+ height = image.shape[2]
195
+ else:
196
+ height = image.shape[1]
197
+
198
+ if width is None:
199
+ if isinstance(image, PIL.Image.Image):
200
+ width = image.width
201
+ elif isinstance(image, paddle.Tensor):
202
+ width = image.shape[3]
203
+ else:
204
+ width = image.shape[2]
205
+
206
+ width, height = (
207
+ x - x % self.config.vae_scale_factor for x in (width, height)
208
+ ) # resize to integer multiple of vae_scale_factor
209
+
210
+ return height, width
211
+
212
+ def resize(
213
+ self,
214
+ image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor],
215
+ height: Optional[int] = None,
216
+ width: Optional[int] = None,
217
+ ) -> Union[PIL.Image.Image, np.ndarray, paddle.Tensor]:
218
+ """
219
+ Resize image.
220
+
221
+ Args:
222
+ image (`PIL.Image.Image`, `np.ndarray` or `paddle.Tensor`):
223
+ The image input, can be a PIL image, numpy array or paddle tensor.
224
+ height (`int`, *optional*, defaults to `None`):
225
+ The height to resize to.
226
+ width (`int`, *optional*, defaults to `None`):
227
+ The width to resize to.
228
+
229
+ Returns:
230
+ `PIL.Image.Image`, `np.ndarray` or `paddle.Tensor`:
231
+ The resized image.
232
+ """
233
+ if isinstance(image, PIL.Image.Image):
234
+ image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample])
235
+ elif isinstance(image, paddle.Tensor):
236
+ image = paddle.nn.functional.interpolate(
237
+ image,
238
+ size=(height, width),
239
+ )
240
+ elif isinstance(image, np.ndarray):
241
+ image = self.numpy_to_pd(image)
242
+ image = paddle.nn.functional.interpolate(
243
+ image,
244
+ size=(height, width),
245
+ )
246
+ image = self.pd_to_numpy(image)
247
+ return image
248
+
249
+ def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image:
250
+ """
251
+ Create a mask.
252
+
253
+ Args:
254
+ image (`PIL.Image.Image`):
255
+ The image input, should be a PIL image.
256
+
257
+ Returns:
258
+ `PIL.Image.Image`:
259
+ The binarized image. Values less than 0.5 are set to 0, values greater than 0.5 are set to 1.
260
+ """
261
+ image[image < 0.5] = 0
262
+ image[image >= 0.5] = 1
263
+ return image
264
+
265
+ def preprocess(
266
+ self,
267
+ image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray],
268
+ height: Optional[int] = None,
269
+ width: Optional[int] = None,
270
+ ) -> paddle.Tensor:
271
+ """
272
+ Preprocess the image input. Accepted formats are PIL images, NumPy arrays or Paddle tensors.
273
+ """
274
+ supported_formats = (PIL.Image.Image, np.ndarray, paddle.Tensor)
275
+
276
+ # Expand the missing dimension for 3-dimensional paddle tensor or numpy array that represents grayscale image
277
+ if self.config.do_convert_grayscale and isinstance(image, (paddle.Tensor, np.ndarray)) and image.ndim == 3:
278
+ if isinstance(image, paddle.Tensor):
279
+ # if image is a paddle tensor could have 2 possible shapes:
280
+ # 1. batch x height x width: we should insert the channel dimension at position 1
281
+ # 2. channel x height x width: we should insert the batch dimension at position 0;
282
+ # however, since both the channel and batch dimensions have size 1, inserting at position 1 is equivalent
283
+ # for simplicity, we insert a dimension of size 1 at position 1 for both cases
284
+ image = image.unsqueeze(1)
285
+ else:
286
+ # if it is a numpy array, it could have 2 possible shapes:
287
+ # 1. batch x height x width: insert channel dimension on last position
288
+ # 2. height x width x channel: insert batch dimension on first position
289
+ if image.shape[-1] == 1:
290
+ image = np.expand_dims(image, axis=0)
291
+ else:
292
+ image = np.expand_dims(image, axis=-1)
293
+
294
+ if isinstance(image, supported_formats):
295
+ image = [image]
296
+ elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)):
297
+ raise ValueError(
298
+ f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support {', '.join(str(x) for x in supported_formats)}"
299
+ )
300
+
301
+ if isinstance(image[0], PIL.Image.Image):
302
+ if self.config.do_convert_rgb:
303
+ image = [self.convert_to_rgb(i) for i in image]
304
+ elif self.config.do_convert_grayscale:
305
+ image = [self.convert_to_grayscale(i) for i in image]
306
+ if self.config.do_resize:
307
+ height, width = self.get_default_height_width(image[0], height, width)
308
+ image = [self.resize(i, height, width) for i in image]
309
+ image = self.pil_to_numpy(image) # to np
310
+ image = self.numpy_to_pd(image) # to paddle tensor
311
+
312
+ elif isinstance(image[0], np.ndarray):
313
+ image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
314
+
315
+ image = self.numpy_to_pd(image)
316
+
317
+ height, width = self.get_default_height_width(image, height, width)
318
+ if self.config.do_resize:
319
+ image = self.resize(image, height, width)
320
+
321
+ elif isinstance(image[0], paddle.Tensor):
322
+ image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0)
323
+
324
+ if self.config.do_convert_grayscale and image.ndim == 3:
325
+ image = image.unsqueeze(1)
326
+
327
+ channel = image.shape[1]
328
+ # don't need any preprocess if the image is latents
329
+ if channel == 4:
330
+ return image
331
+
332
+ height, width = self.get_default_height_width(image, height, width)
333
+ if self.config.do_resize:
334
+ image = self.resize(image, height, width)
335
+
336
+ # expected range [0,1], normalize to [-1,1]
337
+ do_normalize = self.config.do_normalize
338
+ if do_normalize and image.min() < 0:
339
+ warnings.warn(
340
+ "Passing `image` as paddle tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
341
+ f"when passing as paddle tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
342
+ FutureWarning,
343
+ )
344
+ do_normalize = False
345
+
346
+ if do_normalize:
347
+ image = self.normalize(image)
348
+
349
+ if self.config.do_binarize:
350
+ image = self.binarize(image)
351
+
352
+ # laixinlu: cast bool tensors to float32 here, since Paddle does not automatically support float32 * bool
353
+ if isinstance(image, paddle.Tensor) and image.dtype == paddle.bool:
354
+ image = image.cast(dtype="float32")
355
+
356
+ return image
357
+
358
+ def postprocess(
359
+ self,
360
+ image: paddle.Tensor,
361
+ output_type: str = "pil",
362
+ do_denormalize: Optional[List[bool]] = None,
363
+ ) -> Union[PIL.Image.Image, np.ndarray, paddle.Tensor]:
364
+ """
365
+ Postprocess the image output from tensor to `output_type`.
366
+
367
+ Args:
368
+ image (`paddle.Tensor`):
369
+ The image input, should be a paddle tensor with shape `B x C x H x W`.
370
+ output_type (`str`, *optional*, defaults to `pil`):
371
+ The output type of the image, can be one of `pil`, `np`, `pd`, `latent`.
372
+ do_denormalize (`List[bool]`, *optional*, defaults to `None`):
373
+ Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the
374
+ `VaeImageProcessor` config.
375
+
376
+ Returns:
377
+ `PIL.Image.Image`, `np.ndarray` or `paddle.Tensor`:
378
+ The postprocessed image.
379
+ """
380
+ if not isinstance(image, paddle.Tensor):
381
+ raise ValueError(
382
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support paddle tensor"
383
+ )
384
+ if output_type not in ["latent", "pd", "np", "pil"]:
385
+ deprecation_message = (
386
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
387
+ "`pil`, `np`, `pd`, `latent`"
388
+ )
389
+ deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
390
+ output_type = "np"
391
+
392
+ if output_type == "latent":
393
+ return image
394
+
395
+ if do_denormalize is None:
396
+ do_denormalize = [self.config.do_normalize] * image.shape[0]
397
+
398
+ image = paddle.stack(
399
+ [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
400
+ )
401
+
402
+ if output_type == "pd":
403
+ return image
404
+
405
+ image = self.pd_to_numpy(image)
406
+
407
+ if output_type == "np":
408
+ return image
409
+
410
+ if output_type == "pil":
411
+ return self.numpy_to_pil(image)
412
+
413
+
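A minimal sketch of the preprocess/postprocess round trip that pipelines perform with `VaeImageProcessor`; the input size is arbitrary and a working Paddle installation is assumed:

```python
import numpy as np
import PIL.Image

from ppdiffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor(vae_scale_factor=8)

# Dimensions are snapped down to the nearest multiple of vae_scale_factor (517x773 -> 512x768).
pil_image = PIL.Image.fromarray(np.zeros((517, 773, 3), dtype=np.uint8))
tensor = processor.preprocess(pil_image)  # paddle.Tensor, NCHW layout, values in [-1, 1]
print(tensor.shape)                       # [1, 3, 512, 768]

# ... run the tensor through the VAE / denoising model here ...

images = processor.postprocess(tensor, output_type="pil")
print(images[0].size)                     # (768, 512)
```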
414
+ class VaeImageProcessorLDM3D(VaeImageProcessor):
415
+ """
416
+ Image processor for VAE LDM3D.
417
+
418
+ Args:
419
+ do_resize (`bool`, *optional*, defaults to `True`):
420
+ Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
421
+ vae_scale_factor (`int`, *optional*, defaults to `8`):
422
+ VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
423
+ resample (`str`, *optional*, defaults to `lanczos`):
424
+ Resampling filter to use when resizing the image.
425
+ do_normalize (`bool`, *optional*, defaults to `True`):
426
+ Whether to normalize the image to [-1,1].
427
+ """
428
+
429
+ config_name = CONFIG_NAME
430
+
431
+ @register_to_config
432
+ def __init__(
433
+ self,
434
+ do_resize: bool = True,
435
+ vae_scale_factor: int = 8,
436
+ resample: str = "lanczos",
437
+ do_normalize: bool = True,
438
+ ):
439
+ super().__init__()
440
+
441
+ @staticmethod
442
+ def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
443
+ """
444
+ Convert a NumPy image or a batch of images to a PIL image.
445
+ """
446
+ if images.ndim == 3:
447
+ images = images[None, ...]
448
+ images = (images * 255).round().astype("uint8")
449
+ if images.shape[-1] == 1:
450
+ # special case for grayscale (single channel) images
451
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
452
+ else:
453
+ pil_images = [Image.fromarray(image[:, :, :3]) for image in images]
454
+
455
+ return pil_images
456
+
457
+ @staticmethod
458
+ def depth_pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
459
+ """
460
+ Convert a PIL image or a list of PIL images to NumPy arrays.
461
+ """
462
+ if not isinstance(images, list):
463
+ images = [images]
464
+
465
+ images = [np.array(image).astype(np.float32) / (2**16 - 1) for image in images]
466
+ images = np.stack(images, axis=0)
467
+ return images
468
+
469
+ @staticmethod
470
+ def rgblike_to_depthmap(image: Union[np.ndarray, paddle.Tensor]) -> Union[np.ndarray, paddle.Tensor]:
471
+ """
472
+ Args:
473
+ image: RGB-like depth image
474
+
475
+ Returns: depth map
476
+
477
+ """
478
+ return image[:, :, 1] * 2**8 + image[:, :, 2]
479
+
480
+ def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]:
481
+ """
482
+ Convert a NumPy depth image or a batch of images to a PIL image.
483
+ """
484
+ if images.ndim == 3:
485
+ images = images[None, ...]
486
+ images_depth = images[:, :, :, 3:]
487
+ if images.shape[-1] == 6:
488
+ images_depth = (images_depth * 255).round().astype("uint8")
489
+ pil_images = [
490
+ Image.fromarray(self.rgblike_to_depthmap(image_depth), mode="I;16") for image_depth in images_depth
491
+ ]
492
+ elif images.shape[-1] == 4:
493
+ images_depth = (images_depth * 65535.0).astype(np.uint16)
494
+ pil_images = [Image.fromarray(image_depth, mode="I;16") for image_depth in images_depth]
495
+ else:
496
+ raise Exception("Not supported")
497
+
498
+ return pil_images
499
+
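The two helpers above pack a 16-bit depth value across the G and B channels of an RGB-like image, and `rgblike_to_depthmap` recovers it as `G * 256 + B`. A small self-contained NumPy check of that round trip (the depth value is arbitrary):

import numpy as np

depth_value = 1234                            # any 16-bit depth sample
g, b = depth_value // 256, depth_value % 256  # high byte -> G channel, low byte -> B channel

rgb_like = np.zeros((1, 1, 3), dtype=np.uint16)
rgb_like[0, 0, 1] = g
rgb_like[0, 0, 2] = b

recovered = rgb_like[:, :, 1] * 2**8 + rgb_like[:, :, 2]
assert recovered[0, 0] == depth_value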
500
+ def postprocess(
501
+ self,
502
+ image: paddle.Tensor,
503
+ output_type: str = "pil",
504
+ do_denormalize: Optional[List[bool]] = None,
505
+ ) -> Union[PIL.Image.Image, np.ndarray, paddle.Tensor]:
506
+ """
507
+ Postprocess the image output from tensor to `output_type`.
508
+
509
+ Args:
510
+ image (`paddle.Tensor`):
511
+ The image input, should be a paddle tensor with shape `B x C x H x W`.
512
+ output_type (`str`, *optional*, defaults to `pil`):
513
+ The output type of the image, can be one of `pil`, `np`, `pd`, `latent`.
514
+ do_denormalize (`List[bool]`, *optional*, defaults to `None`):
515
+ Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the
516
+ `VaeImageProcessor` config.
517
+
518
+ Returns:
519
+ `PIL.Image.Image`, `np.ndarray` or `paddle.Tensor`:
520
+ The postprocessed image.
521
+ """
522
+ if not isinstance(image, paddle.Tensor):
523
+ raise ValueError(
524
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support paddle tensor"
525
+ )
526
+ if output_type not in ["latent", "pd", "np", "pil"]:
527
+ deprecation_message = (
528
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
529
+ "`pil`, `np`, `pd`, `latent`"
530
+ )
531
+ deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
532
+ output_type = "np"
533
+
534
+ if do_denormalize is None:
535
+ do_denormalize = [self.config.do_normalize] * image.shape[0]
536
+
537
+ image = paddle.stack(
538
+ [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
539
+ )
540
+
541
+ image = self.pd_to_numpy(image)
542
+
543
+ if output_type == "np":
544
+ if image.shape[-1] == 6:
545
+ image_depth = np.stack([self.rgblike_to_depthmap(im[:, :, 3:]) for im in image], axis=0)
546
+ else:
547
+ image_depth = image[:, :, :, 3:]
548
+ return image[:, :, :, :3], image_depth
549
+
550
+ if output_type == "pil":
551
+ return self.numpy_to_pil(image), self.numpy_to_depth(image)
552
+ else:
553
+ raise Exception(f"This type {output_type} is not supported")
554
+
555
+ def preprocess(
556
+ self,
557
+ rgb: Union[paddle.Tensor, PIL.Image.Image, np.ndarray],
558
+ depth: Union[paddle.Tensor, PIL.Image.Image, np.ndarray],
559
+ height: Optional[int] = None,
560
+ width: Optional[int] = None,
561
+ target_res: Optional[int] = None,
562
+ ) -> paddle.Tensor:
563
+ """
564
+ Preprocess the image input. Accepted formats are PIL images, NumPy arrays or Paddle tensors.
565
+ """
566
+ supported_formats = (PIL.Image.Image, np.ndarray, paddle.Tensor)
567
+
568
+ # Expand the missing dimension for 3-dimensional paddle tensor or numpy array that represents grayscale image
569
+ if self.config.do_convert_grayscale and isinstance(rgb, (paddle.Tensor, np.ndarray)) and rgb.ndim == 3:
570
+ raise Exception("This is not yet supported")
571
+
572
+ if isinstance(rgb, supported_formats):
573
+ rgb = [rgb]
574
+ depth = [depth]
575
+ elif not (isinstance(rgb, list) and all(isinstance(i, supported_formats) for i in rgb)):
576
+ raise ValueError(
577
+ f"Input is in incorrect format: {[type(i) for i in rgb]}. Currently, we only support {', '.join(supported_formats)}"
578
+ )
579
+
580
+ if isinstance(rgb[0], PIL.Image.Image):
581
+ if self.config.do_convert_rgb:
582
+ raise Exception("This is not yet supported")
583
+ # rgb = [self.convert_to_rgb(i) for i in rgb]
584
+ # depth = [self.convert_to_depth(i) for i in depth] #TODO define convert_to_depth
585
+ if self.config.do_resize or target_res:
586
+ height, width = self.get_default_height_width(rgb[0], height, width) if not target_res else target_res
587
+ rgb = [self.resize(i, height, width) for i in rgb]
588
+ depth = [self.resize(i, height, width) for i in depth]
589
+ rgb = self.pil_to_numpy(rgb) # to np
590
+ rgb = self.numpy_to_pd(rgb) # to paddle tensor
591
+
592
+ depth = self.depth_pil_to_numpy(depth) # to np
593
+ depth = self.numpy_to_pd(depth) # to paddle tensor
594
+
595
+ elif isinstance(rgb[0], np.ndarray):
596
+ rgb = np.concatenate(rgb, axis=0) if rgb[0].ndim == 4 else np.stack(rgb, axis=0)
597
+ rgb = self.numpy_to_pd(rgb)
598
+ height, width = self.get_default_height_width(rgb, height, width)
599
+ if self.config.do_resize:
600
+ rgb = self.resize(rgb, height, width)
601
+
602
+ depth = np.concatenate(depth, axis=0) if rgb[0].ndim == 4 else np.stack(depth, axis=0)
603
+ depth = self.numpy_to_pd(depth)
604
+ height, width = self.get_default_height_width(depth, height, width)
605
+ if self.config.do_resize:
606
+ depth = self.resize(depth, height, width)
607
+
608
+ elif isinstance(rgb[0], paddle.Tensor):
609
+ raise Exception("This is not yet supported")
610
+ # rgb = paddle.concat(rgb, axis=0) if rgb[0].ndim == 4 else paddle.stack(rgb, axis=0)
611
+
612
+ # if self.config.do_convert_grayscale and rgb.ndim == 3:
613
+ # rgb = rgb.unsqueeze(1)
614
+
615
+ # channel = rgb.shape[1]
616
+
617
+ # height, width = self.get_default_height_width(rgb, height, width)
618
+ # if self.config.do_resize:
619
+ # rgb = self.resize(rgb, height, width)
620
+
621
+ # depth = paddle.cat(depth, axis=0) if depth[0].ndim == 4 else paddle.stack(depth, axis=0)
622
+
623
+ # if self.config.do_convert_grayscale and depth.ndim == 3:
624
+ # depth = depth.unsqueeze(1)
625
+
626
+ # channel = depth.shape[1]
627
+ # # don't need any preprocess if the image is latents
628
+ # if depth == 4:
629
+ # return rgb, depth
630
+
631
+ # height, width = self.get_default_height_width(depth, height, width)
632
+ # if self.config.do_resize:
633
+ # depth = self.resize(depth, height, width)
634
+ # expected range [0,1], normalize to [-1,1]
635
+ do_normalize = self.config.do_normalize
636
+ if rgb.min() < 0 and do_normalize:
637
+ warnings.warn(
638
+ "Passing `image` as paddle tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
639
+ f"when passing as paddle tensor or numpy Array. You passed `image` with value range [{rgb.min()},{rgb.max()}]",
640
+ FutureWarning,
641
+ )
642
+ do_normalize = False
643
+
644
+ if do_normalize:
645
+ rgb = self.normalize(rgb)
646
+ depth = self.normalize(depth)
647
+
648
+ if self.config.do_binarize:
649
+ rgb = self.binarize(rgb)
650
+ depth = self.binarize(depth)
651
+
652
+ return rgb, depth
653
+
654
+
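As in the base processor, `do_normalize` maps inputs from [0, 1] to [-1, 1] before they reach the VAE, and the warning above fires (and disables normalization) when the data already looks like it lives in [-1, 1]. The affine map and its inverse, shown as plain NumPy for clarity; the exact `normalize`/`denormalize` implementations live in the base class outside this hunk, so this is the conventional form rather than a copy of that code:

import numpy as np

x = np.array([0.0, 0.25, 0.5, 1.0], dtype=np.float32)
normalized = 2.0 * x - 1.0         # [0, 1] -> [-1, 1], what do_normalize applies
recovered = normalized / 2 + 0.5   # the inverse used when denormalizing for output
assert np.allclose(recovered, x)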
655
+ def is_valid_image(image):
656
+ return isinstance(image, PIL.Image.Image) or isinstance(image, (np.ndarray, paddle.Tensor)) and image.ndim in (2, 3)
657
+
658
+
659
+ def is_valid_image_imagelist(images):
660
+ # check if the image input is one of the supported formats for image and image list:
661
+ # it can be any one of the following three:
662
+ # (1) a 4-D paddle tensor or numpy array,
663
+ # (2) a valid image: a PIL.Image.Image, a 2-D np.ndarray or paddle.Tensor (grayscale image), or a 3-D np.ndarray or paddle.Tensor
664
+ # (3) a list of valid images
665
+ if isinstance(images, (np.ndarray, paddle.Tensor)) and images.ndim == 4:
666
+ return True
667
+ elif is_valid_image(images):
668
+ return True
669
+ elif isinstance(images, list):
670
+ return all(is_valid_image(image) for image in images)
671
+ return False
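A quick illustration of what these two validators accept; the import path follows this module, and the shapes are arbitrary:

import numpy as np
import paddle
import PIL.Image

from ppdiffusers.image_processor import is_valid_image, is_valid_image_imagelist

print(is_valid_image(PIL.Image.new("RGB", (64, 64))))          # True
print(is_valid_image(np.zeros((64, 64), dtype=np.float32)))    # True: 2-D grayscale array
print(is_valid_image_imagelist(paddle.zeros([2, 3, 64, 64])))  # True: 4-D batched tensor
print(is_valid_image_imagelist([np.zeros((3, 64, 64))] * 2))   # True: list of 3-D images
print(is_valid_image_imagelist("not an image"))                # False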
PaddleMIX/ppdiffusers/ppdiffusers/initializer.py ADDED
@@ -0,0 +1,20 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
16
+
17
+ # NOTE: This file is deprecated and will be removed in a future version.
18
+ # It only exists so that, temporarily, `from ppdiffusers.initializer import *` keeps working
19
+ # flake8: noqa
20
+ from .utils.initializer_utils import * # noqa: F401
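Given the note above, downstream code should move to the canonical location; both imports resolve to the same symbols for as long as the shim exists:

# Forward-compatible import (preferred):
from ppdiffusers.utils.initializer_utils import *  # noqa: F401,F403

# Legacy import kept alive by this shim module (stops working once it is removed):
from ppdiffusers.initializer import *  # noqa: F401,F403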
PaddleMIX/ppdiffusers/ppdiffusers/models/attention_processor.py ADDED
The diff for this file is too large to render. See raw diff
 
PaddleMIX/ppdiffusers/ppdiffusers/models/autoencoder_kl_cogvideox.py ADDED
@@ -0,0 +1,1190 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Optional, Tuple, Union
16
+
17
+ import numpy as np
18
+ import paddle
19
+
20
+ from ..configuration_utils import ConfigMixin, register_to_config
21
+ from ..utils import logging
22
+ from ..utils.accelerate_utils import apply_forward_hook
23
+ from .activations import get_activation
24
+ from .downsampling import CogVideoXDownsample3D
25
+ from .modeling_outputs import AutoencoderKLOutput
26
+ from .modeling_utils import ModelMixin
27
+ from .upsampling import CogVideoXUpsample3D
28
+ from .vae import DecoderOutput, DiagonalGaussianDistribution
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ class CogVideoXSafeConv3d(paddle.nn.Conv3D):
34
+ """
35
+ A 3D convolution layer that splits the input tensor into smaller parts to avoid OOM in CogVideoX Model.
36
+ """
37
+
38
+ def forward(self, input: paddle.Tensor) -> paddle.Tensor:
39
+ memory_count = paddle.prod(x=paddle.to_tensor(data=tuple(input.shape))).item() * 2 / 1024**3
40
+ if memory_count > 2:
41
+ kernel_size = self.kernel_size[0]
42
+ part_num = int(memory_count / 2) + 1
43
+ input_chunks = paddle.chunk(x=input, chunks=part_num, axis=2)
44
+ if kernel_size > 1:
45
+ input_chunks = [input_chunks[0]] + [
46
+ paddle.concat(x=(input_chunks[i - 1][:, :, -kernel_size + 1 :], input_chunks[i]), axis=2)
47
+ for i in range(1, len(input_chunks))
48
+ ]
49
+ output_chunks = []
50
+ for input_chunk in input_chunks:
51
+ output_chunks.append(super().forward(input_chunk))
52
+ output = paddle.concat(x=output_chunks, axis=2)
53
+ return output
54
+ else:
55
+ return super().forward(input)
56
+
57
+
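`forward` above estimates the activation's footprint assuming 2 bytes per element (fp16) and, past roughly 2 GiB, splits the input into `int(memory_count / 2) + 1` chunks along the temporal axis, prepending the last `kernel_size - 1` frames of the previous chunk so the per-chunk convolutions concatenate seamlessly. A worked instance of that estimate, with an illustrative shape:

shape = (1, 128, 49, 240, 360)  # [B, C, T, H, W]
numel = 1
for dim in shape:
    numel *= dim
memory_gib = numel * 2 / 1024**3
print(f"{memory_gib:.2f} GiB")  # ~1.01 GiB -> below the 2 GiB threshold, so no chunking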
58
+ class CogVideoXCausalConv3d(paddle.nn.Layer):
59
+ """A 3D causal convolution layer that pads the input tensor to ensure causality in CogVideoX Model.
60
+
61
+ Args:
62
+ in_channels (`int`): Number of channels in the input tensor.
63
+ out_channels (`int`): Number of output channels produced by the convolution.
64
+ kernel_size (`int` or `Tuple[int, int, int]`): Kernel size of the convolutional kernel.
65
+ stride (`int`, defaults to `1`): Stride of the convolution.
66
+ dilation (`int`, defaults to `1`): Dilation rate of the convolution.
67
+ pad_mode (`str`, defaults to `"constant"`): Padding mode.
68
+ """
69
+
70
+ def __init__(
71
+ self,
72
+ in_channels: int,
73
+ out_channels: int,
74
+ kernel_size: Union[int, Tuple[int, int, int]],
75
+ stride: int = 1,
76
+ dilation: int = 1,
77
+ pad_mode: str = "constant",
78
+ ):
79
+ super().__init__()
80
+ if isinstance(kernel_size, int):
81
+ kernel_size = (kernel_size,) * 3
82
+ time_kernel_size, height_kernel_size, width_kernel_size = kernel_size
83
+ self.pad_mode = pad_mode
84
+ time_pad = dilation * (time_kernel_size - 1) + (1 - stride)
85
+ height_pad = height_kernel_size // 2
86
+ width_pad = width_kernel_size // 2
87
+ self.height_pad = height_pad
88
+ self.width_pad = width_pad
89
+ self.time_pad = time_pad
90
+ self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
91
+ self.temporal_dim = 2
92
+ self.time_kernel_size = time_kernel_size
93
+ stride = stride, 1, 1
94
+ dilation = dilation, 1, 1
95
+ self.conv = CogVideoXSafeConv3d(
96
+ in_channels=in_channels,
97
+ out_channels=out_channels,
98
+ kernel_size=kernel_size,
99
+ stride=stride,
100
+ dilation=dilation,
101
+ )
102
+ self.conv_cache = None
103
+
104
+ def fake_context_parallel_forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
105
+ kernel_size = self.time_kernel_size
106
+ if kernel_size > 1:
107
+ cached_inputs = (
108
+ [self.conv_cache] if self.conv_cache is not None else [inputs[:, :, :1]] * (kernel_size - 1)
109
+ )
110
+ inputs = paddle.concat(x=cached_inputs + [inputs], axis=2)
111
+ return inputs
112
+
113
+ def _clear_fake_context_parallel_cache(self):
114
+ del self.conv_cache
115
+ self.conv_cache = None
116
+
117
+ def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
118
+ inputs = self.fake_context_parallel_forward(inputs)
119
+ self._clear_fake_context_parallel_cache()
120
+ self.conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
121
+ padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad, 0, 0)
122
+ inputs = paddle.nn.functional.pad(x=inputs, pad=padding_2d, mode="constant", value=0, data_format="NCDHW")
123
+ output = self.conv(inputs)
124
+ return output
125
+
126
+
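The padding scheme above is asymmetric in time: height and width get the usual symmetric `kernel // 2` padding, while all `dilation * (kernel_t - 1) + (1 - stride)` pad frames go on the past side, supplied either from `conv_cache` (the tail of the previous chunk) or by repeating the first frame. With the common 3x3x3 kernel:

time_kernel, h_kernel, w_kernel = 3, 3, 3
stride, dilation = 1, 1

time_pad = dilation * (time_kernel - 1) + (1 - stride)  # 2 frames, past side only
height_pad, width_pad = h_kernel // 2, w_kernel // 2    # 1 pixel on each spatial side

print(time_pad, height_pad, width_pad)  # 2 1 1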
127
+ class CogVideoXSpatialNorm3D(paddle.nn.Layer):
128
+ """
129
+ Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002. This implementation is specific
130
+ to 3D-video like data.
131
+
132
+ CogVideoXSafeConv3d is used instead of nn.Conv3d to avoid OOM in CogVideoX Model.
133
+
134
+ Args:
135
+ f_channels (`int`):
136
+ The number of channels for input to group normalization layer, and output of the spatial norm layer.
137
+ zq_channels (`int`):
138
+ The number of channels for the quantized vector as described in the paper.
139
+ groups (`int`):
140
+ Number of groups to separate the channels into for group normalization.
141
+ """
142
+
143
+ def __init__(self, f_channels: int, zq_channels: int, groups: int = 32):
144
+ super().__init__()
145
+ self.norm_layer = paddle.nn.GroupNorm(
146
+ num_channels=f_channels, num_groups=groups, epsilon=1e-06, weight_attr=True, bias_attr=True
147
+ )
148
+ self.conv_y = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
149
+ self.conv_b = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
150
+
151
+ def forward(self, f: paddle.Tensor, zq: paddle.Tensor) -> paddle.Tensor:
152
+ if tuple(f.shape)[2] > 1 and tuple(f.shape)[2] % 2 == 1:
153
+ f_first, f_rest = f[:, :, :1], f[:, :, 1:]
154
+ f_first_size, f_rest_size = tuple(f_first.shape)[-3:], tuple(f_rest.shape)[-3:]
155
+ z_first, z_rest = zq[:, :, :1], zq[:, :, 1:]
156
+ z_first = paddle.nn.functional.interpolate(x=z_first, size=f_first_size)
157
+ z_rest = paddle.nn.functional.interpolate(x=z_rest, size=f_rest_size)
158
+ zq = paddle.concat(x=[z_first, z_rest], axis=2)
159
+ else:
160
+ zq = paddle.nn.functional.interpolate(x=zq, size=tuple(f.shape)[-3:])
161
+ norm_f = self.norm_layer(f)
162
+ new_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
163
+ return new_f
164
+
165
+
166
+ class CogVideoXResnetBlock3D(paddle.nn.Layer):
167
+ """
168
+ A 3D ResNet block used in the CogVideoX model.
169
+
170
+ Args:
171
+ in_channels (`int`):
172
+ Number of input channels.
173
+ out_channels (`int`, *optional*):
174
+ Number of output channels. If None, defaults to `in_channels`.
175
+ dropout (`float`, defaults to `0.0`):
176
+ Dropout rate.
177
+ temb_channels (`int`, defaults to `512`):
178
+ Number of time embedding channels.
179
+ groups (`int`, defaults to `32`):
180
+ Number of groups to separate the channels into for group normalization.
181
+ eps (`float`, defaults to `1e-6`):
182
+ Epsilon value for normalization layers.
183
+ non_linearity (`str`, defaults to `"swish"`):
184
+ Activation function to use.
185
+ conv_shortcut (bool, defaults to `False`):
186
+ Whether or not to use a convolution shortcut.
187
+ spatial_norm_dim (`int`, *optional*):
188
+ The dimension to use for spatial norm if it is to be used instead of group norm.
189
+ pad_mode (str, defaults to `"first"`):
190
+ Padding mode.
191
+ """
192
+
193
+ def __init__(
194
+ self,
195
+ in_channels: int,
196
+ out_channels: Optional[int] = None,
197
+ dropout: float = 0.0,
198
+ temb_channels: int = 512,
199
+ groups: int = 32,
200
+ eps: float = 1e-06,
201
+ non_linearity: str = "swish",
202
+ conv_shortcut: bool = False,
203
+ spatial_norm_dim: Optional[int] = None,
204
+ pad_mode: str = "first",
205
+ ):
206
+ super().__init__()
207
+ out_channels = out_channels or in_channels
208
+ self.in_channels = in_channels
209
+ self.out_channels = out_channels
210
+ self.nonlinearity = get_activation(non_linearity)
211
+ self.use_conv_shortcut = conv_shortcut
212
+ if spatial_norm_dim is None:
213
+ self.norm1 = paddle.nn.GroupNorm(num_channels=in_channels, num_groups=groups, epsilon=eps)
214
+ self.norm2 = paddle.nn.GroupNorm(num_channels=out_channels, num_groups=groups, epsilon=eps)
215
+ else:
216
+ self.norm1 = CogVideoXSpatialNorm3D(f_channels=in_channels, zq_channels=spatial_norm_dim, groups=groups)
217
+ self.norm2 = CogVideoXSpatialNorm3D(f_channels=out_channels, zq_channels=spatial_norm_dim, groups=groups)
218
+ self.conv1 = CogVideoXCausalConv3d(
219
+ in_channels=in_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode
220
+ )
221
+ if temb_channels > 0:
222
+ self.temb_proj = paddle.nn.Linear(in_features=temb_channels, out_features=out_channels)
223
+ self.dropout = paddle.nn.Dropout(p=dropout)
224
+ self.conv2 = CogVideoXCausalConv3d(
225
+ in_channels=out_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode
226
+ )
227
+ if self.in_channels != self.out_channels:
228
+ if self.use_conv_shortcut:
229
+ self.conv_shortcut = CogVideoXCausalConv3d(
230
+ in_channels=in_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode
231
+ )
232
+ else:
233
+ self.conv_shortcut = CogVideoXSafeConv3d(
234
+ in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0
235
+ )
236
+
237
+ def forward(
238
+ self, inputs: paddle.Tensor, temb: Optional[paddle.Tensor] = None, zq: Optional[paddle.Tensor] = None
239
+ ) -> paddle.Tensor:
240
+ hidden_states = inputs
241
+ if zq is not None:
242
+ hidden_states = self.norm1(hidden_states, zq)
243
+ else:
244
+ hidden_states = self.norm1(hidden_states)
245
+ hidden_states = self.nonlinearity(hidden_states)
246
+ hidden_states = self.conv1(hidden_states)
247
+ if temb is not None:
248
+ hidden_states = hidden_states + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]
249
+ if zq is not None:
250
+ hidden_states = self.norm2(hidden_states, zq)
251
+ else:
252
+ hidden_states = self.norm2(hidden_states)
253
+ hidden_states = self.nonlinearity(hidden_states)
254
+ hidden_states = self.dropout(hidden_states)
255
+ hidden_states = self.conv2(hidden_states)
256
+ if self.in_channels != self.out_channels:
257
+ inputs = self.conv_shortcut(inputs)
258
+ hidden_states = hidden_states + inputs
259
+ return hidden_states
260
+
261
+
262
+ class CogVideoXDownBlock3D(paddle.nn.Layer):
263
+ """
264
+ A downsampling block used in the CogVideoX model.
265
+
266
+ Args:
267
+ in_channels (`int`):
268
+ Number of input channels.
269
+ out_channels (`int`, *optional*):
270
+ Number of output channels. If None, defaults to `in_channels`.
271
+ temb_channels (`int`, defaults to `512`):
272
+ Number of time embedding channels.
273
+ num_layers (`int`, defaults to `1`):
274
+ Number of resnet layers.
275
+ dropout (`float`, defaults to `0.0`):
276
+ Dropout rate.
277
+ resnet_eps (`float`, defaults to `1e-6`):
278
+ Epsilon value for normalization layers.
279
+ resnet_act_fn (`str`, defaults to `"swish"`):
280
+ Activation function to use.
281
+ resnet_groups (`int`, defaults to `32`):
282
+ Number of groups to separate the channels into for group normalization.
283
+ add_downsample (`bool`, defaults to `True`):
284
+ Whether or not to use a downsampling layer. If not used, the output dimension is the same as the input dimension.
285
+ compress_time (`bool`, defaults to `False`):
286
+ Whether or not to downsample across temporal dimension.
287
+ pad_mode (str, defaults to `"first"`):
288
+ Padding mode.
289
+ """
290
+
291
+ _supports_gradient_checkpointing = True
292
+
293
+ def __init__(
294
+ self,
295
+ in_channels: int,
296
+ out_channels: int,
297
+ temb_channels: int,
298
+ dropout: float = 0.0,
299
+ num_layers: int = 1,
300
+ resnet_eps: float = 1e-06,
301
+ resnet_act_fn: str = "swish",
302
+ resnet_groups: int = 32,
303
+ add_downsample: bool = True,
304
+ downsample_padding: int = 0,
305
+ compress_time: bool = False,
306
+ pad_mode: str = "first",
307
+ ):
308
+ super().__init__()
309
+ resnets = []
310
+ for i in range(num_layers):
311
+ in_channel = in_channels if i == 0 else out_channels
312
+ resnets.append(
313
+ CogVideoXResnetBlock3D(
314
+ in_channels=in_channel,
315
+ out_channels=out_channels,
316
+ dropout=dropout,
317
+ temb_channels=temb_channels,
318
+ groups=resnet_groups,
319
+ eps=resnet_eps,
320
+ non_linearity=resnet_act_fn,
321
+ pad_mode=pad_mode,
322
+ )
323
+ )
324
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
325
+ self.downsamplers = None
326
+ if add_downsample:
327
+ self.downsamplers = paddle.nn.LayerList(
328
+ sublayers=[
329
+ CogVideoXDownsample3D(
330
+ out_channels, out_channels, padding=downsample_padding, compress_time=compress_time
331
+ )
332
+ ]
333
+ )
334
+ self.gradient_checkpointing = False
335
+
336
+ def forward(
337
+ self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, zq: Optional[paddle.Tensor] = None
338
+ ) -> paddle.Tensor:
339
+ for resnet in self.resnets:
340
+ if self.training and self.gradient_checkpointing:
341
+
342
+ def create_custom_forward(module):
343
+ def create_forward(*inputs):
344
+ return module(*inputs)
345
+
346
+ return create_forward
347
+
348
+ hidden_states = paddle.distributed.fleet.utils.recompute(
349
+ create_custom_forward(resnet), hidden_states, temb, zq
350
+ )
351
+ else:
352
+ hidden_states = resnet(hidden_states, temb, zq)
353
+ if self.downsamplers is not None:
354
+ for downsampler in self.downsamplers:
355
+ hidden_states = downsampler(hidden_states)
356
+ return hidden_states
357
+
358
+
359
+ class CogVideoXMidBlock3D(paddle.nn.Layer):
360
+ """
361
+ A middle block used in the CogVideoX model.
362
+
363
+ Args:
364
+ in_channels (`int`):
365
+ Number of input channels.
366
+ temb_channels (`int`, defaults to `512`):
367
+ Number of time embedding channels.
368
+ dropout (`float`, defaults to `0.0`):
369
+ Dropout rate.
370
+ num_layers (`int`, defaults to `1`):
371
+ Number of resnet layers.
372
+ resnet_eps (`float`, defaults to `1e-6`):
373
+ Epsilon value for normalization layers.
374
+ resnet_act_fn (`str`, defaults to `"swish"`):
375
+ Activation function to use.
376
+ resnet_groups (`int`, defaults to `32`):
377
+ Number of groups to separate the channels into for group normalization.
378
+ spatial_norm_dim (`int`, *optional*):
379
+ The dimension to use for spatial norm if it is to be used instead of group norm.
380
+ pad_mode (str, defaults to `"first"`):
381
+ Padding mode.
382
+ """
383
+
384
+ _supports_gradient_checkpointing = True
385
+
386
+ def __init__(
387
+ self,
388
+ in_channels: int,
389
+ temb_channels: int,
390
+ dropout: float = 0.0,
391
+ num_layers: int = 1,
392
+ resnet_eps: float = 1e-06,
393
+ resnet_act_fn: str = "swish",
394
+ resnet_groups: int = 32,
395
+ spatial_norm_dim: Optional[int] = None,
396
+ pad_mode: str = "first",
397
+ ):
398
+ super().__init__()
399
+ resnets = []
400
+ for _ in range(num_layers):
401
+ resnets.append(
402
+ CogVideoXResnetBlock3D(
403
+ in_channels=in_channels,
404
+ out_channels=in_channels,
405
+ dropout=dropout,
406
+ temb_channels=temb_channels,
407
+ groups=resnet_groups,
408
+ eps=resnet_eps,
409
+ spatial_norm_dim=spatial_norm_dim,
410
+ non_linearity=resnet_act_fn,
411
+ pad_mode=pad_mode,
412
+ )
413
+ )
414
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
415
+ self.gradient_checkpointing = False
416
+
417
+ def forward(
418
+ self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, zq: Optional[paddle.Tensor] = None
419
+ ) -> paddle.Tensor:
420
+ for resnet in self.resnets:
421
+ if self.training and self.gradient_checkpointing:
422
+
423
+ def create_custom_forward(module):
424
+ def create_forward(*inputs):
425
+ return module(*inputs)
426
+
427
+ return create_forward
428
+
429
+ hidden_states = paddle.distributed.fleet.utils.recompute(
430
+ create_custom_forward(resnet), hidden_states, temb, zq
431
+ )
432
+ else:
433
+ hidden_states = resnet(hidden_states, temb, zq)
434
+ return hidden_states
435
+
436
+
437
+ class CogVideoXUpBlock3D(paddle.nn.Layer):
438
+ """
439
+ An upsampling block used in the CogVideoX model.
440
+
441
+ Args:
442
+ in_channels (`int`):
443
+ Number of input channels.
444
+ out_channels (`int`, *optional*):
445
+ Number of output channels. If None, defaults to `in_channels`.
446
+ temb_channels (`int`, defaults to `512`):
447
+ Number of time embedding channels.
448
+ dropout (`float`, defaults to `0.0`):
449
+ Dropout rate.
450
+ num_layers (`int`, defaults to `1`):
451
+ Number of resnet layers.
452
+ resnet_eps (`float`, defaults to `1e-6`):
453
+ Epsilon value for normalization layers.
454
+ resnet_act_fn (`str`, defaults to `"swish"`):
455
+ Activation function to use.
456
+ resnet_groups (`int`, defaults to `32`):
457
+ Number of groups to separate the channels into for group normalization.
458
+ spatial_norm_dim (`int`, defaults to `16`):
459
+ The dimension to use for spatial norm if it is to be used instead of group norm.
460
+ add_upsample (`bool`, defaults to `True`):
461
+ Whether or not to use an upsampling layer. If not used, the output dimension is the same as the input dimension.
462
+ compress_time (`bool`, defaults to `False`):
463
+ Whether or not to upsample across the temporal dimension.
464
+ pad_mode (str, defaults to `"first"`):
465
+ Padding mode.
466
+ """
467
+
468
+ def __init__(
469
+ self,
470
+ in_channels: int,
471
+ out_channels: int,
472
+ temb_channels: int,
473
+ dropout: float = 0.0,
474
+ num_layers: int = 1,
475
+ resnet_eps: float = 1e-06,
476
+ resnet_act_fn: str = "swish",
477
+ resnet_groups: int = 32,
478
+ spatial_norm_dim: int = 16,
479
+ add_upsample: bool = True,
480
+ upsample_padding: int = 1,
481
+ compress_time: bool = False,
482
+ pad_mode: str = "first",
483
+ ):
484
+ super().__init__()
485
+ resnets = []
486
+ for i in range(num_layers):
487
+ in_channel = in_channels if i == 0 else out_channels
488
+ resnets.append(
489
+ CogVideoXResnetBlock3D(
490
+ in_channels=in_channel,
491
+ out_channels=out_channels,
492
+ dropout=dropout,
493
+ temb_channels=temb_channels,
494
+ groups=resnet_groups,
495
+ eps=resnet_eps,
496
+ non_linearity=resnet_act_fn,
497
+ spatial_norm_dim=spatial_norm_dim,
498
+ pad_mode=pad_mode,
499
+ )
500
+ )
501
+ self.resnets = paddle.nn.LayerList(sublayers=resnets)
502
+ self.upsamplers = None
503
+ if add_upsample:
504
+ self.upsamplers = paddle.nn.LayerList(
505
+ sublayers=[
506
+ CogVideoXUpsample3D(
507
+ out_channels, out_channels, padding=upsample_padding, compress_time=compress_time
508
+ )
509
+ ]
510
+ )
511
+ self.gradient_checkpointing = False
512
+
513
+ def forward(
514
+ self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, zq: Optional[paddle.Tensor] = None
515
+ ) -> paddle.Tensor:
516
+ """Forward method of the `CogVideoXUpBlock3D` class."""
517
+ for resnet in self.resnets:
518
+ if self.training and self.gradient_checkpointing:
519
+
520
+ def create_custom_forward(module):
521
+ def create_forward(*inputs):
522
+ return module(*inputs)
523
+
524
+ return create_forward
525
+
526
+ hidden_states = paddle.distributed.fleet.utils.recompute(
527
+ create_custom_forward(resnet), hidden_states, temb, zq
528
+ )
529
+ else:
530
+ hidden_states = resnet(hidden_states, temb, zq)
531
+ if self.upsamplers is not None:
532
+ for upsampler in self.upsamplers:
533
+ hidden_states = upsampler(hidden_states)
534
+ return hidden_states
535
+
536
+
537
+ class CogVideoXEncoder3D(paddle.nn.Layer):
538
+ """
539
+ The `CogVideoXEncoder3D` layer of a variational autoencoder that encodes its input into a latent representation.
540
+
541
+ Args:
542
+ in_channels (`int`, *optional*, defaults to 3):
543
+ The number of input channels.
544
+ out_channels (`int`, *optional*, defaults to 3):
545
+ The number of output channels.
546
+ down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
547
+ The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
548
+ options.
549
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
550
+ The number of output channels for each block.
551
+ act_fn (`str`, *optional*, defaults to `"silu"`):
552
+ The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
553
+ layers_per_block (`int`, *optional*, defaults to 2):
554
+ The number of layers per block.
555
+ norm_num_groups (`int`, *optional*, defaults to 32):
556
+ The number of groups for normalization.
557
+ """
558
+
559
+ _supports_gradient_checkpointing = True
560
+
561
+ def __init__(
562
+ self,
563
+ in_channels: int = 3,
564
+ out_channels: int = 16,
565
+ down_block_types: Tuple[str, ...] = (
566
+ "CogVideoXDownBlock3D",
567
+ "CogVideoXDownBlock3D",
568
+ "CogVideoXDownBlock3D",
569
+ "CogVideoXDownBlock3D",
570
+ ),
571
+ block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
572
+ layers_per_block: int = 3,
573
+ act_fn: str = "silu",
574
+ norm_eps: float = 1e-06,
575
+ norm_num_groups: int = 32,
576
+ dropout: float = 0.0,
577
+ pad_mode: str = "first",
578
+ temporal_compression_ratio: float = 4,
579
+ ):
580
+ super().__init__()
581
+ temporal_compress_level = int(np.log2(temporal_compression_ratio))
582
+ self.conv_in = CogVideoXCausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode)
583
+ self.down_blocks = paddle.nn.LayerList(sublayers=[])
584
+ output_channel = block_out_channels[0]
585
+ for i, down_block_type in enumerate(down_block_types):
586
+ input_channel = output_channel
587
+ output_channel = block_out_channels[i]
588
+ is_final_block = i == len(block_out_channels) - 1
589
+ compress_time = i < temporal_compress_level
590
+ if down_block_type == "CogVideoXDownBlock3D":
591
+ down_block = CogVideoXDownBlock3D(
592
+ in_channels=input_channel,
593
+ out_channels=output_channel,
594
+ temb_channels=0,
595
+ dropout=dropout,
596
+ num_layers=layers_per_block,
597
+ resnet_eps=norm_eps,
598
+ resnet_act_fn=act_fn,
599
+ resnet_groups=norm_num_groups,
600
+ add_downsample=not is_final_block,
601
+ compress_time=compress_time,
602
+ )
603
+ else:
604
+ raise ValueError("Invalid `down_block_type` encountered. Must be `CogVideoXDownBlock3D`")
605
+ self.down_blocks.append(down_block)
606
+ self.mid_block = CogVideoXMidBlock3D(
607
+ in_channels=block_out_channels[-1],
608
+ temb_channels=0,
609
+ dropout=dropout,
610
+ num_layers=2,
611
+ resnet_eps=norm_eps,
612
+ resnet_act_fn=act_fn,
613
+ resnet_groups=norm_num_groups,
614
+ pad_mode=pad_mode,
615
+ )
616
+ self.norm_out = paddle.nn.GroupNorm(
617
+ num_groups=norm_num_groups, num_channels=block_out_channels[-1], epsilon=1e-06
618
+ )
619
+ self.conv_act = paddle.nn.Silu()
620
+ self.conv_out = CogVideoXCausalConv3d(
621
+ block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode
622
+ )
623
+ self.gradient_checkpointing = False
624
+
625
+ def forward(self, sample: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
626
+ """The forward method of the `CogVideoXEncoder3D` class."""
627
+ hidden_states = self.conv_in(sample)
628
+ if self.training and self.gradient_checkpointing:
629
+
630
+ def create_custom_forward(module):
631
+ def custom_forward(*inputs):
632
+ return module(*inputs)
633
+
634
+ return custom_forward
635
+
636
+ for down_block in self.down_blocks:
637
+ hidden_states = paddle.distributed.fleet.utils.recompute(
638
+ create_custom_forward(down_block), hidden_states, temb, None
639
+ )
640
+ hidden_states = paddle.distributed.fleet.utils.recompute(
641
+ create_custom_forward(self.mid_block), hidden_states, temb, None
642
+ )
643
+ else:
644
+ for down_block in self.down_blocks:
645
+ hidden_states = down_block(hidden_states, temb, None)
646
+ hidden_states = self.mid_block(hidden_states, temb, None)
647
+ hidden_states = self.norm_out(hidden_states)
648
+ hidden_states = self.conv_act(hidden_states)
649
+ hidden_states = self.conv_out(hidden_states)
650
+ return hidden_states
651
+
652
+
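With the default configuration above, spatial resolution halves in every down block except the last, and time halves in the first `log2(temporal_compression_ratio)` blocks, so the encoder reduces H and W by 8x and the frame count by 4x (assuming the downsampling modules behave as their names suggest; they are defined outside this file). A small sanity check of those factors:

import numpy as np

block_out_channels = (128, 256, 256, 512)
temporal_compression_ratio = 4

spatial_factor = 2 ** (len(block_out_channels) - 1)              # 8: all but the final block downsample
temporal_factor = 2 ** int(np.log2(temporal_compression_ratio))  # 4: compress_time on the first 2 blocks
print(spatial_factor, temporal_factor)  # 8 4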
653
+ class CogVideoXDecoder3D(paddle.nn.Layer):
654
+ """
655
+ The `CogVideoXDecoder3D` layer of a variational autoencoder that decodes its latent representation into an output
656
+ sample.
657
+
658
+ Args:
659
+ in_channels (`int`, *optional*, defaults to 3):
660
+ The number of input channels.
661
+ out_channels (`int`, *optional*, defaults to 3):
662
+ The number of output channels.
663
+ up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
664
+ The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
665
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
666
+ The number of output channels for each block.
667
+ act_fn (`str`, *optional*, defaults to `"silu"`):
668
+ The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
669
+ layers_per_block (`int`, *optional*, defaults to 2):
670
+ The number of layers per block.
671
+ norm_num_groups (`int`, *optional*, defaults to 32):
672
+ The number of groups for normalization.
673
+ """
674
+
675
+ _supports_gradient_checkpointing = True
676
+
677
+ def __init__(
678
+ self,
679
+ in_channels: int = 16,
680
+ out_channels: int = 3,
681
+ up_block_types: Tuple[str, ...] = (
682
+ "CogVideoXUpBlock3D",
683
+ "CogVideoXUpBlock3D",
684
+ "CogVideoXUpBlock3D",
685
+ "CogVideoXUpBlock3D",
686
+ ),
687
+ block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
688
+ layers_per_block: int = 3,
689
+ act_fn: str = "silu",
690
+ norm_eps: float = 1e-06,
691
+ norm_num_groups: int = 32,
692
+ dropout: float = 0.0,
693
+ pad_mode: str = "first",
694
+ temporal_compression_ratio: float = 4,
695
+ ):
696
+ super().__init__()
697
+ reversed_block_out_channels = list(reversed(block_out_channels))
698
+ self.conv_in = CogVideoXCausalConv3d(
699
+ in_channels, reversed_block_out_channels[0], kernel_size=3, pad_mode=pad_mode
700
+ )
701
+ self.mid_block = CogVideoXMidBlock3D(
702
+ in_channels=reversed_block_out_channels[0],
703
+ temb_channels=0,
704
+ num_layers=2,
705
+ resnet_eps=norm_eps,
706
+ resnet_act_fn=act_fn,
707
+ resnet_groups=norm_num_groups,
708
+ spatial_norm_dim=in_channels,
709
+ pad_mode=pad_mode,
710
+ )
711
+ self.up_blocks = paddle.nn.LayerList(sublayers=[])
712
+ output_channel = reversed_block_out_channels[0]
713
+ temporal_compress_level = int(np.log2(temporal_compression_ratio))
714
+ for i, up_block_type in enumerate(up_block_types):
715
+ prev_output_channel = output_channel
716
+ output_channel = reversed_block_out_channels[i]
717
+ is_final_block = i == len(block_out_channels) - 1
718
+ compress_time = i < temporal_compress_level
719
+ if up_block_type == "CogVideoXUpBlock3D":
720
+ up_block = CogVideoXUpBlock3D(
721
+ in_channels=prev_output_channel,
722
+ out_channels=output_channel,
723
+ temb_channels=0,
724
+ dropout=dropout,
725
+ num_layers=layers_per_block + 1,
726
+ resnet_eps=norm_eps,
727
+ resnet_act_fn=act_fn,
728
+ resnet_groups=norm_num_groups,
729
+ spatial_norm_dim=in_channels,
730
+ add_upsample=not is_final_block,
731
+ compress_time=compress_time,
732
+ pad_mode=pad_mode,
733
+ )
734
+ prev_output_channel = output_channel
735
+ else:
736
+ raise ValueError("Invalid `up_block_type` encountered. Must be `CogVideoXUpBlock3D`")
737
+ self.up_blocks.append(up_block)
738
+ self.norm_out = CogVideoXSpatialNorm3D(reversed_block_out_channels[-1], in_channels, groups=norm_num_groups)
739
+ self.conv_act = paddle.nn.Silu()
740
+ self.conv_out = CogVideoXCausalConv3d(
741
+ reversed_block_out_channels[-1], out_channels, kernel_size=3, pad_mode=pad_mode
742
+ )
743
+ self.gradient_checkpointing = False
744
+
745
+ def forward(self, sample: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
746
+ """The forward method of the `CogVideoXDecoder3D` class."""
747
+ hidden_states = self.conv_in(sample)
748
+ if self.training and self.gradient_checkpointing:
749
+
750
+ def create_custom_forward(module):
751
+ def custom_forward(*inputs):
752
+ return module(*inputs)
753
+
754
+ return custom_forward
755
+
756
+ hidden_states = paddle.distributed.fleet.utils.recompute(
757
+ create_custom_forward(self.mid_block), hidden_states, temb, sample
758
+ )
759
+ for up_block in self.up_blocks:
760
+ hidden_states = paddle.distributed.fleet.utils.recompute(
761
+ create_custom_forward(up_block), hidden_states, temb, sample
762
+ )
763
+ else:
764
+ hidden_states = self.mid_block(hidden_states, temb, sample)
765
+ for up_block in self.up_blocks:
766
+ hidden_states = up_block(hidden_states, temb, sample)
767
+ hidden_states = self.norm_out(hidden_states, sample)
768
+ hidden_states = self.conv_act(hidden_states)
769
+ hidden_states = self.conv_out(hidden_states)
770
+ return hidden_states
771
+
772
+
773
+ class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin):
774
+ """
775
+ A VAE model with KL loss for encoding images into latents and decoding latent representations into images. Used in
776
+ [CogVideoX](https://github.com/THUDM/CogVideo).
777
+
778
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
779
+ for all models (such as downloading or saving).
780
+
781
+ Parameters:
782
+ in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
783
+ out_channels (int, *optional*, defaults to 3): Number of channels in the output.
784
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
785
+ Tuple of downsample block types.
786
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
787
+ Tuple of upsample block types.
788
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
789
+ Tuple of block output channels.
790
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
791
+ sample_size (`int`, *optional*, defaults to `32`): Sample input size.
792
+ scaling_factor (`float`, *optional*, defaults to `1.15258426`):
793
+ The component-wise standard deviation of the trained latent space computed using the first batch of the
794
+ training set. This is used to scale the latent space to have unit variance when training the diffusion
795
+ model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
796
+ diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
797
+ / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
798
+ Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
799
+ force_upcast (`bool`, *optional*, default to `True`):
800
+ If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
801
+ can be fine-tuned / trained to a lower range without losing too much precision, in which case
802
+ `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
803
+ """
804
+
805
+ _supports_gradient_checkpointing = True
806
+ _no_split_modules = ["CogVideoXResnetBlock3D"]
807
+
808
+ @register_to_config
809
+ def __init__(
810
+ self,
811
+ in_channels: int = 3,
812
+ out_channels: int = 3,
813
+ down_block_types: Tuple[str] = (
814
+ "CogVideoXDownBlock3D",
815
+ "CogVideoXDownBlock3D",
816
+ "CogVideoXDownBlock3D",
817
+ "CogVideoXDownBlock3D",
818
+ ),
819
+ up_block_types: Tuple[str] = (
820
+ "CogVideoXUpBlock3D",
821
+ "CogVideoXUpBlock3D",
822
+ "CogVideoXUpBlock3D",
823
+ "CogVideoXUpBlock3D",
824
+ ),
825
+ block_out_channels: Tuple[int] = (128, 256, 256, 512),
826
+ latent_channels: int = 16,
827
+ layers_per_block: int = 3,
828
+ act_fn: str = "silu",
829
+ norm_eps: float = 1e-06,
830
+ norm_num_groups: int = 32,
831
+ temporal_compression_ratio: float = 4,
832
+ sample_height: int = 480,
833
+ sample_width: int = 720,
834
+ scaling_factor: float = 1.15258426,
835
+ shift_factor: Optional[float] = None,
836
+ latents_mean: Optional[Tuple[float]] = None,
837
+ latents_std: Optional[Tuple[float]] = None,
838
+ force_upcast: float = True,
839
+ use_quant_conv: bool = False,
840
+ use_post_quant_conv: bool = False,
841
+ ):
842
+ super().__init__()
843
+ self.encoder = CogVideoXEncoder3D(
844
+ in_channels=in_channels,
845
+ out_channels=latent_channels,
846
+ down_block_types=down_block_types,
847
+ block_out_channels=block_out_channels,
848
+ layers_per_block=layers_per_block,
849
+ act_fn=act_fn,
850
+ norm_eps=norm_eps,
851
+ norm_num_groups=norm_num_groups,
852
+ temporal_compression_ratio=temporal_compression_ratio,
853
+ )
854
+ self.decoder = CogVideoXDecoder3D(
855
+ in_channels=latent_channels,
856
+ out_channels=out_channels,
857
+ up_block_types=up_block_types,
858
+ block_out_channels=block_out_channels,
859
+ layers_per_block=layers_per_block,
860
+ act_fn=act_fn,
861
+ norm_eps=norm_eps,
862
+ norm_num_groups=norm_num_groups,
863
+ temporal_compression_ratio=temporal_compression_ratio,
864
+ )
865
+ self.quant_conv = CogVideoXSafeConv3d(2 * out_channels, 2 * out_channels, 1) if use_quant_conv else None
866
+ self.post_quant_conv = CogVideoXSafeConv3d(out_channels, out_channels, 1) if use_post_quant_conv else None
867
+ self.use_slicing = False
868
+ self.use_tiling = False
869
+ self.num_latent_frames_batch_size = 2
870
+ self.num_sample_frames_batch_size = 8
871
+ self.tile_sample_min_height = sample_height // 2
872
+ self.tile_sample_min_width = sample_width // 2
873
+ self.tile_latent_min_height = int(self.tile_sample_min_height / 2 ** (len(self.config.block_out_channels) - 1))
874
+ self.tile_latent_min_width = int(self.tile_sample_min_width / 2 ** (len(self.config.block_out_channels) - 1))
875
+ self.tile_overlap_factor_height = 1 / 6
876
+ self.tile_overlap_factor_width = 1 / 5
877
+
878
+ def _set_gradient_checkpointing(self, module, value=False):
879
+ if isinstance(module, (CogVideoXEncoder3D, CogVideoXDecoder3D)):
880
+ module.gradient_checkpointing = value
881
+
882
+ def _clear_fake_context_parallel_cache(self):
883
+ for name, module in self.named_sublayers():
884
+ if isinstance(module, CogVideoXCausalConv3d):
885
+ logger.debug(f"Clearing fake Context Parallel cache for layer: {name}")
886
+ module._clear_fake_context_parallel_cache()
887
+
888
+ def enable_tiling(
889
+ self,
890
+ tile_sample_min_height: Optional[int] = None,
891
+ tile_sample_min_width: Optional[int] = None,
892
+ tile_overlap_factor_height: Optional[float] = None,
893
+ tile_overlap_factor_width: Optional[float] = None,
894
+ ) -> None:
895
+ """
896
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
897
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
898
+ processing larger images.
899
+
900
+ Args:
901
+ tile_sample_min_height (`int`, *optional*):
902
+ The minimum height required for a sample to be separated into tiles across the height dimension.
903
+ tile_sample_min_width (`int`, *optional*):
904
+ The minimum width required for a sample to be separated into tiles across the width dimension.
905
+ tile_overlap_factor_height (`int`, *optional*):
906
+ The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
907
+ no tiling artifacts produced across the height dimension. Must be between 0 and 1. Setting a higher
908
+ value might cause more tiles to be processed leading to slow down of the decoding process.
909
+ tile_overlap_factor_width (`int`, *optional*):
910
+ The minimum amount of overlap between two consecutive horizontal tiles. This is to ensure that there
911
+ are no tiling artifacts produced across the width dimension. Must be between 0 and 1. Setting a higher
912
+ value might cause more tiles to be processed leading to slow down of the decoding process.
913
+ """
914
+ self.use_tiling = True
915
+ self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
916
+ self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
917
+ self.tile_latent_min_height = int(self.tile_sample_min_height / 2 ** (len(self.config.block_out_channels) - 1))
918
+ self.tile_latent_min_width = int(self.tile_sample_min_width / 2 ** (len(self.config.block_out_channels) - 1))
919
+ self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height
920
+ self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width
921
+
922
+ def disable_tiling(self) -> None:
923
+ """
924
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
925
+ decoding in one step.
926
+ """
927
+ self.use_tiling = False
928
+
929
+ def enable_slicing(self) -> None:
930
+ """
931
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
932
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
933
+ """
934
+ self.use_slicing = True
935
+
936
+ def disable_slicing(self) -> None:
937
+ """
938
+ Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
939
+ decoding in one step.
940
+ """
941
+ self.use_slicing = False
942
+
943
+ def _encode(self, x: paddle.Tensor) -> paddle.Tensor:
944
+ batch_size, num_channels, num_frames, height, width = tuple(x.shape)
945
+ if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
946
+ return self.tiled_encode(x)
947
+ frame_batch_size = self.num_sample_frames_batch_size
948
+ num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
949
+ enc = []
950
+ for i in range(num_batches):
951
+ remaining_frames = num_frames % frame_batch_size
952
+ start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
953
+ end_frame = frame_batch_size * (i + 1) + remaining_frames
954
+ x_intermediate = x[:, :, start_frame:end_frame]
955
+ x_intermediate = self.encoder(x_intermediate)
956
+ if self.quant_conv is not None:
957
+ x_intermediate = self.quant_conv(x_intermediate)
958
+ enc.append(x_intermediate)
959
+ self._clear_fake_context_parallel_cache()
960
+ enc = paddle.concat(x=enc, axis=2)
961
+ return enc
962
+
963
+ @apply_forward_hook
964
+ def encode(
965
+ self, x: paddle.Tensor, return_dict: bool = True
966
+ ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
967
+ """
968
+ Encode a batch of images into latents.
969
+
970
+ Args:
971
+ x (`paddle.Tensor`): Input batch of images.
972
+ return_dict (`bool`, *optional*, defaults to `True`):
973
+ Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
974
+
975
+ Returns:
976
+ The latent representations of the encoded videos. If `return_dict` is True, a
977
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
978
+ """
979
+ if self.use_slicing and tuple(x.shape)[0] > 1:
980
+ encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
981
+ h = paddle.concat(x=encoded_slices)
982
+ else:
983
+ h = self._encode(x)
984
+
985
+ posterior = DiagonalGaussianDistribution(h)
986
+ if not return_dict:
987
+ return (posterior,)
988
+ return AutoencoderKLOutput(latent_dist=posterior)
989
+
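A usage sketch of the encoding path: the checkpoint id, the `paddle_dtype` keyword, the package-level export, and the input shape below are assumptions for illustration that follow ppdiffusers' usual `from_pretrained` conventions rather than anything pinned down by this diff:

import paddle
from ppdiffusers import AutoencoderKLCogVideoX

# Hypothetical checkpoint; any CogVideoX VAE weights laid out for ppdiffusers would do.
vae = AutoencoderKLCogVideoX.from_pretrained(
    "THUDM/CogVideoX-2b", subfolder="vae", paddle_dtype=paddle.float16
)
vae.enable_tiling()  # optional: much lower peak memory at some speed cost

# [B, C, T, H, W] video in [-1, 1]
video = (paddle.rand([1, 3, 49, 480, 720]) * 2 - 1).cast(paddle.float16)
latents = vae.encode(video).latent_dist.sample() * vae.config.scaling_factor
print(latents.shape)  # roughly [1, 16, 13, 60, 90] with the default config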
990
+ def _decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
991
+ batch_size, num_channels, num_frames, height, width = tuple(z.shape)
992
+ if self.use_tiling and (width > self.tile_latent_min_width or height > self.tile_latent_min_height):
993
+ return self.tiled_decode(z, return_dict=return_dict)
994
+ frame_batch_size = self.num_latent_frames_batch_size
995
+ num_batches = num_frames // frame_batch_size
996
+ dec = []
997
+ for i in range(num_batches):
998
+ remaining_frames = num_frames % frame_batch_size
999
+ start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
1000
+ end_frame = frame_batch_size * (i + 1) + remaining_frames
1001
+ z_intermediate = z[:, :, start_frame:end_frame]
1002
+ if self.post_quant_conv is not None:
1003
+ z_intermediate = self.post_quant_conv(z_intermediate)
1004
+ z_intermediate = self.decoder(z_intermediate)
1005
+ dec.append(z_intermediate)
1006
+ self._clear_fake_context_parallel_cache()
1007
+ dec = paddle.concat(x=dec, axis=2)
1008
+ if not return_dict:
1009
+ return (dec,)
1010
+ return DecoderOutput(sample=dec)
1011
+
1012
+ @apply_forward_hook
1013
+ def decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
1014
+ """
1015
+ Decode a batch of images.
1016
+
1017
+ Args:
1018
+ z (`paddle.Tensor`): Input batch of latent vectors.
1019
+ return_dict (`bool`, *optional*, defaults to `True`):
1020
+ Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
1021
+
1022
+ Returns:
1023
+ [`~models.vae.DecoderOutput`] or `tuple`:
1024
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
1025
+ returned.
1026
+ """
1027
+ if self.use_slicing and tuple(z.shape)[0] > 1:
1028
+ decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
1029
+ decoded = paddle.concat(x=decoded_slices)
1030
+ else:
1031
+ decoded = self._decode(z).sample
1032
+ if not return_dict:
1033
+ return (decoded,)
1034
+ return DecoderOutput(sample=decoded)
1035
+
1036
+ def blend_v(self, a: paddle.Tensor, b: paddle.Tensor, blend_extent: int) -> paddle.Tensor:
1037
+ blend_extent = min(tuple(a.shape)[3], tuple(b.shape)[3], blend_extent)
1038
+ for y in range(blend_extent):
1039
+ b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
1040
+ y / blend_extent
1041
+ )
1042
+ return b
1043
+
1044
+ def blend_h(self, a: paddle.Tensor, b: paddle.Tensor, blend_extent: int) -> paddle.Tensor:
1045
+ blend_extent = min(tuple(a.shape)[4], tuple(b.shape)[4], blend_extent)
1046
+ for x in range(blend_extent):
1047
+ b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
1048
+ x / blend_extent
1049
+ )
1050
+ return b
1051
+
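`blend_v` and `blend_h` above are simple linear cross-fades over `blend_extent` rows or columns of the overlap region: tile `a` fades out while tile `b` fades in. For a blend extent of 4 the per-row weights are:

blend_extent = 4
weights_b = [y / blend_extent for y in range(blend_extent)]  # [0.0, 0.25, 0.5, 0.75]
weights_a = [1 - w for w in weights_b]                       # [1.0, 0.75, 0.5, 0.25]
print(weights_a, weights_b)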
1052
+ def tiled_encode(self, x: paddle.Tensor) -> paddle.Tensor:
1053
+ """Encode a batch of images using a tiled encoder.
1054
+
1055
+ When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
1056
+ steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
1057
+ different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
1058
+ tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
1059
+ output, but they should be much less noticeable.
1060
+
1061
+ Args:
1062
+ x (`paddle.Tensor`): Input batch of videos.
1063
+
1064
+ Returns:
1065
+ `paddle.Tensor`:
1066
+ The latent representation of the encoded videos.
1067
+ """
1068
+ batch_size, num_channels, num_frames, height, width = tuple(x.shape)
1069
+ overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height))
1070
+ overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width))
1071
+ blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height)
1072
+ blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width)
1073
+ row_limit_height = self.tile_latent_min_height - blend_extent_height
1074
+ row_limit_width = self.tile_latent_min_width - blend_extent_width
1075
+ frame_batch_size = self.num_sample_frames_batch_size
1076
+ rows = []
1077
+ for i in range(0, height, overlap_height):
1078
+ row = []
1079
+ for j in range(0, width, overlap_width):
1080
+ num_batches = num_frames // frame_batch_size if num_frames > 1 else 1
1081
+ time = []
1082
+ for k in range(num_batches):
1083
+ remaining_frames = num_frames % frame_batch_size
1084
+ start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
1085
+ end_frame = frame_batch_size * (k + 1) + remaining_frames
1086
+ tile = x[
1087
+ :,
1088
+ :,
1089
+ start_frame:end_frame,
1090
+ i : i + self.tile_sample_min_height,
1091
+ j : j + self.tile_sample_min_width,
1092
+ ]
1093
+ tile = self.encoder(tile)
1094
+ if self.quant_conv is not None:
1095
+ tile = self.quant_conv(tile)
1096
+ time.append(tile)
1097
+ self._clear_fake_context_parallel_cache()
1098
+ row.append(paddle.concat(x=time, axis=2))
1099
+ rows.append(row)
1100
+ result_rows = []
1101
+ for i, row in enumerate(rows):
1102
+ result_row = []
1103
+ for j, tile in enumerate(row):
1104
+ if i > 0:
1105
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height)
1106
+ if j > 0:
1107
+ tile = self.blend_h(row[j - 1], tile, blend_extent_width)
1108
+ result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
1109
+ result_rows.append(paddle.concat(x=result_row, axis=4))
1110
+ enc = paddle.concat(x=result_rows, axis=3)
1111
+ return enc
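
To make the frame batching used by `_decode`/`tiled_encode` above concrete, here is a small standalone sketch of the slice arithmetic (the values 49 and 8 are assumed example defaults, not read from the file): the first batch absorbs the `num_frames % frame_batch_size` leftover frames and every later batch covers exactly `frame_batch_size` frames.

num_frames, frame_batch_size = 49, 8              # assumed example values
remaining_frames = num_frames % frame_batch_size  # 1
num_batches = num_frames // frame_batch_size      # 6
for k in range(num_batches):
    start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
    end_frame = frame_batch_size * (k + 1) + remaining_frames
    print(k, start_frame, end_frame)              # (0, 0, 9), (1, 9, 17), ..., (5, 41, 49)
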
1112
+
1113
+ def tiled_decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
1114
+ """
1115
+ Decode a batch of images using a tiled decoder.
1116
+
1117
+ Args:
1118
+             z (`paddle.Tensor`): Input batch of latent vectors.
1119
+ return_dict (`bool`, *optional*, defaults to `True`):
1120
+ Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
1121
+
1122
+ Returns:
1123
+ [`~models.vae.DecoderOutput`] or `tuple`:
1124
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
1125
+ returned.
1126
+ """
1127
+ batch_size, num_channels, num_frames, height, width = tuple(z.shape)
1128
+ overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height))
1129
+ overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width))
1130
+ blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height)
1131
+ blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width)
1132
+ row_limit_height = self.tile_sample_min_height - blend_extent_height
1133
+ row_limit_width = self.tile_sample_min_width - blend_extent_width
1134
+ frame_batch_size = self.num_latent_frames_batch_size
1135
+ rows = []
1136
+ for i in range(0, height, overlap_height):
1137
+ row = []
1138
+ for j in range(0, width, overlap_width):
1139
+ num_batches = num_frames // frame_batch_size
1140
+ time = []
1141
+ for k in range(num_batches):
1142
+ remaining_frames = num_frames % frame_batch_size
1143
+ start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
1144
+ end_frame = frame_batch_size * (k + 1) + remaining_frames
1145
+ tile = z[
1146
+ :,
1147
+ :,
1148
+ start_frame:end_frame,
1149
+ i : i + self.tile_latent_min_height,
1150
+ j : j + self.tile_latent_min_width,
1151
+ ]
1152
+ if self.post_quant_conv is not None:
1153
+ tile = self.post_quant_conv(tile)
1154
+ tile = self.decoder(tile)
1155
+ time.append(tile)
1156
+ self._clear_fake_context_parallel_cache()
1157
+ row.append(paddle.concat(x=time, axis=2))
1158
+ rows.append(row)
1159
+ result_rows = []
1160
+ for i, row in enumerate(rows):
1161
+ result_row = []
1162
+ for j, tile in enumerate(row):
1163
+ if i > 0:
1164
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height)
1165
+ if j > 0:
1166
+ tile = self.blend_h(row[j - 1], tile, blend_extent_width)
1167
+ result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
1168
+ result_rows.append(paddle.concat(x=result_row, axis=4))
1169
+ dec = paddle.concat(x=result_rows, axis=3)
1170
+ if not return_dict:
1171
+ return (dec,)
1172
+ return DecoderOutput(sample=dec)
1173
+
1174
+ def forward(
1175
+ self,
1176
+ sample: paddle.Tensor,
1177
+ sample_posterior: bool = False,
1178
+ return_dict: bool = True,
1179
+         generator: Optional[paddle.Generator] = None,
1180
+     ) -> Union[DecoderOutput, paddle.Tensor]:
1181
+ x = sample
1182
+ posterior = self.encode(x).latent_dist
1183
+ if sample_posterior:
1184
+ z = posterior.sample(generator=generator)
1185
+ else:
1186
+ z = posterior.mode()
1187
+ dec = self.decode(z)
1188
+ if not return_dict:
1189
+ return (dec,)
1190
+ return dec
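
For orientation, a minimal usage sketch of the VAE defined in this file, assuming it is exported as `AutoencoderKLCogVideoX` and that a real checkpoint path is substituted for the placeholder; the latent layout `(B, C, T, H, W)` follows the 5-D convention used by `_decode`/`tiled_decode` above, and the sizes are only examples.

import paddle
from ppdiffusers import AutoencoderKLCogVideoX  # exported class name assumed

vae = AutoencoderKLCogVideoX.from_pretrained("path/to/cogvideox-vae")  # placeholder path
vae.eval()
latents = paddle.randn([1, vae.config.latent_channels, 13, 60, 90])  # (B, C, T, H, W), example sizes
with paddle.no_grad():
    video = vae.decode(latents).sample  # decoded video; roughly (B, 3, frames, H*8, W*8) for the usual config
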
PaddleMIX/ppdiffusers/ppdiffusers/models/autoencoder_kl_temporal_decoder.py ADDED
@@ -0,0 +1,396 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Dict, Optional, Tuple, Union
15
+
16
+ import paddle
17
+ import paddle.nn as nn
18
+ from paddle.distributed.fleet.utils import recompute
19
+
20
+ from ..configuration_utils import ConfigMixin, register_to_config
21
+ from ..loaders import FromOriginalVAEMixin
22
+ from ..utils import recompute_use_reentrant
23
+ from ..utils.accelerate_utils import apply_forward_hook
24
+ from .attention_processor import (
25
+ CROSS_ATTENTION_PROCESSORS,
26
+ AttentionProcessor,
27
+ AttnProcessor,
28
+ )
29
+ from .modeling_outputs import AutoencoderKLOutput
30
+ from .modeling_utils import ModelMixin
31
+ from .unet_3d_blocks import MidBlockTemporalDecoder, UpBlockTemporalDecoder
32
+ from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder
33
+
34
+
35
+ class TemporalDecoder(nn.Layer):
36
+ def __init__(
37
+ self,
38
+ in_channels: int = 4,
39
+ out_channels: int = 3,
40
+ block_out_channels: Tuple[int] = (128, 256, 512, 512),
41
+ layers_per_block: int = 2,
42
+ ):
43
+ super().__init__()
44
+ self.layers_per_block = layers_per_block
45
+
46
+ self.conv_in = nn.Conv2D(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1)
47
+ self.mid_block = MidBlockTemporalDecoder(
48
+ num_layers=self.layers_per_block,
49
+ in_channels=block_out_channels[-1],
50
+ out_channels=block_out_channels[-1],
51
+ attention_head_dim=block_out_channels[-1],
52
+ )
53
+
54
+ # up
55
+ self.up_blocks = nn.LayerList([])
56
+ reversed_block_out_channels = list(reversed(block_out_channels))
57
+ output_channel = reversed_block_out_channels[0]
58
+ for i in range(len(block_out_channels)):
59
+ prev_output_channel = output_channel
60
+ output_channel = reversed_block_out_channels[i]
61
+
62
+ is_final_block = i == len(block_out_channels) - 1
63
+ up_block = UpBlockTemporalDecoder(
64
+ num_layers=self.layers_per_block + 1,
65
+ in_channels=prev_output_channel,
66
+ out_channels=output_channel,
67
+ add_upsample=not is_final_block,
68
+ )
69
+ self.up_blocks.append(up_block)
70
+ prev_output_channel = output_channel
71
+
72
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, epsilon=1e-6)
73
+
74
+ self.conv_act = nn.Silu()
75
+ self.conv_out = nn.Conv2D(
76
+ in_channels=block_out_channels[0],
77
+ out_channels=out_channels,
78
+ kernel_size=3,
79
+ padding=1,
80
+ )
81
+
82
+ conv_out_kernel_size = (3, 1, 1)
83
+ padding = [int(k // 2) for k in conv_out_kernel_size]
84
+ self.time_conv_out = nn.Conv3D(
85
+ in_channels=out_channels,
86
+ out_channels=out_channels,
87
+ kernel_size=conv_out_kernel_size,
88
+ padding=padding,
89
+ )
90
+
91
+ self.gradient_checkpointing = False
92
+
93
+ def forward(
94
+ self,
95
+ sample: paddle.Tensor,
96
+ image_only_indicator: paddle.Tensor,
97
+ num_frames: int = 1,
98
+ ) -> paddle.Tensor:
99
+ r"""The forward method of the `Decoder` class."""
100
+
101
+ sample = self.conv_in(sample)
102
+
103
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
104
+ if self.gradient_checkpointing and not sample.stop_gradient:
105
+
106
+ def create_custom_forward(module):
107
+ def custom_forward(*inputs):
108
+ return module(*inputs)
109
+
110
+ return custom_forward
111
+
112
+ ckpt_kwargs = {} if recompute_use_reentrant() else {"use_reentrant": False}
113
+ # middle
114
+ sample = recompute(
115
+ create_custom_forward(self.mid_block),
116
+ sample,
117
+ image_only_indicator,
118
+ **ckpt_kwargs,
119
+ )
120
+ sample = sample.cast(upscale_dtype)
121
+
122
+ # up
123
+ for up_block in self.up_blocks:
124
+ sample = recompute(
125
+ create_custom_forward(up_block),
126
+ sample,
127
+ image_only_indicator,
128
+ **ckpt_kwargs,
129
+ )
130
+ else:
131
+ # middle
132
+ sample = self.mid_block(sample, image_only_indicator=image_only_indicator)
133
+ sample = sample.cast(upscale_dtype)
134
+
135
+ # up
136
+ for up_block in self.up_blocks:
137
+ sample = up_block(sample, image_only_indicator=image_only_indicator)
138
+
139
+ # post-process
140
+ sample = self.conv_norm_out(sample)
141
+ sample = self.conv_act(sample)
142
+ sample = self.conv_out(sample)
143
+
144
+ batch_frames, channels, height, width = sample.shape
145
+ batch_size = batch_frames // num_frames
146
+ sample = sample[None, :].reshape([batch_size, num_frames, channels, height, width]).transpose([0, 2, 1, 3, 4])
147
+ sample = self.time_conv_out(sample)
148
+
149
+ sample = sample.transpose([0, 2, 1, 3, 4]).reshape([batch_frames, channels, height, width])
150
+
151
+ return sample
152
+
153
+
154
+ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
155
+ r"""
156
+ A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
157
+
158
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
159
+ for all models (such as downloading or saving).
160
+
161
+ Parameters:
162
+ in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
163
+ out_channels (int, *optional*, defaults to 3): Number of channels in the output.
164
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
165
+ Tuple of downsample block types.
166
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
167
+ Tuple of block output channels.
168
+ layers_per_block: (`int`, *optional*, defaults to 1): Number of layers per block.
169
+ latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
170
+ sample_size (`int`, *optional*, defaults to `32`): Sample input size.
171
+ scaling_factor (`float`, *optional*, defaults to 0.18215):
172
+ The component-wise standard deviation of the trained latent space computed using the first batch of the
173
+ training set. This is used to scale the latent space to have unit variance when training the diffusion
174
+ model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
175
+ diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
176
+ / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
177
+ Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
178
+ force_upcast (`bool`, *optional*, default to `True`):
179
+ If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
180
+ can be fine-tuned / trained to a lower range without loosing too much precision in which case
181
+ `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
182
+ """
183
+
184
+ _supports_gradient_checkpointing = True
185
+
186
+ @register_to_config
187
+ def __init__(
188
+ self,
189
+ in_channels: int = 3,
190
+ out_channels: int = 3,
191
+ down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
192
+ block_out_channels: Tuple[int] = (64,),
193
+ layers_per_block: int = 1,
194
+ latent_channels: int = 4,
195
+ sample_size: int = 32,
196
+ scaling_factor: float = 0.18215,
197
+ force_upcast: float = True,
198
+ ):
199
+ super().__init__()
200
+
201
+ # pass init params to Encoder
202
+ self.encoder = Encoder(
203
+ in_channels=in_channels,
204
+ out_channels=latent_channels,
205
+ down_block_types=down_block_types,
206
+ block_out_channels=block_out_channels,
207
+ layers_per_block=layers_per_block,
208
+ double_z=True,
209
+ )
210
+
211
+ # pass init params to Decoder
212
+ self.decoder = TemporalDecoder(
213
+ in_channels=latent_channels,
214
+ out_channels=out_channels,
215
+ block_out_channels=block_out_channels,
216
+ layers_per_block=layers_per_block,
217
+ )
218
+
219
+ self.quant_conv = nn.Conv2D(2 * latent_channels, 2 * latent_channels, 1)
220
+
221
+ sample_size = (
222
+ self.config.sample_size[0]
223
+ if isinstance(self.config.sample_size, (list, tuple))
224
+ else self.config.sample_size
225
+ )
226
+ self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
227
+ self.tile_overlap_factor = 0.25
228
+
229
+ def _set_gradient_checkpointing(self, module, value=False):
230
+ if isinstance(module, (Encoder, TemporalDecoder)):
231
+ module.gradient_checkpointing = value
232
+
233
+ @property
234
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
235
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
236
+ r"""
237
+ Returns:
238
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
239
+ indexed by its weight name.
240
+ """
241
+ # set recursively
242
+ processors = {}
243
+
244
+ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
245
+ if hasattr(module, "get_processor"):
246
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
247
+
248
+ for sub_name, child in module.named_children():
249
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
250
+
251
+ return processors
252
+
253
+ for name, module in self.named_children():
254
+ fn_recursive_add_processors(name, module, processors)
255
+
256
+ return processors
257
+
258
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
259
+ def set_attn_processor(
260
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
261
+ ):
262
+ r"""
263
+ Sets the attention processor to use to compute attention.
264
+
265
+ Parameters:
266
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
267
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
268
+ for **all** `Attention` layers.
269
+
270
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
271
+ processor. This is strongly recommended when setting trainable attention processors.
272
+
273
+ """
274
+ count = len(self.attn_processors.keys())
275
+
276
+ if isinstance(processor, dict) and len(processor) != count:
277
+ raise ValueError(
278
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
279
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
280
+ )
281
+
282
+ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
283
+ if hasattr(module, "set_processor"):
284
+ if not isinstance(processor, dict):
285
+ module.set_processor(processor, _remove_lora=_remove_lora)
286
+ else:
287
+ module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
288
+
289
+ for sub_name, child in module.named_children():
290
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
291
+
292
+ for name, module in self.named_children():
293
+ fn_recursive_attn_processor(name, module, processor)
294
+
295
+ def set_default_attn_processor(self):
296
+ """
297
+ Disables custom attention processors and sets the default attention implementation.
298
+ """
299
+ if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
300
+ processor = AttnProcessor()
301
+ else:
302
+ raise ValueError(
303
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
304
+ )
305
+
306
+ self.set_attn_processor(processor, _remove_lora=True)
307
+
308
+ @apply_forward_hook
309
+ def encode(
310
+ self, x: paddle.Tensor, return_dict: bool = True
311
+ ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
312
+ """
313
+ Encode a batch of images into latents.
314
+
315
+ Args:
316
+ x (`paddle.Tensor`): Input batch of images.
317
+ return_dict (`bool`, *optional*, defaults to `True`):
318
+ Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
319
+
320
+ Returns:
321
+ The latent representations of the encoded images. If `return_dict` is True, a
322
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
323
+ """
324
+ # TODO junnyu, support float16
325
+ x = x.cast(self.encoder.conv_in.weight.dtype)
326
+ h = self.encoder(x)
327
+ moments = self.quant_conv(h)
328
+ posterior = DiagonalGaussianDistribution(moments)
329
+
330
+ if not return_dict:
331
+ return (posterior,)
332
+
333
+ return AutoencoderKLOutput(latent_dist=posterior)
334
+
335
+ @apply_forward_hook
336
+ def decode(
337
+ self,
338
+ z: paddle.Tensor,
339
+ num_frames: int,
340
+ return_dict: bool = True,
341
+ ) -> Union[DecoderOutput, paddle.Tensor]:
342
+ """
343
+ Decode a batch of images.
344
+
345
+ Args:
346
+ z (`paddle.Tensor`): Input batch of latent vectors.
347
+ return_dict (`bool`, *optional*, defaults to `True`):
348
+ Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
349
+
350
+ Returns:
351
+ [`~models.vae.DecoderOutput`] or `tuple`:
352
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
353
+ returned.
354
+
355
+ """
356
+ # TODO junnyu, add this to support pure fp16
357
+ z = z.cast(self.quant_conv.weight.dtype)
358
+
359
+ batch_size = z.shape[0] // num_frames
360
+ image_only_indicator = paddle.zeros([batch_size, num_frames], dtype=z.dtype)
361
+ decoded = self.decoder(z, num_frames=num_frames, image_only_indicator=image_only_indicator)
362
+
363
+ if not return_dict:
364
+ return (decoded,)
365
+
366
+ return DecoderOutput(sample=decoded)
367
+
368
+ def forward(
369
+ self,
370
+ sample: paddle.Tensor,
371
+ sample_posterior: bool = False,
372
+ return_dict: bool = True,
373
+ generator: Optional[paddle.Generator] = None,
374
+ num_frames: int = 1,
375
+ ) -> Union[DecoderOutput, paddle.Tensor]:
376
+ r"""
377
+ Args:
378
+ sample (`paddle.Tensor`): Input sample.
379
+ sample_posterior (`bool`, *optional*, defaults to `False`):
380
+ Whether to sample from the posterior.
381
+ return_dict (`bool`, *optional*, defaults to `True`):
382
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
383
+ """
384
+ x = sample
385
+ posterior = self.encode(x).latent_dist
386
+ if sample_posterior:
387
+ z = posterior.sample(generator=generator)
388
+ else:
389
+ z = posterior.mode()
390
+
391
+ dec = self.decode(z, num_frames=num_frames).sample
392
+
393
+ if not return_dict:
394
+ return (dec,)
395
+
396
+ return DecoderOutput(sample=dec)
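
As a rough usage sketch (not from the file): this VAE encodes frames independently as a `(batch*frames, C, H, W)` batch, and only the decoder is time-aware via `num_frames`. The checkpoint id below is the SVD layout this class targets, but treat it as an assumption and substitute whatever weights you actually use.

import paddle
from ppdiffusers import AutoencoderKLTemporalDecoder  # exported class name assumed

vae = AutoencoderKLTemporalDecoder.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", subfolder="vae"
)
num_frames = 4
frames = paddle.randn([num_frames, 3, 256, 256])  # (batch*frames, C, H, W)
with paddle.no_grad():
    latents = vae.encode(frames).latent_dist.mode()
    video = vae.decode(latents, num_frames=num_frames).sample  # back to (batch*frames, 3, H, W)
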
PaddleMIX/ppdiffusers/ppdiffusers/models/autoencoder_tiny.py ADDED
@@ -0,0 +1,363 @@
1
+ # Copyright 2023 Ollin Boer Bohan and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from dataclasses import dataclass
17
+ from typing import Optional, Tuple, Union
18
+
19
+ import paddle
20
+
21
+ from ..configuration_utils import ConfigMixin, register_to_config
22
+ from ..utils import BaseOutput
23
+ from ..utils.accelerate_utils import apply_forward_hook
24
+ from .modeling_utils import ModelMixin
25
+ from .vae import DecoderOutput, DecoderTiny, EncoderTiny
26
+
27
+
28
+ @dataclass
29
+ class AutoencoderTinyOutput(BaseOutput):
30
+ """
31
+ Output of AutoencoderTiny encoding method.
32
+
33
+ Args:
34
+ latents (`paddle.Tensor`): Encoded outputs of the `Encoder`.
35
+
36
+ """
37
+
38
+ latents: paddle.Tensor
39
+
40
+
41
+ class AutoencoderTiny(ModelMixin, ConfigMixin):
42
+ r"""
43
+ A tiny distilled VAE model for encoding images into latents and decoding latent representations into images.
44
+
45
+ [`AutoencoderTiny`] is a wrapper around the original implementation of `TAESD`.
46
+
47
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented for
48
+ all models (such as downloading or saving).
49
+
50
+ Parameters:
51
+ in_channels (`int`, *optional*, defaults to 3): Number of channels in the input image.
52
+ out_channels (`int`, *optional*, defaults to 3): Number of channels in the output.
53
+ encoder_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64, 64, 64, 64)`):
54
+ Tuple of integers representing the number of output channels for each encoder block. The length of the
55
+ tuple should be equal to the number of encoder blocks.
56
+ decoder_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64, 64, 64, 64)`):
57
+ Tuple of integers representing the number of output channels for each decoder block. The length of the
58
+ tuple should be equal to the number of decoder blocks.
59
+ act_fn (`str`, *optional*, defaults to `"relu"`):
60
+ Activation function to be used throughout the model.
61
+ latent_channels (`int`, *optional*, defaults to 4):
62
+ Number of channels in the latent representation. The latent space acts as a compressed representation of
63
+ the input image.
64
+ upsampling_scaling_factor (`int`, *optional*, defaults to 2):
65
+ Scaling factor for upsampling in the decoder. It determines the size of the output image during the
66
+ upsampling process.
67
+ num_encoder_blocks (`Tuple[int]`, *optional*, defaults to `(1, 3, 3, 3)`):
68
+ Tuple of integers representing the number of encoder blocks at each stage of the encoding process. The
69
+ length of the tuple should be equal to the number of stages in the encoder. Each stage has a different
70
+ number of encoder blocks.
71
+ num_decoder_blocks (`Tuple[int]`, *optional*, defaults to `(3, 3, 3, 1)`):
72
+ Tuple of integers representing the number of decoder blocks at each stage of the decoding process. The
73
+ length of the tuple should be equal to the number of stages in the decoder. Each stage has a different
74
+ number of decoder blocks.
75
+ latent_magnitude (`float`, *optional*, defaults to 3.0):
76
+ Magnitude of the latent representation. This parameter scales the latent representation values to control
77
+ the extent of information preservation.
78
+ latent_shift (float, *optional*, defaults to 0.5):
79
+ Shift applied to the latent representation. This parameter controls the center of the latent space.
80
+ scaling_factor (`float`, *optional*, defaults to 1.0):
81
+ The component-wise standard deviation of the trained latent space computed using the first batch of the
82
+ training set. This is used to scale the latent space to have unit variance when training the diffusion
83
+ model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
84
+ diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
85
+ / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
86
+ Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. For this Autoencoder,
87
+ however, no such scaling factor was used, hence the value of 1.0 as the default.
88
+ force_upcast (`bool`, *optional*, default to `False`):
89
+ If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
90
+ can be fine-tuned / trained to a lower range without losing too much precision, in which case
91
+ `force_upcast` can be set to `False` (see this fp16-friendly
92
+ [AutoEncoder](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
93
+ """
94
+
95
+ _supports_gradient_checkpointing = True
96
+
97
+ @register_to_config
98
+ def __init__(
99
+ self,
100
+ in_channels: int = 3,
101
+ out_channels: int = 3,
102
+ encoder_block_out_channels: Tuple[int, ...] = (64, 64, 64, 64),
103
+ decoder_block_out_channels: Tuple[int, ...] = (64, 64, 64, 64),
104
+ act_fn: str = "relu",
105
+ latent_channels: int = 4,
106
+ upsampling_scaling_factor: int = 2,
107
+ num_encoder_blocks: Tuple[int, ...] = (1, 3, 3, 3),
108
+ num_decoder_blocks: Tuple[int, ...] = (3, 3, 3, 1),
109
+ latent_magnitude: int = 3,
110
+ latent_shift: float = 0.5,
111
+ force_upcast: bool = False,
112
+ scaling_factor: float = 1.0,
113
+ ):
114
+ super().__init__()
115
+
116
+ if len(encoder_block_out_channels) != len(num_encoder_blocks):
117
+ raise ValueError("`encoder_block_out_channels` should have the same length as `num_encoder_blocks`.")
118
+ if len(decoder_block_out_channels) != len(num_decoder_blocks):
119
+ raise ValueError("`decoder_block_out_channels` should have the same length as `num_decoder_blocks`.")
120
+
121
+ self.encoder = EncoderTiny(
122
+ in_channels=in_channels,
123
+ out_channels=latent_channels,
124
+ num_blocks=num_encoder_blocks,
125
+ block_out_channels=encoder_block_out_channels,
126
+ act_fn=act_fn,
127
+ )
128
+
129
+ self.decoder = DecoderTiny(
130
+ in_channels=latent_channels,
131
+ out_channels=out_channels,
132
+ num_blocks=num_decoder_blocks,
133
+ block_out_channels=decoder_block_out_channels,
134
+ upsampling_scaling_factor=upsampling_scaling_factor,
135
+ act_fn=act_fn,
136
+ )
137
+
138
+ self.latent_magnitude = latent_magnitude
139
+ self.latent_shift = latent_shift
140
+ self.scaling_factor = scaling_factor
141
+
142
+ self.use_slicing = False
143
+ self.use_tiling = False
144
+
145
+ # only relevant if vae tiling is enabled
146
+ self.spatial_scale_factor = 2**out_channels
147
+ self.tile_overlap_factor = 0.125
148
+ self.tile_sample_min_size = 512
149
+ self.tile_latent_min_size = self.tile_sample_min_size // self.spatial_scale_factor
150
+
151
+ self.register_to_config(block_out_channels=decoder_block_out_channels)
152
+ self.register_to_config(force_upcast=False)
153
+
154
+ def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
155
+ if isinstance(module, (EncoderTiny, DecoderTiny)):
156
+ module.gradient_checkpointing = value
157
+
158
+ def scale_latents(self, x: paddle.Tensor) -> paddle.Tensor:
159
+ """raw latents -> [0, 1]"""
160
+ return ((x / 2 * self.latent_magnitude) + self.latent_shift).clip(0, 1)
161
+
162
+ def unscale_latents(self, x: paddle.Tensor) -> paddle.Tensor:
163
+ """[0, 1] -> raw latents"""
164
+ return (x - self.latent_shift) * (2 * self.latent_magnitude)
165
+
166
+ def enable_slicing(self) -> None:
167
+ r"""
168
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
169
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
170
+ """
171
+ self.use_slicing = True
172
+
173
+ def disable_slicing(self) -> None:
174
+ r"""
175
+ Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
176
+ decoding in one step.
177
+ """
178
+ self.use_slicing = False
179
+
180
+ def enable_tiling(self, use_tiling: bool = True) -> None:
181
+ r"""
182
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
183
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
184
+ processing larger images.
185
+ """
186
+ self.use_tiling = use_tiling
187
+
188
+ def disable_tiling(self) -> None:
189
+ r"""
190
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
191
+ decoding in one step.
192
+ """
193
+ self.enable_tiling(False)
194
+
195
+ def _tiled_encode(self, x: paddle.Tensor) -> paddle.Tensor:
196
+ r"""Encode a batch of images using a tiled encoder.
197
+
198
+ When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
199
+ steps. This is useful to keep memory use constant regardless of image size. To avoid tiling artifacts, the
200
+ tiles overlap and are blended together to form a smooth output.
201
+
202
+ Args:
203
+ x (`paddle.Tensor`): Input batch of images.
204
+
205
+ Returns:
206
+ `paddle.Tensor`: Encoded batch of images.
207
+ """
208
+ # scale of encoder output relative to input
209
+ sf = self.spatial_scale_factor
210
+ tile_size = self.tile_sample_min_size
211
+
212
+ # number of pixels to blend and to traverse between tile
213
+ blend_size = int(tile_size * self.tile_overlap_factor)
214
+ traverse_size = tile_size - blend_size
215
+
216
+ # tiles index (up/left)
217
+ ti = range(0, x.shape[-2], traverse_size)
218
+ tj = range(0, x.shape[-1], traverse_size)
219
+
220
+ # mask for blending
221
+ blend_masks = paddle.stack(
222
+ paddle.meshgrid([paddle.arange(tile_size / sf) / (blend_size / sf - 1)] * 2, indexing="ij")
223
+ )
224
+ blend_masks = blend_masks.clip(0, 1)
225
+
226
+ # output array
227
+ out = paddle.zeros([x.shape[0], 4, x.shape[-2] // sf, x.shape[-1] // sf])
228
+ for i in ti:
229
+ for j in tj:
230
+ tile_in = x[..., i : i + tile_size, j : j + tile_size]
231
+ # tile result
232
+ tile_out = out[..., i // sf : (i + tile_size) // sf, j // sf : (j + tile_size) // sf]
233
+ tile = self.encoder(tile_in)
234
+ h, w = tile.shape[-2], tile.shape[-1]
235
+ # blend tile result into output
236
+ blend_mask_i = paddle.ones_like(blend_masks[0]) if i == 0 else blend_masks[0]
237
+ blend_mask_j = paddle.ones_like(blend_masks[1]) if j == 0 else blend_masks[1]
238
+ blend_mask = blend_mask_i * blend_mask_j
239
+ tile, blend_mask = tile[..., :h, :w], blend_mask[..., :h, :w]
240
+
241
+ # NOTE this copy_ method is not work in paddlepaddle
242
+ # tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out)
243
+ out[..., i // sf : (i + tile_size) // sf, j // sf : (j + tile_size) // sf] = (
244
+ blend_mask * tile + (1 - blend_mask) * tile_out
245
+ )
246
+
247
+ return out
248
+
249
+ def _tiled_decode(self, x: paddle.Tensor) -> paddle.Tensor:
250
+ r"""Encode a batch of images using a tiled encoder.
251
+
252
+ When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
253
+ steps. This is useful to keep memory use constant regardless of image size. To avoid tiling artifacts, the
254
+ tiles overlap and are blended together to form a smooth output.
255
+
256
+ Args:
257
+ x (`paddle.Tensor`): Input batch of images.
258
+
259
+ Returns:
260
+ `paddle.Tensor`: Encoded batch of images.
261
+ """
262
+ # scale of decoder output relative to input
263
+ sf = self.spatial_scale_factor
264
+ tile_size = self.tile_latent_min_size
265
+
266
+ # number of pixels to blend and to traverse between tiles
267
+ blend_size = int(tile_size * self.tile_overlap_factor)
268
+ traverse_size = tile_size - blend_size
269
+
270
+ # tiles index (up/left)
271
+ ti = range(0, x.shape[-2], traverse_size)
272
+ tj = range(0, x.shape[-1], traverse_size)
273
+
274
+ # mask for blending
275
+ blend_masks = paddle.stack(
276
+ paddle.meshgrid([paddle.arange(tile_size * sf) / (blend_size * sf - 1)] * 2, indexing="ij")
277
+ )
278
+ blend_masks = blend_masks.clip(0, 1)
279
+
280
+ # output array
281
+ out = paddle.zeros([x.shape[0], 3, x.shape[-2] * sf, x.shape[-1] * sf])
282
+ for i in ti:
283
+ for j in tj:
284
+ tile_in = x[..., i : i + tile_size, j : j + tile_size]
285
+ # tile result
286
+ tile_out = out[..., i * sf : (i + tile_size) * sf, j * sf : (j + tile_size) * sf]
287
+ tile = self.decoder(tile_in)
288
+ h, w = tile.shape[-2], tile.shape[-1]
289
+ # blend tile result into output
290
+ blend_mask_i = paddle.ones_like(blend_masks[0]) if i == 0 else blend_masks[0]
291
+ blend_mask_j = paddle.ones_like(blend_masks[1]) if j == 0 else blend_masks[1]
292
+ blend_mask = (blend_mask_i * blend_mask_j)[..., :h, :w]
293
+
294
+ # NOTE this copy_ method is not work in paddlepaddle
295
+ # tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out)
296
+ out[..., i * sf : (i + tile_size) * sf, j * sf : (j + tile_size) * sf] = (
297
+ blend_mask * tile + (1 - blend_mask) * tile_out
298
+ )
299
+ return out
300
+
301
+ @apply_forward_hook
302
+ def encode(self, x: paddle.Tensor, return_dict: bool = True) -> Union[AutoencoderTinyOutput, Tuple[paddle.Tensor]]:
303
+ # TODO junnyu, support float16
304
+ x = x.cast(self.encoder.layers[0].weight.dtype)
305
+ if self.use_slicing and x.shape[0] > 1:
306
+ output = [
307
+ self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x) for x_slice in x.chunk(x.shape[0])
308
+ ]
309
+ output = paddle.concat(output)
310
+ else:
311
+ output = self._tiled_encode(x) if self.use_tiling else self.encoder(x)
312
+
313
+ if not return_dict:
314
+ return (output,)
315
+
316
+ return AutoencoderTinyOutput(latents=output)
317
+
318
+ @apply_forward_hook
319
+ def decode(
320
+ self, x: paddle.Tensor, generator: Optional[paddle.Generator] = None, return_dict: bool = True
321
+ ) -> Union[DecoderOutput, Tuple[paddle.Tensor]]:
322
+ # TODO junnyu, add this to support pure fp16
323
+ x = x.cast(self.decoder.layers[0].weight.dtype)
324
+
325
+ if self.use_slicing and x.shape[0] > 1:
326
+ output = [
327
+ self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.chunk(x.shape[0])
328
+ ]
329
+ output = paddle.concat(output)
330
+ else:
331
+ output = self._tiled_decode(x) if self.use_tiling else self.decoder(x)
332
+
333
+ if not return_dict:
334
+ return (output,)
335
+
336
+ return DecoderOutput(sample=output)
337
+
338
+ def forward(
339
+ self,
340
+ sample: paddle.Tensor,
341
+ return_dict: bool = True,
342
+ ) -> Union[DecoderOutput, Tuple[paddle.Tensor]]:
343
+ r"""
344
+ Args:
345
+ sample (`paddle.Tensor`): Input sample.
346
+ return_dict (`bool`, *optional*, defaults to `True`):
347
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
348
+ """
349
+ enc = self.encode(sample).latents
350
+
351
+ # scale latents to be in [0, 1], then quantize latents to a byte tensor,
352
+ # as if we were storing the latents in an RGBA uint8 image.
353
+ scaled_enc = (self.scale_latents(enc) * 255).round().cast("byte")
354
+
355
+ # unquantize latents back into [0, 1], then unscale latents back to their original range,
356
+ # as if we were loading the latents from an RGBA uint8 image.
357
+ unscaled_enc = self.unscale_latents(scaled_enc / 255.0)
358
+
359
+ dec = self.decode(unscaled_enc)
360
+
361
+ if not return_dict:
362
+ return (dec,)
363
+ return DecoderOutput(sample=dec)
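
A hedged usage sketch of the round-trip that `forward` performs above: encode, pack the latents into `uint8` as if they were an RGBA image, unpack, and decode. The `madebyollin/taesd` checkpoint id is assumed to be loadable here; replace it if your setup differs.

import paddle
from ppdiffusers import AutoencoderTiny  # exported class name assumed

vae = AutoencoderTiny.from_pretrained("madebyollin/taesd")  # assumed-compatible TAESD weights
img = paddle.rand([1, 3, 512, 512])
with paddle.no_grad():
    latents = vae.encode(img).latents
    packed = (vae.scale_latents(latents) * 255).round().cast("uint8")  # store like RGBA bytes
    restored = vae.unscale_latents(packed.cast("float32") / 255.0)
    recon = vae.decode(restored).sample  # (1, 3, 512, 512)
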
PaddleMIX/ppdiffusers/ppdiffusers/models/cogvideox_transformer_3d.py ADDED
@@ -0,0 +1,394 @@
1
+ import paddle
2
+ from typing import Any, Dict, Optional, Tuple, Union
3
+ from ..configuration_utils import ConfigMixin, register_to_config
4
+ from ..utils import logging
5
+ from ..utils.paddle_utils import maybe_allow_in_graph
6
+ from .attention import Attention, FeedForward
7
+ from .attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
8
+ from .embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
9
+ from .modeling_outputs import Transformer2DModelOutput
10
+ from .modeling_utils import ModelMixin
11
+ from .normalization import AdaLayerNorm, CogVideoXLayerNormZero
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
+ @maybe_allow_in_graph
16
+ class CogVideoXBlock(paddle.nn.Layer):
17
+ """
18
+ Transformer block used in [CogVideoX](https://github.com/THUDM/CogVideo) model.
19
+
20
+ Parameters:
21
+ dim (`int`):
22
+ The number of channels in the input and output.
23
+ num_attention_heads (`int`):
24
+ The number of heads to use for multi-head attention.
25
+ attention_head_dim (`int`):
26
+ The number of channels in each head.
27
+ time_embed_dim (`int`):
28
+ The number of channels in timestep embedding.
29
+ dropout (`float`, defaults to `0.0`):
30
+ The dropout probability to use.
31
+ activation_fn (`str`, defaults to `"gelu-approximate"`):
32
+ Activation function to be used in feed-forward.
33
+ attention_bias (`bool`, defaults to `False`):
34
+ Whether or not to use bias in attention projection layers.
35
+ qk_norm (`bool`, defaults to `True`):
36
+ Whether or not to use normalization after query and key projections in Attention.
37
+ norm_elementwise_affine (`bool`, defaults to `True`):
38
+ Whether to use learnable elementwise affine parameters for normalization.
39
+ norm_eps (`float`, defaults to `1e-5`):
40
+ Epsilon value for normalization layers.
41
+         final_dropout (`bool`, defaults to `True`):
42
+ Whether to apply a final dropout after the last feed-forward layer.
43
+ ff_inner_dim (`int`, *optional*, defaults to `None`):
44
+ Custom hidden dimension of Feed-forward layer. If not provided, `4 * dim` is used.
45
+ ff_bias (`bool`, defaults to `True`):
46
+ Whether or not to use bias in Feed-forward layer.
47
+ attention_out_bias (`bool`, defaults to `True`):
48
+ Whether or not to use bias in Attention output projection layer.
49
+ """
50
+
51
+ def __init__(
52
+ self,
53
+ dim: int,
54
+ num_attention_heads: int,
55
+ attention_head_dim: int,
56
+ time_embed_dim: int,
57
+ dropout: float=0.0,
58
+ activation_fn: str='gelu-approximate',
59
+ attention_bias: bool=False,
60
+ qk_norm: bool=True,
61
+ norm_elementwise_affine: bool=True,
62
+ norm_eps: float=1e-05,
63
+ final_dropout: bool=True,
64
+ ff_inner_dim: Optional[int]=None,
65
+ ff_bias: bool=True,
66
+ attention_out_bias: bool=True
67
+ ):
68
+ super().__init__()
69
+
70
+ # 1. self attention
71
+ self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
72
+
73
+ self.attn1 = Attention(
74
+ query_dim=dim,
75
+ dim_head=attention_head_dim,
76
+ heads=num_attention_heads,
77
+ qk_norm='layer_norm' if qk_norm else None,
78
+ eps=1e-06,
79
+ bias=attention_bias,
80
+ out_bias=attention_out_bias,
81
+ processor=CogVideoXAttnProcessor2_0()
82
+ )
83
+
84
+ # 2. feed forward
85
+ self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
86
+ self.ff = FeedForward(
87
+ dim,
88
+ dropout=dropout,
89
+ activation_fn=activation_fn,
90
+ final_dropout=final_dropout,
91
+ inner_dim=ff_inner_dim,
92
+ bias=ff_bias
93
+ )
94
+
95
+ def forward(
96
+ self,
97
+ hidden_states: paddle.Tensor,
98
+         encoder_hidden_states: paddle.Tensor,
99
+ temb: paddle.Tensor,
100
+ image_rotary_emb: Optional[Tuple[paddle.Tensor, paddle.Tensor]]=None
101
+     ) -> paddle.Tensor:
102
+ text_seq_length = encoder_hidden_states.shape[1]
103
+
104
+ # norm and modulate
105
+ norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1(
106
+ hidden_states, encoder_hidden_states, temb
107
+ )
108
+
109
+ # attention
110
+ attn_hidden_states, attn_encoder_hidden_states = self.attn1(
111
+ hidden_states=norm_hidden_states,
112
+ encoder_hidden_states=norm_encoder_hidden_states,
113
+ image_rotary_emb=image_rotary_emb
114
+ )
115
+
116
+ hidden_states = hidden_states + gate_msa * attn_hidden_states
117
+ encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states
118
+
119
+ # norm and modulate
120
+ norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2(
121
+ hidden_states, encoder_hidden_states, temb
122
+ )
123
+
124
+ # feed forward
125
+ norm_hidden_states = paddle.concat([norm_encoder_hidden_states, norm_hidden_states], axis=1)
126
+ ff_output = self.ff(norm_hidden_states)
127
+
128
+ hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
129
+ encoder_hidden_states = (encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length])
130
+
131
+ return hidden_states, encoder_hidden_states
132
+
133
+
134
+ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
135
+ """
136
+ A Transformer model for video-like data in [CogVideoX](https://github.com/THUDM/CogVideo).
137
+
138
+ Parameters:
139
+ num_attention_heads (`int`, defaults to `30`):
140
+ The number of heads to use for multi-head attention.
141
+ attention_head_dim (`int`, defaults to `64`):
142
+ The number of channels in each head.
143
+ in_channels (`int`, defaults to `16`):
144
+ The number of channels in the input.
145
+ out_channels (`int`, *optional*, defaults to `16`):
146
+ The number of channels in the output.
147
+ flip_sin_to_cos (`bool`, defaults to `True`):
148
+ Whether to flip the sin to cos in the time embedding.
149
+ time_embed_dim (`int`, defaults to `512`):
150
+ Output dimension of timestep embeddings.
151
+ text_embed_dim (`int`, defaults to `4096`):
152
+ Input dimension of text embeddings from the text encoder.
153
+ num_layers (`int`, defaults to `30`):
154
+ The number of layers of Transformer blocks to use.
155
+ dropout (`float`, defaults to `0.0`):
156
+ The dropout probability to use.
157
+ attention_bias (`bool`, defaults to `True`):
158
+ Whether or not to use bias in the attention projection layers.
159
+ sample_width (`int`, defaults to `90`):
160
+ The width of the input latents.
161
+ sample_height (`int`, defaults to `60`):
162
+ The height of the input latents.
163
+ sample_frames (`int`, defaults to `49`):
164
+ The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49
165
+ instead of 13 because CogVideoX processed 13 latent frames at once in its default and recommended settings,
166
+ but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with
167
+ K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1).
168
+ patch_size (`int`, defaults to `2`):
169
+ The size of the patches to use in the patch embedding layer.
170
+ temporal_compression_ratio (`int`, defaults to `4`):
171
+ The compression ratio across the temporal dimension. See documentation for `sample_frames`.
172
+ max_text_seq_length (`int`, defaults to `226`):
173
+ The maximum sequence length of the input text embeddings.
174
+ activation_fn (`str`, defaults to `"gelu-approximate"`):
175
+ Activation function to use in feed-forward.
176
+ timestep_activation_fn (`str`, defaults to `"silu"`):
177
+ Activation function to use when generating the timestep embeddings.
178
+ norm_elementwise_affine (`bool`, defaults to `True`):
179
+ Whether or not to use elementwise affine in normalization layers.
180
+ norm_eps (`float`, defaults to `1e-5`):
181
+ The epsilon value to use in normalization layers.
182
+ spatial_interpolation_scale (`float`, defaults to `1.875`):
183
+ Scaling factor to apply in 3D positional embeddings across spatial dimensions.
184
+ temporal_interpolation_scale (`float`, defaults to `1.0`):
185
+ Scaling factor to apply in 3D positional embeddings across temporal dimensions.
186
+ """
187
+ _supports_gradient_checkpointing = True
188
+
189
+ @register_to_config
190
+ def __init__(self, num_attention_heads: int=30, attention_head_dim: int
191
+ =64, in_channels: int=16, out_channels: Optional[int]=16,
192
+ flip_sin_to_cos: bool=True, freq_shift: int=0, time_embed_dim: int=
193
+ 512, text_embed_dim: int=4096, num_layers: int=30, dropout: float=
194
+ 0.0, attention_bias: bool=True, sample_width: int=90, sample_height:
195
+ int=60, sample_frames: int=49, patch_size: int=2,
196
+ temporal_compression_ratio: int=4, max_text_seq_length: int=226,
197
+ activation_fn: str='gelu-approximate', timestep_activation_fn: str=
198
+ 'silu', norm_elementwise_affine: bool=True, norm_eps: float=1e-05,
199
+ spatial_interpolation_scale: float=1.875,
200
+ temporal_interpolation_scale: float=1.0,
201
+ use_rotary_positional_embeddings: bool=False,
202
+ use_learned_positional_embeddings: bool=False):
203
+ super().__init__()
204
+ inner_dim = num_attention_heads * attention_head_dim
205
+ if (not use_rotary_positional_embeddings and
206
+ use_learned_positional_embeddings):
207
+ raise ValueError(
208
+                 "There are no CogVideoX checkpoints available with rotary embeddings disabled and learned positional embeddings enabled. If you're using a custom model and/or believe this should be supported, please open an issue at https://github.com/huggingface/diffusers/issues."
209
+ )
210
+ self.patch_embed = CogVideoXPatchEmbed(patch_size=patch_size,
211
+ in_channels=in_channels, embed_dim=inner_dim, text_embed_dim=
212
+ text_embed_dim, bias=True, sample_width=sample_width,
213
+ sample_height=sample_height, sample_frames=sample_frames,
214
+ temporal_compression_ratio=temporal_compression_ratio,
215
+ max_text_seq_length=max_text_seq_length,
216
+ spatial_interpolation_scale=spatial_interpolation_scale,
217
+ temporal_interpolation_scale=temporal_interpolation_scale,
218
+ use_positional_embeddings=not use_rotary_positional_embeddings,
219
+ use_learned_positional_embeddings=use_learned_positional_embeddings
220
+ )
221
+ self.embedding_dropout = paddle.nn.Dropout(p=dropout)
222
+ self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
223
+ self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim,
224
+ timestep_activation_fn)
225
+ self.transformer_blocks = paddle.nn.LayerList(sublayers=[
226
+ CogVideoXBlock(dim=inner_dim, num_attention_heads=
227
+ num_attention_heads, attention_head_dim=attention_head_dim,
228
+ time_embed_dim=time_embed_dim, dropout=dropout, activation_fn=
229
+ activation_fn, attention_bias=attention_bias,
230
+ norm_elementwise_affine=norm_elementwise_affine, norm_eps=
231
+ norm_eps) for _ in range(num_layers)])
232
+ self.norm_final = paddle.nn.LayerNorm(normalized_shape=inner_dim,
233
+ epsilon=norm_eps, weight_attr=norm_elementwise_affine,
234
+ bias_attr=norm_elementwise_affine)
235
+ self.norm_out = AdaLayerNorm(embedding_dim=time_embed_dim,
236
+ output_dim=2 * inner_dim, norm_elementwise_affine=
237
+ norm_elementwise_affine, norm_eps=norm_eps, chunk_dim=1)
238
+ self.proj_out = paddle.nn.Linear(in_features=inner_dim,
239
+ out_features=patch_size * patch_size * out_channels)
240
+ self.gradient_checkpointing = False
241
+
242
+ def _set_gradient_checkpointing(self, module, value=False):
243
+ self.gradient_checkpointing = value
244
+
245
+ @property
246
+     def attn_processors(self) -> Dict[str, AttentionProcessor]:
247
+ """
248
+ Returns:
249
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
250
+ indexed by its weight name.
251
+ """
252
+ processors = {}
253
+
254
+ def fn_recursive_add_processors(name: str, module: paddle.nn.Layer,
255
+ processors: Dict[str, AttentionProcessor]):
256
+ if hasattr(module, 'get_processor'):
257
+ processors[f'{name}.processor'] = module.get_processor()
258
+ for sub_name, child in module.named_children():
259
+ fn_recursive_add_processors(f'{name}.{sub_name}', child,
260
+ processors)
261
+ return processors
262
+ for name, module in self.named_children():
263
+ fn_recursive_add_processors(name, module, processors)
264
+ return processors
265
+
266
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[
267
+ str, AttentionProcessor]]):
268
+ """
269
+ Sets the attention processor to use to compute attention.
270
+
271
+ Parameters:
272
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
273
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
274
+ for **all** `Attention` layers.
275
+
276
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
277
+ processor. This is strongly recommended when setting trainable attention processors.
278
+
279
+ """
280
+ count = len(self.attn_processors.keys())
281
+ if isinstance(processor, dict) and len(processor) != count:
282
+ raise ValueError(
283
+ f'A dict of processors was passed, but the number of processors {len(processor)} does not match the number of attention layers: {count}. Please make sure to pass {count} processor classes.'
284
+ )
285
+
286
+ def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer,
287
+ processor):
288
+ if hasattr(module, 'set_processor'):
289
+ if not isinstance(processor, dict):
290
+ module.set_processor(processor)
291
+ else:
292
+ module.set_processor(processor.pop(f'{name}.processor'))
293
+ for sub_name, child in module.named_children():
294
+ fn_recursive_attn_processor(f'{name}.{sub_name}', child,
295
+ processor)
296
+ for name, module in self.named_children():
297
+ fn_recursive_attn_processor(name, module, processor)
298
+
299
+ def fuse_qkv_projections(self):
300
+ """
301
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
302
+ are fused. For cross-attention modules, key and value projection matrices are fused.
303
+
304
+ <Tip warning={true}>
305
+
306
+         This API is 🧪 experimental.
307
+
308
+ </Tip>
309
+ """
310
+ self.original_attn_processors = None
311
+ for _, attn_processor in self.attn_processors.items():
312
+ if 'Added' in str(attn_processor.__class__.__name__):
313
+ raise ValueError(
314
+ '`fuse_qkv_projections()` is not supported for models having added KV projections.'
315
+ )
316
+ self.original_attn_processors = self.attn_processors
317
+ for module in self.sublayers():
318
+ if isinstance(module, Attention):
319
+ module.fuse_projections(fuse=True)
320
+ self.set_attn_processor(FusedCogVideoXAttnProcessor2_0())
321
+
322
+ def unfuse_qkv_projections(self):
323
+ """Disables the fused QKV projection if enabled.
324
+
325
+ <Tip warning={true}>
326
+
327
+ This API is 🧪 experimental.
328
+
329
+ </Tip>
330
+
331
+ """
332
+ if self.original_attn_processors is not None:
333
+ self.set_attn_processor(self.original_attn_processors)
334
+
335
+ def forward(
336
+ self,
337
+ hidden_states: paddle.Tensor,
338
+ encoder_hidden_states: paddle.Tensor,
339
+ timestep: Union[int, float, paddle.Tensor],
340
+ timestep_cond: Optional[paddle.Tensor]=None,
341
+ image_rotary_emb: Optional[Tuple[paddle.Tensor, paddle.Tensor]]=None,
342
+ return_dict: bool=True
343
+ ):
344
+ batch_size, num_frames, channels, height, width = hidden_states.shape
345
+
346
+ # 1. Time embedding
347
+ timesteps = timestep
348
+ t_emb = self.time_proj(timesteps)
349
+ t_emb = t_emb.cast(hidden_states.dtype)
350
+ emb = self.time_embedding(t_emb, timestep_cond)
351
+
352
+ # 2. Patch embedding
353
+ hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
354
+ hidden_states = self.embedding_dropout(hidden_states)
355
+
356
+ text_seq_length = tuple(encoder_hidden_states.shape)[1]
357
+ encoder_hidden_states = hidden_states[:, :text_seq_length]
358
+ hidden_states = hidden_states[:, text_seq_length:]
359
+
360
+ # 3. Transformer blocks
361
+ for i, block in enumerate(self.transformer_blocks):
362
+ if self.training and self.gradient_checkpointing:
363
+ raise NotImplementedError
364
+ else:
365
+ hidden_states, encoder_hidden_states = block(
366
+ hidden_states=hidden_states,
367
+ encoder_hidden_states=encoder_hidden_states,
368
+ temb=emb,
369
+ image_rotary_emb=image_rotary_emb
370
+ )
371
+ # print("hidden_states:", hidden_states.abs().mean().item(), hidden_states.min().item(), hidden_states.max().item())
372
+ # print("encoder_hidden_states:", encoder_hidden_states.abs().mean().item(), encoder_hidden_states.min().item(), encoder_hidden_states.max().item())
373
+
374
+ if not self.config.use_rotary_positional_embeddings:
375
+ # 2B
376
+ hidden_states = self.norm_final(hidden_states)
377
+ else:
378
+ # 5B
379
+ hidden_states = paddle.concat(x=[encoder_hidden_states,
380
+ hidden_states], axis=1)
381
+ hidden_states = self.norm_final(hidden_states)
382
+ hidden_states = hidden_states[:, text_seq_length:]
383
+
384
+ # 4. Final block
385
+ hidden_states = self.norm_out(hidden_states, temb=emb)
386
+ hidden_states = self.proj_out(hidden_states)
387
+
388
+ # 5. Unpatchify
389
+ p = self.config.patch_size
390
+ output = hidden_states.reshape([batch_size, num_frames, height // p, width // p, -1, p, p])
391
+ output = output.transpose(perm=[0, 1, 4, 2, 5, 3, 6]).flatten(5, 6).flatten(3, 4)
392
+ if not return_dict:
393
+             return (output,)
394
+ return Transformer2DModelOutput(sample=output)
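
To clarify the unpatchify step at the end of `forward`, here is a shape-only sketch on random data (all sizes are assumed examples): the projected tokens are reshaped back into per-frame patch grids, and the two `flatten` calls stitch the `p x p` patches into full spatial maps.

import paddle

batch, frames, height, width, p, out_ch = 1, 13, 60, 90, 2, 16
proj = paddle.randn([batch, frames * (height // p) * (width // p), p * p * out_ch])  # proj_out output
x = proj.reshape([batch, frames, height // p, width // p, -1, p, p])
x = x.transpose([0, 1, 4, 2, 5, 3, 6])   # (B, T, C, H/p, p, W/p, p)
video = x.flatten(5, 6).flatten(3, 4)    # (B, T, C, H, W)
assert tuple(video.shape) == (batch, frames, out_ch, height, width)
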
PaddleMIX/ppdiffusers/ppdiffusers/models/consistency_decoder_vae.py ADDED
@@ -0,0 +1,445 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Dict, Optional, Tuple, Union
16
+
17
+ import paddle
18
+ import paddle.nn.functional as F
19
+ from paddle import nn
20
+
21
+ from ..configuration_utils import ConfigMixin, register_to_config
22
+ from ..schedulers import ConsistencyDecoderScheduler
23
+ from ..utils import BaseOutput
24
+ from ..utils.accelerate_utils import apply_forward_hook
25
+ from ..utils.paddle_utils import randn_tensor
26
+ from .attention_processor import (
27
+ ADDED_KV_ATTENTION_PROCESSORS,
28
+ CROSS_ATTENTION_PROCESSORS,
29
+ AttentionProcessor,
30
+ AttnAddedKVProcessor,
31
+ AttnProcessor,
32
+ )
33
+ from .modeling_utils import ModelMixin
34
+ from .unet_2d import UNet2DModel
35
+ from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder
36
+
37
+
38
+ @dataclass
39
+ class ConsistencyDecoderVAEOutput(BaseOutput):
40
+ """
41
+ Output of encoding method.
42
+
43
+ Args:
44
+ latent_dist (`DiagonalGaussianDistribution`):
45
+ Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
46
+ `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
47
+ """
48
+
49
+ latent_dist: "DiagonalGaussianDistribution"
50
+
51
+
52
+ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
53
+ r"""
54
+ The consistency decoder used with DALL-E 3.
55
+
56
+ Examples:
57
+ ```py
58
+ >>> import paddle
59
+ >>> from ppdiffusers import StableDiffusionPipeline, ConsistencyDecoderVAE
60
+
61
+ >>> vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", paddle_dtype=paddle.float16)
62
+ >>> pipe = StableDiffusionPipeline.from_pretrained(
63
+ ... "runwayml/stable-diffusion-v1-5", vae=vae, paddle_dtype=paddle.float16
64
+ ... )
65
+
66
+ >>> pipe("horse", generator=paddle.Generator().manual_seed(0)).images
67
+ ```
68
+ """
69
+
70
+ @register_to_config
71
+ def __init__(
72
+ self,
73
+ scaling_factor: float = 0.18215,
74
+ latent_channels: int = 4,
75
+ encoder_act_fn: str = "silu",
76
+ encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
77
+ encoder_double_z: bool = True,
78
+ encoder_down_block_types: Tuple[str, ...] = (
79
+ "DownEncoderBlock2D",
80
+ "DownEncoderBlock2D",
81
+ "DownEncoderBlock2D",
82
+ "DownEncoderBlock2D",
83
+ ),
84
+ encoder_in_channels: int = 3,
85
+ encoder_layers_per_block: int = 2,
86
+ encoder_norm_num_groups: int = 32,
87
+ encoder_out_channels: int = 4,
88
+ decoder_add_attention: bool = False,
89
+ decoder_block_out_channels: Tuple[int, ...] = (320, 640, 1024, 1024),
90
+ decoder_down_block_types: Tuple[str, ...] = (
91
+ "ResnetDownsampleBlock2D",
92
+ "ResnetDownsampleBlock2D",
93
+ "ResnetDownsampleBlock2D",
94
+ "ResnetDownsampleBlock2D",
95
+ ),
96
+ decoder_downsample_padding: int = 1,
97
+ decoder_in_channels: int = 7,
98
+ decoder_layers_per_block: int = 3,
99
+ decoder_norm_eps: float = 1e-05,
100
+ decoder_norm_num_groups: int = 32,
101
+ decoder_num_train_timesteps: int = 1024,
102
+ decoder_out_channels: int = 6,
103
+ decoder_resnet_time_scale_shift: str = "scale_shift",
104
+ decoder_time_embedding_type: str = "learned",
105
+ decoder_up_block_types: Tuple[str, ...] = (
106
+ "ResnetUpsampleBlock2D",
107
+ "ResnetUpsampleBlock2D",
108
+ "ResnetUpsampleBlock2D",
109
+ "ResnetUpsampleBlock2D",
110
+ ),
111
+ ):
112
+ super().__init__()
113
+ self.encoder = Encoder(
114
+ act_fn=encoder_act_fn,
115
+ block_out_channels=encoder_block_out_channels,
116
+ double_z=encoder_double_z,
117
+ down_block_types=encoder_down_block_types,
118
+ in_channels=encoder_in_channels,
119
+ layers_per_block=encoder_layers_per_block,
120
+ norm_num_groups=encoder_norm_num_groups,
121
+ out_channels=encoder_out_channels,
122
+ )
123
+
124
+ self.decoder_unet = UNet2DModel(
125
+ add_attention=decoder_add_attention,
126
+ block_out_channels=decoder_block_out_channels,
127
+ down_block_types=decoder_down_block_types,
128
+ downsample_padding=decoder_downsample_padding,
129
+ in_channels=decoder_in_channels,
130
+ layers_per_block=decoder_layers_per_block,
131
+ norm_eps=decoder_norm_eps,
132
+ norm_num_groups=decoder_norm_num_groups,
133
+ num_train_timesteps=decoder_num_train_timesteps,
134
+ out_channels=decoder_out_channels,
135
+ resnet_time_scale_shift=decoder_resnet_time_scale_shift,
136
+ time_embedding_type=decoder_time_embedding_type,
137
+ up_block_types=decoder_up_block_types,
138
+ )
139
+ self.decoder_scheduler = ConsistencyDecoderScheduler()
140
+ self.register_to_config(block_out_channels=encoder_block_out_channels)
141
+ self.register_to_config(force_upcast=False)
142
+
143
+ self.register_buffer(
144
+ "means",
145
+ paddle.to_tensor([0.38862467, 0.02253063, 0.07381133, -0.0171294])[None, :, None, None],
146
+ persistable=False,
147
+ )
148
+ self.register_buffer(
149
+ "stds",
150
+ paddle.to_tensor([0.9654121, 1.0440036, 0.76147926, 0.77022034])[None, :, None, None],
151
+ persistable=False,
152
+ )
153
+
154
+ self.quant_conv = nn.Conv2D(2 * latent_channels, 2 * latent_channels, 1)
155
+
156
+ self.use_slicing = False
157
+ self.use_tiling = False
158
+
159
+ # Copied from ppdiffusers.models.autoencoder_kl.AutoencoderKL.enable_tiling
160
+ def enable_tiling(self, use_tiling: bool = True):
161
+ r"""
162
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
163
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
164
+ processing larger images.
165
+ """
166
+ self.use_tiling = use_tiling
167
+
168
+ # Copied from ppdiffusers.models.autoencoder_kl.AutoencoderKL.disable_tiling
169
+ def disable_tiling(self):
170
+ r"""
171
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
172
+ decoding in one step.
173
+ """
174
+ self.enable_tiling(False)
175
+
176
+ # Copied from ppdiffusers.models.autoencoder_kl.AutoencoderKL.enable_slicing
177
+ def enable_slicing(self):
178
+ r"""
179
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
180
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
181
+ """
182
+ self.use_slicing = True
183
+
184
+ # Copied from ppdiffusers.models.autoencoder_kl.AutoencoderKL.disable_slicing
185
+ def disable_slicing(self):
186
+ r"""
187
+ Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
188
+ decoding in one step.
189
+ """
190
+ self.use_slicing = False
191
+
192
+ @property
193
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
194
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
195
+ r"""
196
+ Returns:
197
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
198
+ indexed by its weight name.
199
+ """
200
+ # set recursively
201
+ processors = {}
202
+
203
+ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
204
+ if hasattr(module, "get_processor"):
205
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
206
+
207
+ for sub_name, child in module.named_children():
208
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
209
+
210
+ return processors
211
+
212
+ for name, module in self.named_children():
213
+ fn_recursive_add_processors(name, module, processors)
214
+
215
+ return processors
216
+
217
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
218
+ def set_attn_processor(
219
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
220
+ ):
221
+ r"""
222
+ Sets the attention processor to use to compute attention.
223
+
224
+ Parameters:
225
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
226
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
227
+ for **all** `Attention` layers.
228
+
229
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
230
+ processor. This is strongly recommended when setting trainable attention processors.
231
+
232
+ """
233
+ count = len(self.attn_processors.keys())
234
+
235
+ if isinstance(processor, dict) and len(processor) != count:
236
+ raise ValueError(
237
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
238
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
239
+ )
240
+
241
+ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
242
+ if hasattr(module, "set_processor"):
243
+ if not isinstance(processor, dict):
244
+ module.set_processor(processor, _remove_lora=_remove_lora)
245
+ else:
246
+ module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
247
+
248
+ for sub_name, child in module.named_children():
249
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
250
+
251
+ for name, module in self.named_children():
252
+ fn_recursive_attn_processor(name, module, processor)
253
+
254
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
255
+ def set_default_attn_processor(self):
256
+ """
257
+ Disables custom attention processors and sets the default attention implementation.
258
+ """
259
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
260
+ processor = AttnAddedKVProcessor()
261
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
262
+ processor = AttnProcessor()
263
+ else:
264
+ raise ValueError(
265
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
266
+ )
267
+
268
+ self.set_attn_processor(processor, _remove_lora=True)
269
+
270
+ @apply_forward_hook
271
+ def encode(
272
+ self, x: paddle.Tensor, return_dict: bool = True
273
+ ) -> Union[ConsistencyDecoderVAEOutput, Tuple[DiagonalGaussianDistribution]]:
274
+ """
275
+ Encode a batch of images into latents.
276
+
277
+ Args:
278
+ x (`paddle.Tensor`): Input batch of images.
279
+ return_dict (`bool`, *optional*, defaults to `True`):
280
+ Whether to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain
281
+ tuple.
282
+
283
+ Returns:
284
+ The latent representations of the encoded images. If `return_dict` is True, a
285
+ [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a plain `tuple`
286
+ is returned.
287
+ """
288
+ # TODO junnyu, support float16
289
+ x = x.cast(self.encoder.conv_in.weight.dtype)
290
+ if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
291
+ return self.tiled_encode(x, return_dict=return_dict)
292
+
293
+ if self.use_slicing and x.shape[0] > 1:
294
+ encoded_slices = [self.encoder(x_slice) for x_slice in x.chunk(x.shape[0])]
295
+ h = paddle.concat(encoded_slices)
296
+ else:
297
+ h = self.encoder(x)
298
+
299
+ moments = self.quant_conv(h)
300
+ posterior = DiagonalGaussianDistribution(moments)
301
+
302
+ if not return_dict:
303
+ return (posterior,)
304
+
305
+ return ConsistencyDecoderVAEOutput(latent_dist=posterior)
306
+
307
+ @apply_forward_hook
308
+ def decode(
309
+ self,
310
+ z: paddle.Tensor,
311
+ generator: Optional[paddle.Generator] = None,
312
+ return_dict: bool = True,
313
+ num_inference_steps: int = 2,
314
+ ) -> Union[DecoderOutput, Tuple[paddle.Tensor]]:
315
+
316
+ z = (z * self.config.scaling_factor - self.means) / self.stds
317
+
318
+ scale_factor = 2 ** (len(self.config.block_out_channels) - 1)
319
+ z = F.interpolate(z, mode="nearest", scale_factor=scale_factor)
320
+
321
+ batch_size, _, height, width = z.shape
322
+
323
+ self.decoder_scheduler.set_timesteps(num_inference_steps)
324
+
325
+ x_t = self.decoder_scheduler.init_noise_sigma * randn_tensor(
326
+ (batch_size, 3, height, width),
327
+ generator=generator,
328
+ dtype=z.dtype,
329
+ )
330
+
331
+ for t in self.decoder_scheduler.timesteps:
332
+ model_input = paddle.concat([self.decoder_scheduler.scale_model_input(x_t, t).cast(z.dtype), z], axis=1)
333
+ model_output = self.decoder_unet(model_input, t).sample[:, :3, :, :]
334
+ prev_sample = self.decoder_scheduler.step(model_output, t, x_t, generator).prev_sample
335
+ x_t = prev_sample
336
+
337
+ x_0 = x_t
338
+
339
+ if not return_dict:
340
+ return (x_0,)
341
+
342
+ return DecoderOutput(sample=x_0)
343
+
344
+ # Copied from ppdiffusers.models.autoencoder_kl.AutoencoderKL.blend_v
345
+ def blend_v(self, a: paddle.Tensor, b: paddle.Tensor, blend_extent: int) -> paddle.Tensor:
346
+ blend_extent = min(a.shape[2], b.shape[2], blend_extent)
347
+ for y in range(blend_extent):
348
+ b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
349
+ return b
350
+
351
+ # Copied from ppdiffusers.models.autoencoder_kl.AutoencoderKL.blend_h
352
+ def blend_h(self, a: paddle.Tensor, b: paddle.Tensor, blend_extent: int) -> paddle.Tensor:
353
+ blend_extent = min(a.shape[3], b.shape[3], blend_extent)
354
+ for x in range(blend_extent):
355
+ b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
356
+ return b
357
+
358
+ def tiled_encode(self, x: paddle.Tensor, return_dict: bool = True) -> ConsistencyDecoderVAEOutput:
359
+ r"""Encode a batch of images using a tiled encoder.
360
+
361
+ When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
362
+ steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
363
+ different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
364
+ tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
365
+ output, but they should be much less noticeable.
366
+
367
+ Args:
368
+ x (`paddle.Tensor`): Input batch of images.
369
+ return_dict (`bool`, *optional*, defaults to `True`):
370
+ Whether or not to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a
371
+ plain tuple.
372
+
373
+ Returns:
374
+ [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`:
375
+ If return_dict is True, a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned,
376
+ otherwise a plain `tuple` is returned.
377
+ """
378
+ overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
379
+ blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
380
+ row_limit = self.tile_latent_min_size - blend_extent
381
+
382
+ # Split the image into 512x512 tiles and encode them separately.
383
+ rows = []
384
+ for i in range(0, x.shape[2], overlap_size):
385
+ row = []
386
+ for j in range(0, x.shape[3], overlap_size):
387
+ tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
388
+ tile = self.encoder(tile)
389
+ tile = self.quant_conv(tile)
390
+ row.append(tile)
391
+ rows.append(row)
392
+ result_rows = []
393
+ for i, row in enumerate(rows):
394
+ result_row = []
395
+ for j, tile in enumerate(row):
396
+ # blend the above tile and the left tile
397
+ # to the current tile and add the current tile to the result row
398
+ if i > 0:
399
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
400
+ if j > 0:
401
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
402
+ result_row.append(tile[:, :, :row_limit, :row_limit])
403
+ result_rows.append(paddle.concat(result_row, axis=3))
404
+
405
+ moments = paddle.concat(result_rows, axis=2)
406
+ posterior = DiagonalGaussianDistribution(moments)
407
+
408
+ if not return_dict:
409
+ return (posterior,)
410
+
411
+ return ConsistencyDecoderVAEOutput(latent_dist=posterior)
412
+
413
+ def forward(
414
+ self,
415
+ sample: paddle.Tensor,
416
+ sample_posterior: bool = False,
417
+ return_dict: bool = True,
418
+ generator: Optional[paddle.Generator] = None,
419
+ ) -> Union[DecoderOutput, Tuple[paddle.Tensor]]:
420
+ r"""
421
+ Args:
422
+ sample (`paddle.Tensor`): Input sample.
423
+ sample_posterior (`bool`, *optional*, defaults to `False`):
424
+ Whether to sample from the posterior.
425
+ return_dict (`bool`, *optional*, defaults to `True`):
426
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
427
+ generator (`paddle.Generator`, *optional*, defaults to `None`):
428
+ Generator to use for sampling.
429
+
430
+ Returns:
431
+ [`DecoderOutput`] or `tuple`:
432
+ If return_dict is True, a [`DecoderOutput`] is returned, otherwise a plain `tuple` is returned.
433
+ """
434
+ x = sample
435
+ posterior = self.encode(x).latent_dist
436
+ if sample_posterior:
437
+ z = posterior.sample(generator=generator)
438
+ else:
439
+ z = posterior.mode()
440
+ dec = self.decode(z, generator=generator).sample
441
+
442
+ if not return_dict:
443
+ return (dec,)
444
+
445
+ return DecoderOutput(sample=dec)
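
The tiled encoder above stitches overlapping tiles together with a simple linear cross-fade (`blend_h` / `blend_v`). Below is a standalone sketch of the horizontal blend on toy tensors; the sizes are arbitrary and chosen only for illustration.

```py
import paddle

def blend_h(a, b, blend_extent):
    # Linearly fade from tile `a` (left) into tile `b` (right) across the overlap columns,
    # mirroring the blend_h method above.
    blend_extent = min(a.shape[3], b.shape[3], blend_extent)
    for x in range(blend_extent):
        b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
    return b

left = paddle.ones([1, 4, 8, 8])    # tile already placed in the result row
right = paddle.zeros([1, 4, 8, 8])  # incoming tile that overlaps it
blended = blend_h(left, right, blend_extent=4)
print(blended[0, 0, 0, :4].numpy())  # [1.0, 0.75, 0.5, 0.25] -- smooth ramp across the seam
```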
PaddleMIX/ppdiffusers/ppdiffusers/models/controlnet.py ADDED
@@ -0,0 +1,889 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import paddle
18
+ import paddle.nn as nn
19
+ import paddle.nn.functional as F
20
+
21
+ from ..configuration_utils import ConfigMixin, register_to_config
22
+ from ..loaders import FromOriginalControlnetMixin
23
+ from ..utils import BaseOutput, logging
24
+ from .attention_processor import (
25
+ ADDED_KV_ATTENTION_PROCESSORS,
26
+ CROSS_ATTENTION_PROCESSORS,
27
+ AttentionProcessor,
28
+ AttnAddedKVProcessor,
29
+ AttnProcessor,
30
+ )
31
+ from .embeddings import (
32
+ TextImageProjection,
33
+ TextImageTimeEmbedding,
34
+ TextTimeEmbedding,
35
+ TimestepEmbedding,
36
+ Timesteps,
37
+ )
38
+ from .modeling_utils import ModelMixin
39
+ from .unet_2d_blocks import (
40
+ CrossAttnDownBlock2D,
41
+ DownBlock2D,
42
+ UNetMidBlock2D,
43
+ UNetMidBlock2DCrossAttn,
44
+ get_down_block,
45
+ )
46
+ from .unet_2d_condition import UNet2DConditionModel
47
+
48
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
49
+
50
+
51
+ @dataclass
52
+ class ControlNetOutput(BaseOutput):
53
+ """
54
+ The output of [`ControlNetModel`].
55
+
56
+ Args:
57
+ down_block_res_samples (`tuple[paddle.Tensor]`):
58
+ A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
59
+ be of shape `(batch_size, channel * resolution, height // resolution, width // resolution)`. Output can be
+ used to condition the original UNet's downsampling activations.
+ mid_block_res_sample (`paddle.Tensor`):
62
+ The activation of the middle block (the lowest sample resolution). Each tensor should be of shape
63
+ `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
64
+ Output can be used to condition the original UNet's middle block activation.
65
+ """
66
+
67
+ down_block_res_samples: Tuple[paddle.Tensor]
68
+ mid_block_res_sample: paddle.Tensor
69
+
70
+
71
+ class ControlNetConditioningEmbedding(nn.Layer):
72
+ """
73
+ Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
74
+ [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
75
+ training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
76
+ convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
77
+ (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
78
+ model) to encode image-space conditions ... into feature maps ..."
79
+ """
80
+
81
+ def __init__(
82
+ self,
83
+ conditioning_embedding_channels: int,
84
+ conditioning_channels: int = 3,
85
+ block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
86
+ ):
87
+ super().__init__()
88
+
89
+ self.conv_in = nn.Conv2D(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
90
+
91
+ self.blocks = nn.LayerList([])
92
+
93
+ for i in range(len(block_out_channels) - 1):
94
+ channel_in = block_out_channels[i]
95
+ channel_out = block_out_channels[i + 1]
96
+ self.blocks.append(nn.Conv2D(channel_in, channel_in, kernel_size=3, padding=1))
97
+ self.blocks.append(nn.Conv2D(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
98
+
99
+ self.conv_out = zero_module(
100
+ nn.Conv2D(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
101
+ )
102
+
103
+ def forward(self, conditioning):
104
+ embedding = self.conv_in(conditioning)
105
+ embedding = F.silu(embedding)
106
+
107
+ for block in self.blocks:
108
+ embedding = block(embedding)
109
+ embedding = F.silu(embedding)
110
+
111
+ embedding = self.conv_out(embedding)
112
+
113
+ return embedding
114
+
115
+
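
As the quoted paper text explains, the conditioning embedder is just a few stride-2 convolutions that bring an image-space condition down to the latent resolution. A minimal usage sketch follows, assuming the class can be imported from `ppdiffusers.models.controlnet` and using SD 1.5-like channel counts.

```py
import paddle
from ppdiffusers.models.controlnet import ControlNetConditioningEmbedding  # assumed import path

cond_embed = ControlNetConditioningEmbedding(
    conditioning_embedding_channels=320,        # e.g. block_out_channels[0] of SD 1.5
    conditioning_channels=3,
    block_out_channels=(16, 32, 96, 256),
)
cond_image = paddle.rand([1, 3, 512, 512])      # image-space condition, e.g. a canny edge map
features = cond_embed(cond_image)
print(features.shape)                           # [1, 320, 64, 64]: three stride-2 convs, 512 / 2**3 = 64
```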
116
+ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
117
+ """
118
+ A ControlNet model.
119
+
120
+ Args:
121
+ in_channels (`int`, defaults to 4):
122
+ The number of channels in the input sample.
123
+ flip_sin_to_cos (`bool`, defaults to `True`):
124
+ Whether to flip the sin to cos in the time embedding.
125
+ freq_shift (`int`, defaults to 0):
126
+ The frequency shift to apply to the time embedding.
127
+ down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
128
+ The tuple of downsample blocks to use.
129
+ only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
130
+ block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
131
+ The tuple of output channels for each block.
132
+ layers_per_block (`int`, defaults to 2):
133
+ The number of layers per block.
134
+ downsample_padding (`int`, defaults to 1):
135
+ The padding to use for the downsampling convolution.
136
+ mid_block_scale_factor (`float`, defaults to 1):
137
+ The scale factor to use for the mid block.
138
+ act_fn (`str`, defaults to "silu"):
139
+ The activation function to use.
140
+ norm_num_groups (`int`, *optional*, defaults to 32):
141
+ The number of groups to use for the normalization. If None, normalization and activation layers is skipped
142
+ in post-processing.
143
+ norm_eps (`float`, defaults to 1e-5):
144
+ The epsilon to use for the normalization.
145
+ cross_attention_dim (`int`, defaults to 1280):
146
+ The dimension of the cross attention features.
147
+ transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
148
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
149
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
150
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
151
+ encoder_hid_dim (`int`, *optional*, defaults to None):
152
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
153
+ dimension to `cross_attention_dim`.
154
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
155
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
156
+ embeddings of dimension `cross_attention_dim` according to `encoder_hid_dim_type`.
157
+ attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
158
+ The dimension of the attention heads.
159
+ use_linear_projection (`bool`, defaults to `False`):
160
+ class_embed_type (`str`, *optional*, defaults to `None`):
161
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
162
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
163
+ addition_embed_type (`str`, *optional*, defaults to `None`):
164
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
165
+ "text". "text" will use the `TextTimeEmbedding` layer.
166
+ num_class_embeds (`int`, *optional*, defaults to 0):
167
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
168
+ class conditioning with `class_embed_type` equal to `None`.
169
+ upcast_attention (`bool`, defaults to `False`):
170
+ resnet_time_scale_shift (`str`, defaults to `"default"`):
171
+ Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
172
+ projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
173
+ The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
174
+ `class_embed_type="projection"`.
175
+ controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
176
+ The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
177
+ conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
178
+ The tuple of output channel for each block in the `conditioning_embedding` layer.
179
+ global_pool_conditions (`bool`, defaults to `False`):
180
+ TODO(Patrick) - unused parameter.
181
+ addition_embed_type_num_heads (`int`, defaults to 64):
182
+ The number of heads to use for the `TextTimeEmbedding` layer.
183
+ """
184
+
185
+ _supports_gradient_checkpointing = True
186
+
187
+ @register_to_config
188
+ def __init__(
189
+ self,
190
+ in_channels: int = 4,
191
+ conditioning_channels: int = 3,
192
+ flip_sin_to_cos: bool = True,
193
+ freq_shift: int = 0,
194
+ down_block_types: Tuple[str, ...] = (
195
+ "CrossAttnDownBlock2D",
196
+ "CrossAttnDownBlock2D",
197
+ "CrossAttnDownBlock2D",
198
+ "DownBlock2D",
199
+ ),
200
+ mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
201
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
202
+ block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
203
+ layers_per_block: int = 2,
204
+ downsample_padding: int = 1,
205
+ mid_block_scale_factor: float = 1,
206
+ act_fn: str = "silu",
207
+ norm_num_groups: Optional[int] = 32,
208
+ norm_eps: float = 1e-5,
209
+ cross_attention_dim: int = 1280,
210
+ transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
211
+ encoder_hid_dim: Optional[int] = None,
212
+ encoder_hid_dim_type: Optional[str] = None,
213
+ attention_head_dim: Union[int, Tuple[int, ...]] = 8,
214
+ num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
215
+ use_linear_projection: bool = False,
216
+ class_embed_type: Optional[str] = None,
217
+ addition_embed_type: Optional[str] = None,
218
+ addition_time_embed_dim: Optional[int] = None,
219
+ num_class_embeds: Optional[int] = None,
220
+ upcast_attention: bool = False,
221
+ resnet_time_scale_shift: str = "default",
222
+ projection_class_embeddings_input_dim: Optional[int] = None,
223
+ controlnet_conditioning_channel_order: str = "rgb",
224
+ conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
225
+ global_pool_conditions: bool = False,
226
+ addition_embed_type_num_heads: int = 64,
227
+ ):
228
+ super().__init__()
229
+
230
+ # If `num_attention_heads` is not defined (which is the case for most models)
231
+ # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
232
+ # The reason for this behavior is to correct for incorrectly named variables that were introduced
233
+ # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
234
+ # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
235
+ # which is why we correct for the naming here.
236
+ num_attention_heads = num_attention_heads or attention_head_dim
237
+
238
+ # Check inputs
239
+ if len(block_out_channels) != len(down_block_types):
240
+ raise ValueError(
241
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
242
+ )
243
+
244
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
245
+ raise ValueError(
246
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
247
+ )
248
+
249
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
250
+ raise ValueError(
251
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
252
+ )
253
+
254
+ if isinstance(transformer_layers_per_block, int):
255
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
256
+
257
+ # input
258
+ conv_in_kernel = 3
259
+ conv_in_padding = (conv_in_kernel - 1) // 2
260
+ self.conv_in = nn.Conv2D(
261
+ in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
262
+ )
263
+
264
+ # time
265
+ time_embed_dim = block_out_channels[0] * 4
266
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
267
+ timestep_input_dim = block_out_channels[0]
268
+ self.time_embedding = TimestepEmbedding(
269
+ timestep_input_dim,
270
+ time_embed_dim,
271
+ act_fn=act_fn,
272
+ )
273
+
274
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
275
+ encoder_hid_dim_type = "text_proj"
276
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
277
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
278
+
279
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
280
+ raise ValueError(
281
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
282
+ )
283
+
284
+ if encoder_hid_dim_type == "text_proj":
285
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
286
+ elif encoder_hid_dim_type == "text_image_proj":
287
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
288
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
289
+ # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)
290
+ self.encoder_hid_proj = TextImageProjection(
291
+ text_embed_dim=encoder_hid_dim,
292
+ image_embed_dim=cross_attention_dim,
293
+ cross_attention_dim=cross_attention_dim,
294
+ )
295
+
296
+ elif encoder_hid_dim_type is not None:
297
+ raise ValueError(
298
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
299
+ )
300
+ else:
301
+ self.encoder_hid_proj = None
302
+
303
+ # class embedding
304
+ if class_embed_type is None and num_class_embeds is not None:
305
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
306
+ elif class_embed_type == "timestep":
307
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
308
+ elif class_embed_type == "identity":
309
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
310
+ elif class_embed_type == "projection":
311
+ if projection_class_embeddings_input_dim is None:
312
+ raise ValueError(
313
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
314
+ )
315
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
316
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
317
+ # 2. it projects from an arbitrary input dimension.
318
+ #
319
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
320
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
321
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
322
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
323
+ else:
324
+ self.class_embedding = None
325
+
326
+ if addition_embed_type == "text":
327
+ if encoder_hid_dim is not None:
328
+ text_time_embedding_from_dim = encoder_hid_dim
329
+ else:
330
+ text_time_embedding_from_dim = cross_attention_dim
331
+
332
+ self.add_embedding = TextTimeEmbedding(
333
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
334
+ )
335
+ elif addition_embed_type == "text_image":
336
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
337
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
338
+ # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
339
+ self.add_embedding = TextImageTimeEmbedding(
340
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
341
+ )
342
+ elif addition_embed_type == "text_time":
343
+ self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
344
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
345
+
346
+ elif addition_embed_type is not None:
347
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
348
+
349
+ # control net conditioning embedding
350
+ self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
351
+ conditioning_embedding_channels=block_out_channels[0],
352
+ block_out_channels=conditioning_embedding_out_channels,
353
+ conditioning_channels=conditioning_channels,
354
+ )
355
+
356
+ self.down_blocks = nn.LayerList([])
357
+ self.controlnet_down_blocks = nn.LayerList([])
358
+
359
+ if isinstance(only_cross_attention, bool):
360
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
361
+
362
+ if isinstance(attention_head_dim, int):
363
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
364
+
365
+ if isinstance(num_attention_heads, int):
366
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
367
+
368
+ # down
369
+ output_channel = block_out_channels[0]
370
+
371
+ controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1)
372
+ controlnet_block = zero_module(controlnet_block)
373
+ self.controlnet_down_blocks.append(controlnet_block)
374
+
375
+ for i, down_block_type in enumerate(down_block_types):
376
+ input_channel = output_channel
377
+ output_channel = block_out_channels[i]
378
+ is_final_block = i == len(block_out_channels) - 1
379
+
380
+ down_block = get_down_block(
381
+ down_block_type,
382
+ num_layers=layers_per_block,
383
+ transformer_layers_per_block=transformer_layers_per_block[i],
384
+ in_channels=input_channel,
385
+ out_channels=output_channel,
386
+ temb_channels=time_embed_dim,
387
+ add_downsample=not is_final_block,
388
+ resnet_eps=norm_eps,
389
+ resnet_act_fn=act_fn,
390
+ resnet_groups=norm_num_groups,
391
+ cross_attention_dim=cross_attention_dim,
392
+ num_attention_heads=num_attention_heads[i],
393
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
394
+ downsample_padding=downsample_padding,
395
+ use_linear_projection=use_linear_projection,
396
+ only_cross_attention=only_cross_attention[i],
397
+ upcast_attention=upcast_attention,
398
+ resnet_time_scale_shift=resnet_time_scale_shift,
399
+ )
400
+ self.down_blocks.append(down_block)
401
+
402
+ for _ in range(layers_per_block):
403
+ controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1)
404
+ controlnet_block = zero_module(controlnet_block)
405
+ self.controlnet_down_blocks.append(controlnet_block)
406
+
407
+ if not is_final_block:
408
+ controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1)
409
+ controlnet_block = zero_module(controlnet_block)
410
+ self.controlnet_down_blocks.append(controlnet_block)
411
+
412
+ # mid
413
+ mid_block_channel = block_out_channels[-1]
414
+
415
+ controlnet_block = nn.Conv2D(mid_block_channel, mid_block_channel, kernel_size=1)
416
+ controlnet_block = zero_module(controlnet_block)
417
+ self.controlnet_mid_block = controlnet_block
418
+
419
+ if mid_block_type == "UNetMidBlock2DCrossAttn":
420
+ self.mid_block = UNetMidBlock2DCrossAttn(
421
+ transformer_layers_per_block=transformer_layers_per_block[-1],
422
+ in_channels=mid_block_channel,
423
+ temb_channels=time_embed_dim,
424
+ resnet_eps=norm_eps,
425
+ resnet_act_fn=act_fn,
426
+ output_scale_factor=mid_block_scale_factor,
427
+ resnet_time_scale_shift=resnet_time_scale_shift,
428
+ cross_attention_dim=cross_attention_dim,
429
+ num_attention_heads=num_attention_heads[-1],
430
+ resnet_groups=norm_num_groups,
431
+ use_linear_projection=use_linear_projection,
432
+ upcast_attention=upcast_attention,
433
+ )
434
+ elif mid_block_type == "UNetMidBlock2D":
435
+ self.mid_block = UNetMidBlock2D(
436
+ in_channels=block_out_channels[-1],
437
+ temb_channels=time_embed_dim,
438
+ num_layers=0,
439
+ resnet_eps=norm_eps,
440
+ resnet_act_fn=act_fn,
441
+ output_scale_factor=mid_block_scale_factor,
442
+ resnet_groups=norm_num_groups,
443
+ resnet_time_scale_shift=resnet_time_scale_shift,
444
+ add_attention=False,
445
+ )
446
+ else:
447
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
448
+
449
+ @classmethod
450
+ def from_unet(
451
+ cls,
452
+ unet: UNet2DConditionModel,
453
+ controlnet_conditioning_channel_order: str = "rgb",
454
+ conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
455
+ load_weights_from_unet: bool = True,
456
+ conditioning_channels: int = 3,
457
+ ):
458
+ r"""
459
+ Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`].
460
+
461
+ Parameters:
462
+ unet (`UNet2DConditionModel`):
463
+ The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied
464
+ where applicable.
465
+ """
466
+ transformer_layers_per_block = (
467
+ unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
468
+ )
469
+ encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None
470
+ encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None
471
+ addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None
472
+ addition_time_embed_dim = (
473
+ unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
474
+ )
475
+
476
+ controlnet = cls(
477
+ encoder_hid_dim=encoder_hid_dim,
478
+ encoder_hid_dim_type=encoder_hid_dim_type,
479
+ addition_embed_type=addition_embed_type,
480
+ addition_time_embed_dim=addition_time_embed_dim,
481
+ transformer_layers_per_block=transformer_layers_per_block,
482
+ in_channels=unet.config.in_channels,
483
+ flip_sin_to_cos=unet.config.flip_sin_to_cos,
484
+ freq_shift=unet.config.freq_shift,
485
+ down_block_types=unet.config.down_block_types,
486
+ only_cross_attention=unet.config.only_cross_attention,
487
+ block_out_channels=unet.config.block_out_channels,
488
+ layers_per_block=unet.config.layers_per_block,
489
+ downsample_padding=unet.config.downsample_padding,
490
+ mid_block_scale_factor=unet.config.mid_block_scale_factor,
491
+ act_fn=unet.config.act_fn,
492
+ norm_num_groups=unet.config.norm_num_groups,
493
+ norm_eps=unet.config.norm_eps,
494
+ cross_attention_dim=unet.config.cross_attention_dim,
495
+ attention_head_dim=unet.config.attention_head_dim,
496
+ num_attention_heads=unet.config.num_attention_heads,
497
+ use_linear_projection=unet.config.use_linear_projection,
498
+ class_embed_type=unet.config.class_embed_type,
499
+ num_class_embeds=unet.config.num_class_embeds,
500
+ upcast_attention=unet.config.upcast_attention,
501
+ resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
502
+ projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
503
+ mid_block_type=unet.config.mid_block_type,
504
+ controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
505
+ conditioning_embedding_out_channels=conditioning_embedding_out_channels,
506
+ conditioning_channels=conditioning_channels,
507
+ )
508
+
509
+ if load_weights_from_unet:
510
+ controlnet.conv_in.load_dict(unet.conv_in.state_dict())
511
+ controlnet.time_proj.load_dict(unet.time_proj.state_dict())
512
+ controlnet.time_embedding.load_dict(unet.time_embedding.state_dict())
513
+
514
+ if controlnet.class_embedding:
515
+ controlnet.class_embedding.load_dict(unet.class_embedding.state_dict())
516
+
517
+ controlnet.down_blocks.load_dict(unet.down_blocks.state_dict())
518
+ controlnet.mid_block.load_dict(unet.mid_block.state_dict())
519
+
520
+ return controlnet
521
+
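
`from_unet` above mirrors the base UNet's configuration and, optionally, copies its encoder weights, which is the usual way to initialise a ControlNet before training. A hedged sketch, assuming `ControlNetModel` and `UNet2DConditionModel` are exported from the top-level `ppdiffusers` package:

```py
from ppdiffusers import ControlNetModel, UNet2DConditionModel  # assumed public exports

# Load a base UNet and derive a ControlNet that shares its configuration and encoder weights.
unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
controlnet = ControlNetModel.from_unet(unet, load_weights_from_unet=True)
print(type(controlnet).__name__, len(controlnet.controlnet_down_blocks))
```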
522
+ @property
523
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
524
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
525
+ r"""
526
+ Returns:
527
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
+ indexed by their weight names.
529
+ """
530
+ # set recursively
531
+ processors = {}
532
+
533
+ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
534
+ if hasattr(module, "get_processor"):
535
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
536
+
537
+ for sub_name, child in module.named_children():
538
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
539
+
540
+ return processors
541
+
542
+ for name, module in self.named_children():
543
+ fn_recursive_add_processors(name, module, processors)
544
+
545
+ return processors
546
+
547
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
548
+ def set_attn_processor(
549
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
550
+ ):
551
+ r"""
552
+ Sets the attention processor to use to compute attention.
553
+
554
+ Parameters:
555
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
556
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
557
+ for **all** `Attention` layers.
558
+
559
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
560
+ processor. This is strongly recommended when setting trainable attention processors.
561
+
562
+ """
563
+ count = len(self.attn_processors.keys())
564
+
565
+ if isinstance(processor, dict) and len(processor) != count:
566
+ raise ValueError(
567
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
568
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
569
+ )
570
+
571
+ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
572
+ if hasattr(module, "set_processor"):
573
+ if not isinstance(processor, dict):
574
+ module.set_processor(processor, _remove_lora=_remove_lora)
575
+ else:
576
+ module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
577
+
578
+ for sub_name, child in module.named_children():
579
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
580
+
581
+ for name, module in self.named_children():
582
+ fn_recursive_attn_processor(name, module, processor)
583
+
584
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
585
+ def set_default_attn_processor(self):
586
+ """
587
+ Disables custom attention processors and sets the default attention implementation.
588
+ """
589
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
590
+ processor = AttnAddedKVProcessor()
591
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
592
+ processor = AttnProcessor()
593
+ else:
594
+ raise ValueError(
595
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
596
+ )
597
+
598
+ self.set_attn_processor(processor, _remove_lora=True)
599
+
600
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice
601
+ def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
602
+ r"""
603
+ Enable sliced attention computation.
604
+
605
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
606
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
607
+
608
+ Args:
609
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
610
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
611
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
612
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
613
+ must be a multiple of `slice_size`.
614
+ """
615
+ sliceable_head_dims = []
616
+
617
+ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer):
618
+ if hasattr(module, "set_attention_slice"):
619
+ sliceable_head_dims.append(module.sliceable_head_dim)
620
+
621
+ for child in module.children():
622
+ fn_recursive_retrieve_sliceable_dims(child)
623
+
624
+ # retrieve number of attention layers
625
+ for module in self.children():
626
+ fn_recursive_retrieve_sliceable_dims(module)
627
+
628
+ num_sliceable_layers = len(sliceable_head_dims)
629
+
630
+ if slice_size == "auto":
631
+ # half the attention head size is usually a good trade-off between
632
+ # speed and memory
633
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
634
+ elif slice_size == "max":
635
+ # make smallest slice possible
636
+ slice_size = num_sliceable_layers * [1]
637
+
638
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
639
+
640
+ if len(slice_size) != len(sliceable_head_dims):
641
+ raise ValueError(
642
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
643
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
644
+ )
645
+
646
+ for i in range(len(slice_size)):
647
+ size = slice_size[i]
648
+ dim = sliceable_head_dims[i]
649
+ if size is not None and size > dim:
650
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
651
+
652
+ # Recursively walk through all the children.
653
+ # Any children which exposes the set_attention_slice method
654
+ # gets the message
655
+ def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]):
656
+ if hasattr(module, "set_attention_slice"):
657
+ module.set_attention_slice(slice_size.pop())
658
+
659
+ for child in module.children():
660
+ fn_recursive_set_attention_slice(child, slice_size)
661
+
662
+ reversed_slice_size = list(reversed(slice_size))
663
+ for module in self.children():
664
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
665
+
666
+ def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
667
+ if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
668
+ module.gradient_checkpointing = value
669
+
670
+ def forward(
671
+ self,
672
+ sample: paddle.Tensor,
673
+ timestep: Union[paddle.Tensor, float, int],
674
+ encoder_hidden_states: paddle.Tensor,
675
+ controlnet_cond: paddle.Tensor,
676
+ conditioning_scale: float = 1.0,
677
+ class_labels: Optional[paddle.Tensor] = None,
678
+ timestep_cond: Optional[paddle.Tensor] = None,
679
+ attention_mask: Optional[paddle.Tensor] = None,
680
+ added_cond_kwargs: Optional[Dict[str, paddle.Tensor]] = None,
681
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
682
+ guess_mode: bool = False,
683
+ return_dict: bool = True,
684
+ ) -> Union[ControlNetOutput, Tuple[Tuple[paddle.Tensor, ...], paddle.Tensor]]:
685
+ """
686
+ The [`ControlNetModel`] forward method.
687
+
688
+ Args:
689
+ sample (`paddle.Tensor`):
690
+ The noisy input tensor.
691
+ timestep (`Union[paddle.Tensor, float, int]`):
692
+ The number of timesteps to denoise an input.
693
+ encoder_hidden_states (`paddle.Tensor`):
694
+ The encoder hidden states.
695
+ controlnet_cond (`paddle.Tensor`):
696
+ The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
697
+ conditioning_scale (`float`, defaults to `1.0`):
698
+ The scale factor for ControlNet outputs.
699
+ class_labels (`paddle.Tensor`, *optional*, defaults to `None`):
700
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
701
+ timestep_cond (`paddle.Tensor`, *optional*, defaults to `None`):
702
+ Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
703
+ timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
704
+ embeddings.
705
+ attention_mask (`paddle.Tensor`, *optional*, defaults to `None`):
706
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
707
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
708
+ negative values to the attention scores corresponding to "discard" tokens.
709
+ added_cond_kwargs (`dict`):
710
+ Additional conditions for the Stable Diffusion XL UNet.
711
+ cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
712
+ A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
713
+ guess_mode (`bool`, defaults to `False`):
714
+ In this mode, the ControlNet encoder tries its best to recognize the content of the input image even if
715
+ you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
716
+ return_dict (`bool`, defaults to `True`):
717
+ Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
718
+
719
+ Returns:
720
+ [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
721
+ If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
722
+ returned where the first element is the sample tensor.
723
+ """
724
+ # TODO junnyu, add this to support pure fp16
725
+ sample = sample.cast(self.dtype)
726
+
727
+ # check channel order
728
+ channel_order = self.config.controlnet_conditioning_channel_order
729
+
730
+ if channel_order == "rgb":
731
+ # in rgb order by default
732
+ ...
733
+ elif channel_order == "bgr":
734
+ controlnet_cond = paddle.flip(controlnet_cond, axis=[1])
735
+ else:
736
+ raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
737
+
738
+ # prepare attention_mask
739
+ if attention_mask is not None:
740
+ attention_mask = (1 - attention_mask.cast(sample.dtype)) * -10000.0
741
+ attention_mask = attention_mask.unsqueeze(1)
742
+
743
+ # 1. time
744
+ timesteps = timestep
745
+ if not paddle.is_tensor(timesteps):
746
+ timesteps = paddle.to_tensor([timesteps], dtype="int64")
747
+ elif len(timesteps.shape) == 0:
748
+ timesteps = timesteps[None]
749
+
750
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
751
+ timesteps = timesteps.expand(
752
+ [
753
+ sample.shape[0],
754
+ ]
755
+ )
756
+ t_emb = self.time_proj(timesteps)
757
+
758
+ # timesteps does not contain any weights and will always return f32 tensors
759
+ # but time_embedding might actually be running in fp16. so we need to cast here.
760
+ # there might be better ways to encapsulate this.
761
+ t_emb = t_emb.cast(dtype=sample.dtype)
762
+
763
+ emb = self.time_embedding(t_emb, timestep_cond)
764
+ aug_emb = None
765
+
766
+ if self.class_embedding is not None:
767
+ if class_labels is None:
768
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
769
+
770
+ # maybe cast it to float16
771
+ class_labels = class_labels.cast(sample.dtype)
772
+ if self.config.class_embed_type == "timestep":
773
+ class_labels = self.time_proj(class_labels)
774
+
775
+ # maybe cast it to int64
776
+ if isinstance(self.class_embedding, nn.Embedding):
777
+ class_labels = class_labels.cast(paddle.int64)
778
+ class_emb = self.class_embedding(class_labels).cast(dtype=sample.dtype)
779
+ emb = emb + class_emb
780
+
781
+ if self.config.addition_embed_type is not None:
782
+ if self.config.addition_embed_type == "text":
783
+ aug_emb = self.add_embedding(encoder_hidden_states)
784
+
785
+ elif self.config.addition_embed_type == "text_time":
786
+ if "text_embeds" not in added_cond_kwargs:
787
+ raise ValueError(
788
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
789
+ )
790
+ text_embeds = added_cond_kwargs.get("text_embeds")
791
+ if "time_ids" not in added_cond_kwargs:
792
+ raise ValueError(
793
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
794
+ )
795
+ time_ids = added_cond_kwargs.get("time_ids")
796
+ time_embeds = self.add_time_proj(time_ids.flatten())
797
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
798
+ # make sure [text_embeds, time_embeds] has the same dtype
799
+ time_embeds = time_embeds.cast(text_embeds.dtype)
800
+
801
+ add_embeds = paddle.concat([text_embeds, time_embeds], axis=-1)
802
+ add_embeds = add_embeds.cast(emb.dtype)
803
+ aug_emb = self.add_embedding(add_embeds)
804
+
805
+ emb = emb + aug_emb if aug_emb is not None else emb
806
+
807
+ # 2. pre-process
808
+ sample = self.conv_in(sample)
809
+
810
+ controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
811
+ sample = sample + controlnet_cond
812
+
813
+ # 3. down
814
+ down_block_res_samples = (sample,)
815
+ for downsample_block in self.down_blocks:
816
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
817
+ sample, res_samples = downsample_block(
818
+ hidden_states=sample,
819
+ temb=emb,
820
+ encoder_hidden_states=encoder_hidden_states,
821
+ attention_mask=attention_mask,
822
+ cross_attention_kwargs=cross_attention_kwargs,
823
+ )
824
+ else:
825
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
826
+
827
+ down_block_res_samples += res_samples
828
+
829
+ # 4. mid
830
+ if self.mid_block is not None:
831
+ if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
832
+ sample = self.mid_block(
833
+ sample,
834
+ emb,
835
+ encoder_hidden_states=encoder_hidden_states,
836
+ attention_mask=attention_mask,
837
+ cross_attention_kwargs=cross_attention_kwargs,
838
+ )
839
+ else:
840
+ sample = self.mid_block(sample, emb)
841
+
842
+ # 5. Control net blocks
843
+
844
+ controlnet_down_block_res_samples = ()
845
+
846
+ for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
847
+ down_block_res_sample = controlnet_block(down_block_res_sample)
848
+ controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
849
+
850
+ down_block_res_samples = controlnet_down_block_res_samples
851
+
852
+ mid_block_res_sample = self.controlnet_mid_block(sample)
853
+
854
+ # 6. scaling
855
+ if guess_mode and not self.config.global_pool_conditions:
856
+ scales = paddle.logspace(-1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0
857
+ scales = scales * conditioning_scale
858
+ down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
859
+ mid_block_res_sample = mid_block_res_sample * scales[-1] # last one
860
+ else:
861
+ if isinstance(conditioning_scale, (float, int)):
862
+ down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
863
+ mid_block_res_sample = mid_block_res_sample * conditioning_scale
864
+ else:
865
+ # Newly added: support a per-block list of conditioning scales
866
+ down_block_res_samples = [
867
+ sample * ccs for sample, ccs in zip(down_block_res_samples, conditioning_scale[:-1])
868
+ ]
869
+ mid_block_res_sample = mid_block_res_sample * conditioning_scale[-1]
870
+
871
+ if self.config.global_pool_conditions:
872
+ down_block_res_samples = [
873
+ paddle.mean(sample, axis=(2, 3), keepdim=True) for sample in down_block_res_samples
874
+ ]
875
+ mid_block_res_sample = paddle.mean(mid_block_res_sample, axis=(2, 3), keepdim=True)
876
+
877
+ if not return_dict:
878
+ return (down_block_res_samples, mid_block_res_sample)
879
+
880
+ return ControlNetOutput(
881
+ down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
882
+ )
883
+
884
+
885
+ @paddle.no_grad()
886
+ def zero_module(module):
887
+ for p in module.parameters():
888
+ p.zero_()
889
+ return module
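For reference, a minimal sketch of the guess-mode scaling applied in step "6. scaling" of the forward pass above; the count of 12 down-block residuals is an assumption (typical of SD-style UNets), not something fixed by this file.

# Hedged sketch: reproduce the guess-mode scaling above in isolation.
# Assumption: 12 down-block residuals plus one mid-block residual (SD-style UNet).
import paddle

num_down_residuals = 12
scales = paddle.logspace(-1, 0, num_down_residuals + 1)   # 0.1 ... 1.0, last entry scales the mid block
down_scales, mid_scale = scales[:-1], scales[-1]
print([round(float(s), 3) for s in down_scales], round(float(mid_scale), 3))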
PaddleMIX/ppdiffusers/ppdiffusers/models/dit_llama_t2i.py ADDED
@@ -0,0 +1,582 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Optional
16
+
17
+ import paddle
18
+ import paddle.nn as nn
19
+ import paddle.nn.functional as F
20
+ from paddle.nn.functional.flash_attention import (
21
+ flash_attention,
22
+ scaled_dot_product_attention,
23
+ )
24
+
25
+ from ..configuration_utils import ConfigMixin, register_to_config
26
+ from .dit_llama import FeedForward, FinalLayer, TimestepEmbedder, TypePromote, modulate
27
+ from .modeling_utils import ModelMixin
28
+ from .transformer_2d import Transformer2DModelOutput
29
+
30
+
31
+ class Attention(nn.Layer):
32
+ def __init__(self, dim, n_heads, n_kv_heads, qk_norm=True, fused_attn=True, y_dim=0):
33
+ """
34
+ Initialize the Attention module.
35
+
36
+ Args:
37
+ dim (int): Number of input dimensions.
38
+ n_heads (int): Number of heads.
39
+ n_kv_heads (Optional[int]): Number of kv heads, if using GQA.
40
+
41
+ Attributes:
42
+ n_kv_heads (int): Number of key and value heads.
43
+ n_local_heads (int): Number of local query heads.
44
+ n_local_kv_heads (int): Number of local key and value heads.
45
+ n_rep (int): Number of repetitions for local heads.
46
+ head_dim (int): Dimension size of each attention head.
47
+ wq (nn.Linear): Linear transformation for queries.
48
+ wk (nn.Linear): Linear transformation for keys.
49
+ wv (nn.Linear): Linear transformation for values.
50
+ wo (nn.Linear): Linear transformation for output.
51
53
+
54
+ """
55
+ super().__init__()
56
+ self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads
57
+ self.n_local_heads = n_heads
58
+ self.n_local_kv_heads = self.n_kv_heads
59
+ self.n_rep = self.n_local_heads // self.n_local_kv_heads
60
+ self.head_dim = dim // n_heads
61
+
62
+ self.wq = nn.Linear(dim, n_heads * self.head_dim, bias_attr=False)
63
+ self.wk = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias_attr=False)
64
+ self.wv = nn.Linear(dim, self.n_kv_heads * self.head_dim, bias_attr=False)
65
+ self.wo = nn.Linear(n_heads * self.head_dim, dim, bias_attr=False)
66
+
67
+ if y_dim > 0:
68
+ self.wk_y = nn.Linear(y_dim, self.n_kv_heads * self.head_dim, bias_attr=False)
69
+ self.wv_y = nn.Linear(y_dim, self.n_kv_heads * self.head_dim, bias_attr=False)
70
+ self.gate = nn.Parameter(paddle.zeros([self.n_local_heads]))
71
+
72
+ if qk_norm:
73
+ self.q_norm = nn.LayerNorm(self.n_local_heads * self.head_dim)
74
+ self.k_norm = nn.LayerNorm(self.n_local_kv_heads * self.head_dim)
75
+ if y_dim > 0:
76
+ self.ky_norm = nn.LayerNorm(self.n_local_kv_heads * self.head_dim)
77
+ else:
78
+ self.ky_norm = nn.Identity()
79
+ else:
80
+ self.q_norm = self.k_norm = nn.Identity()
81
+ self.ky_norm = nn.Identity()
82
+
83
+ self.fused_attn = fused_attn
84
+ self.scale = self.head_dim**-0.5
85
+
86
+ @staticmethod
87
+ def reshape_for_broadcast(freqs_cis, x):
88
+ """
89
+ Reshape frequency tensor for broadcasting it with another tensor.
90
+
91
+ This function reshapes the frequency tensor to have the same shape as
92
+ the target tensor 'x' for the purpose of broadcasting the frequency
93
+ tensor during element-wise operations.
94
+
95
+ Args:
96
+ freqs_cis (paddle.Tensor): Frequency tensor to be reshaped.
97
+ x (paddle.Tensor): Target tensor for broadcasting compatibility.
98
+
99
+ Returns:
100
+ paddle.Tensor: Reshaped frequency tensor.
101
+
102
+ Raises:
103
+ AssertionError: If the frequency tensor doesn't match the expected
104
+ shape.
105
+ AssertionError: If the target tensor 'x' doesn't have the expected
106
+ number of dimensions.
107
+ """
108
+ ndim = x.ndim
109
+ assert 0 <= 1 < ndim
110
+ assert tuple(freqs_cis.shape) == (tuple(x.shape)[1], tuple(x.shape)[-1])
111
+ shape = [(d if i == 1 or i == ndim - 1 else 1) for i, d in enumerate(tuple(x.shape))]
112
+ return freqs_cis.reshape([*shape])
113
+
114
+ @staticmethod
115
+ def apply_rotary_emb(xq, xk, freqs_cis):
116
+ """
117
+ Apply rotary embeddings to input tensors using the given frequency
118
+ tensor.
119
+
120
+ This function applies rotary embeddings to the given query 'xq' and
121
+ key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The
122
+ input tensors are reshaped as complex numbers, and the frequency tensor
123
+ is reshaped for broadcasting compatibility. The resulting tensors
124
+ contain rotary embeddings and are returned as real tensors.
125
+
126
+ Args:
127
+ xq (paddle.Tensor): Query tensor to apply rotary embeddings.
128
+ xk (paddle.Tensor): Key tensor to apply rotary embeddings.
129
+ freqs_cis (paddle.Tensor): Precomputed frequency tensor for complex
130
+ exponentials.
131
+
132
+ Returns:
133
+ Tuple[paddle.Tensor, paddle.Tensor]: Tuple of modified query tensor
134
+ and key tensor with rotary embeddings.
135
+ """
136
+ with paddle.amp.auto_cast(enable=False):
137
+ xq_ = paddle.as_complex(xq.cast("float32").reshape([*tuple(xq.shape)[:-1], -1, 2]))
138
+ xk_ = paddle.as_complex(xk.cast("float32").reshape([*tuple(xk.shape)[:-1], -1, 2]))
139
+ freqs_cis = Attention.reshape_for_broadcast(freqs_cis, xq_)
140
+ xq_out = paddle.as_real(xq_ * freqs_cis).flatten(start_axis=3)
141
+ xk_out = paddle.as_real(xk_ * freqs_cis).flatten(start_axis=3)
142
+ return xq_out.cast(xq.dtype), xk_out.cast(xk.dtype)
143
+
144
+ def forward(self, x, freqs_cis, y, y_mask):
145
+ """
146
+ Forward pass of the attention module.
147
+
148
+ Args:
149
+ x (paddle.Tensor): Input tensor.
+ freqs_cis (paddle.Tensor): Precomputed frequency tensor.
+ y (paddle.Tensor): Caption features used for the gated cross attention (when `y_dim` > 0).
+ y_mask (paddle.Tensor): Attention mask for the caption features.
151
+
152
+ Returns:
153
+ paddle.Tensor: Output tensor after attention.
154
+
155
+ """
156
+ bsz, seqlen, _ = tuple(x.shape)
157
+ xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
158
+ dtype = xq.dtype
159
+
160
+ xq = self.q_norm(xq)
161
+ xk = self.k_norm(xk)
162
+
163
+ xq = xq.reshape([bsz, seqlen, self.n_local_heads, self.head_dim])
164
+ xk = xk.reshape([bsz, seqlen, self.n_local_kv_heads, self.head_dim])
165
+ xv = xv.reshape([bsz, seqlen, self.n_local_kv_heads, self.head_dim])
166
+
167
+ xq, xk = Attention.apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
168
+ xq, xk = xq.cast(dtype), xk.cast(dtype)
169
+
170
+ n_rep = self.n_local_heads // self.n_local_kv_heads
171
+
172
+ if dtype in [paddle.float16, paddle.bfloat16]:
173
+ output, _ = flash_attention(
174
+ xq,
175
+ xk,
176
+ xv,
177
+ dropout=0.0,
178
+ causal=False,
179
+ return_softmax=False,
180
+ )
181
+ else:
182
+ if n_rep > 1:
183
+ xk = xk.unsqueeze(axis=3).tile([1, 1, 1, n_rep, 1]).flatten(start_axis=2, stop_axis=3)
184
+ xv = xv.unsqueeze(axis=3).tile([1, 1, 1, n_rep, 1]).flatten(start_axis=2, stop_axis=3)
185
+ if self.fused_attn:
186
+ output = F.scaled_dot_product_attention_(
187
+ xq,
188
+ xk,
189
+ xv,
190
+ dropout_p=0.0,
191
+ is_causal=False,
192
+ )
193
+ else:
194
+ q = xq.transpose([0, 2, 1, 3]) * self.scale
195
+ attn = q @ xk.transpose([0, 2, 1, 3]).transpose([0, 1, 3, 2])
196
+ attn = F.softmax(attn, axis=-1)
197
+ output = attn @ xv.transpose([0, 2, 1, 3])
198
+ output = output.transpose([0, 2, 1, 3])
199
+
200
+ output = output.flatten(start_axis=-2)
201
+
202
+ if hasattr(self, "wk_y"):
203
+ yk = self.ky_norm(self.wk_y(y)).reshape([bsz, -1, self.n_local_kv_heads, self.head_dim])
204
+ yv = self.wv_y(y).reshape([bsz, -1, self.n_local_kv_heads, self.head_dim])
205
+ n_rep = self.n_local_heads // self.n_local_kv_heads
206
+
207
+ y_mask = y_mask.reshape([bsz, 1, 1, -1]).expand([bsz, self.n_local_heads, seqlen, -1])
208
+
209
+ if dtype in [paddle.float16, paddle.bfloat16]:
210
+ output_y = scaled_dot_product_attention(
211
+ xq,
212
+ yk,
213
+ yv,
214
+ attn_mask=y_mask.cast(dtype), # no need to transpose
215
+ )
216
+ else:
217
+ if n_rep > 1:
218
+ yk = yk.unsqueeze(3).tile([1, 1, 1, n_rep, 1]).flatten(2, 3)
219
+ yv = yv.unsqueeze(3).tile([1, 1, 1, n_rep, 1]).flatten(2, 3)
220
+
221
+ output_y = F.scaled_dot_product_attention_(
222
+ xq,
223
+ yk,
224
+ yv,
225
+ attn_mask=y_mask,
226
+ )
227
+
228
+ output_y = output_y * self.gate.tanh().reshape([1, 1, -1, 1])
229
+ output_y = output_y.flatten(-2)
230
+ output = output + output_y
231
+
232
+ return self.wo(output)
233
+
234
+
235
+ class TransformerBlock(nn.Layer):
236
+ def __init__(
237
+ self,
238
+ layer_id: int,
239
+ dim: int,
240
+ n_heads: int,
241
+ n_kv_heads: int,
242
+ multiple_of: int,
243
+ mlp_ratio: float,
244
+ ffn_dim_multiplier: float,
245
+ norm_eps: float,
246
+ qk_norm: bool,
247
+ fused_attn: bool,
248
+ y_dim: int,
249
+ ) -> None:
250
+ """
251
+ Initialize a TransformerBlock.
252
+
253
+ Args:
254
+ layer_id (int): Identifier for the layer.
255
+ dim (int): Embedding dimension of the input features.
256
+ n_heads (int): Number of attention heads.
257
+ n_kv_heads (Optional[int]): Number of attention heads in key and
258
+ value features (if using GQA), or set to None for the same as
259
+ query.
260
+ multiple_of (int): Value to ensure hidden dimension is a multiple
261
+ of this value in the FeedForward block.
262
+ ffn_dim_multiplier (float, optional): Custom multiplier for hidden
263
+ dimension in the FeedForward block. Defaults to None.
264
+ norm_eps (float): A small value added to the norm layer
265
+ denominators to avoid division-by-zero.
266
+
267
+ Attributes:
268
+ n_heads (int): Number of attention heads.
269
+ dim (int): Dimension size of the model.
270
+ head_dim (int): Dimension size of each attention head.
271
+ attention (Attention): Attention module.
272
+ feed_forward (FeedForward): FeedForward module.
273
+ layer_id (int): Identifier for the layer.
274
+ attention_norm (RMSNorm): Layer normalization for attention output.
275
+ ffn_norm (RMSNorm): Layer normalization for feedforward output.
276
+ adaLN_modulation (nn.Sequential): A small network to generate
277
+ feature modulation factors.
278
+
279
+ """
280
+ super().__init__()
281
+ self.dim = dim
282
+ self.head_dim = dim // n_heads
283
+ self.attention = Attention(dim, n_heads, n_kv_heads, qk_norm, fused_attn, y_dim)
284
+ mlp_hidden_dim = int(dim * mlp_ratio)
285
+ self.feed_forward = FeedForward(
286
+ dim=dim, hidden_dim=mlp_hidden_dim, multiple_of=multiple_of, ffn_dim_multiplier=ffn_dim_multiplier
287
+ )
288
+ self.layer_id = layer_id
289
+ self.attention_norm = nn.LayerNorm(dim, epsilon=norm_eps, bias_attr=False)
290
+ self.ffn_norm = nn.LayerNorm(dim, epsilon=norm_eps, bias_attr=False)
291
+
292
+ self.adaLN_modulation = nn.Sequential(
293
+ nn.Silu(),
294
+ nn.Linear(min(dim, 1024), 6 * dim),
295
+ )
296
+ self.attention_y_norm = nn.LayerNorm(y_dim, epsilon=norm_eps, bias_attr=False)
297
+
298
+ def forward(self, x, y, y_mask, freqs_cis, adaln_input=None):
299
+ """
300
+ Perform a forward pass through the TransformerBlock.
301
+
302
+ Args:
303
+ x (paddle.Tensor): Input tensor.
+ y (paddle.Tensor): Caption features for cross attention.
+ y_mask (paddle.Tensor): Attention mask for the caption features.
+ freqs_cis (paddle.Tensor): Precomputed cosine and sine frequencies.
+ adaln_input (paddle.Tensor, optional): Conditioning vector for adaLN
+ modulation. Defaults to None.
307
+
308
+ Returns:
309
+ paddle.Tensor: Output tensor after applying attention and
310
+ feedforward layers.
311
+
312
+ """
313
+ y = y.cast(x.dtype)
314
+ if adaln_input is not None:
315
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(
316
+ 6, axis=1
317
+ )
318
+ h = x + gate_msa.unsqueeze(1) * self.attention(
319
+ modulate(self.attention_norm(x), shift_msa, scale_msa), freqs_cis, self.attention_y_norm(y), y_mask
320
+ )
321
+ out = h + gate_mlp.unsqueeze(1) * self.feed_forward(modulate(self.ffn_norm(h), shift_mlp, scale_mlp))
322
+ else:
323
+ h = x + self.attention(self.attention_norm(x), freqs_cis, self.attention_y_norm(y), y_mask)
324
+ out = h + self.feed_forward(self.ffn_norm(h))
325
+ return out
326
+
327
+
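A standalone sketch of the adaLN gating used in `TransformerBlock.forward` above. `modulate` is imported from `dit_llama` and is not shown in this diff; the `x * (1 + scale) + shift` form below is the standard DiT definition and is stated here as an assumption.

# Hedged sketch of the adaLN pattern: one projection produces six modulation vectors,
# the scale/shift pair is applied per token, and the gate weights the residual branch.
import paddle

batch, seq, dim = 2, 4, 8
adaln_input = paddle.randn([batch, dim])
to_mods = paddle.nn.Linear(dim, 6 * dim)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = to_mods(adaln_input).chunk(6, axis=1)

x = paddle.randn([batch, seq, dim])
modulated = x * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)   # assumed body of modulate(...)
h = x + gate_msa.unsqueeze(1) * modulated                               # gated residual, as in the block above
print(h.shape)                                                          # [2, 4, 8]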
328
+ class DiTLLaMAT2IModel(ModelMixin, ConfigMixin):
329
+ _supports_gradient_checkpointing = True
330
+ _use_memory_efficient_attention_xformers = True
331
+
332
+ @register_to_config
333
+ def __init__(
334
+ self,
335
+ patch_size: int = 2,
336
+ in_channels: int = 4,
337
+ out_channels: int = 8,
338
+ max_seq_len: int = 4224,
339
+ num_layers: int = 32,
340
+ num_attention_heads: int = 16,
341
+ attention_head_dim: int = 96,
342
+ mlp_ratio: float = 4.0,
343
+ n_kv_heads=None,
344
+ multiple_of: int = 256,
345
+ ffn_dim_multiplier=None,
346
+ norm_eps: float = 1e-05,
347
+ learn_sigma: bool = True,
348
+ qk_norm: bool = True,
349
+ cap_feat_dim: int = 4096,
350
+ rope_scaling_factor: float = 1.0,
351
+ ):
352
+ super().__init__()
353
+ self.max_seq_len = max_seq_len
354
+ self.patch_size = patch_size
355
+ self.in_channels = in_channels
356
+ self.out_channels = in_channels * 2 if learn_sigma else in_channels
357
+ dim = attention_head_dim * num_attention_heads
358
+
359
+ self.num_layers = num_layers
360
+ self.num_attention_heads = num_attention_heads
361
+ self.mlp_ratio = mlp_ratio
362
+ self.multiple_of = multiple_of
363
+ self.ffn_dim_multiplier = ffn_dim_multiplier
364
+ self.norm_eps = norm_eps
365
+ self.learn_sigma = learn_sigma
366
+ self.qk_norm = qk_norm
367
+
368
+ self.gradient_checkpointing = True
369
+ self.fused_attn = True
370
+
371
+ self.x_embedder = nn.Linear(in_channels * patch_size**2, dim)
372
+ self.t_embedder = TimestepEmbedder(min(dim, 1024))
373
+ self.cap_embedder = nn.Sequential(
374
+ nn.LayerNorm(cap_feat_dim),
375
+ nn.Linear(cap_feat_dim, min(dim, 1024)),
376
+ )
377
+
378
+ # 2. Define transformers blocks
379
+ self.layers = nn.LayerList(
380
+ [
381
+ TransformerBlock(
382
+ layer_id=idx,
383
+ dim=dim,
384
+ n_heads=num_attention_heads,
385
+ n_kv_heads=n_kv_heads,
386
+ multiple_of=multiple_of,
387
+ mlp_ratio=mlp_ratio,
388
+ ffn_dim_multiplier=ffn_dim_multiplier,
389
+ norm_eps=norm_eps,
390
+ qk_norm=qk_norm,
391
+ fused_attn=self.fused_attn,
392
+ y_dim=cap_feat_dim,
393
+ )
394
+ for idx in range(num_layers)
395
+ ]
396
+ )
397
+
398
+ # 3. Define output layers
399
+ self.final_layer = FinalLayer(dim, patch_size, self.out_channels)
400
+ self.freqs_cis = self.precompute_freqs_cis(
401
+ dim // num_attention_heads, max_seq_len, rope_scaling_factor=rope_scaling_factor
402
+ )
403
+ self.eol_token = self.create_parameter(shape=[dim])
404
+ self.pad_token = self.create_parameter(shape=[dim])
405
+
406
+ def _set_gradient_checkpointing(self, module, value=False):
407
+ if hasattr(module, "gradient_checkpointing"):
408
+ module.gradient_checkpointing = value
409
+
410
+ def enable_gradient_checkpointing(self, value=True):
411
+ self.gradient_checkpointing = value
412
+
413
+ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[str] = None):
414
+ self._use_memory_efficient_attention_xformers = True
415
+ self.fused_attn = True
416
+
417
+ def unpatchify(self, x, img_size, return_tensor=False):
418
+ """
419
+ Args:
420
+ x: (N, T, patch_size**2 * C)
421
+ imgs: (N, H, W, C)
422
+ """
423
+ pH = pW = self.patch_size
424
+ if return_tensor:
425
+ H, W = img_size[0]
426
+ B = x.shape[0]
427
+ L = (H // pH) * (W // pW + 1) # one additional for eol
428
+ x = x[:, :L].reshape([B, H // pH, W // pW + 1, pH, pW, self.out_channels])
429
+ x = x[:, :, :-1]
430
+ x = x.transpose([0, 5, 1, 3, 2, 4]).flatten(4, 5).flatten(2, 3)
431
+ return x
432
+ else:
433
+ imgs = []
434
+ for i in range(x.shape[0]):
435
+ H, W = img_size[i]
436
+ L = (H // pH) * (W // pW + 1)
437
+ imgs.append(
438
+ x[i][:L]
439
+ .reshape([H // pH, W // pW + 1, pH, pW, self.out_channels])[:, :-1, :, :, :]
440
+ .transpose([4, 0, 2, 1, 3])
441
+ .flatten(3, 4)
442
+ .flatten(1, 2)
443
+ )
444
+ return imgs
445
+
446
+ def patchify_and_embed(self, x):
447
+ if isinstance(x, paddle.Tensor):
448
+ pH = pW = self.patch_size
449
+ B, C, H, W = x.shape[:]
450
+ x = x.reshape([B, C, H // pH, pH, W // pW, pW]).transpose([0, 2, 4, 1, 3, 5]).flatten(3)
451
+ x = self.x_embedder(x)
452
+
453
+ x = paddle.concat(
454
+ [
455
+ x,
456
+ self.eol_token.reshape([1, 1, 1, -1]).expand([B, H // pH, 1, -1]),
457
+ ],
458
+ axis=2,
459
+ )
460
+ x = x.flatten(1, 2)
461
+
462
+ if x.shape[1] < self.max_seq_len:
463
+ x = paddle.concat(
464
+ [
465
+ x,
466
+ self.pad_token.reshape([1, 1, -1]).expand([B, self.max_seq_len - x.shape[1], -1]),
467
+ ],
468
+ axis=1,
469
+ )
470
+ return x, [(H, W)] * B
471
+ else:
472
+ pH = pW = self.patch_size
473
+ x_embed = []
474
+ img_size = []
475
+ for img in x:
476
+ C, H, W = img.shape[:]
477
+ img_size.append((H, W))
478
+ img = img.reshape([C, H // pH, pH, W // pW, pW]).transpose([1, 3, 0, 2, 4]).flatten(2)
479
+ img = self.x_embedder(img)
480
+ img = paddle.concat(
481
+ [
482
+ img,
483
+ self.eol_token.reshape([1, 1, -1]).expand([H // pH, 1, -1]),
484
+ ],
485
+ axis=1,
486
+ )
487
+ img = img.flatten(0, 1)
488
+ if img.shape[0] < self.max_seq_len:
489
+ img = paddle.concat(
490
+ [
491
+ img,
492
+ self.pad_token.reshape([1, -1]).expand([self.max_seq_len - img.shape[0], -1]),
493
+ ],
494
+ axis=0,
495
+ )
496
+ x_embed.append(img)
497
+ x_embed = paddle.stack(x_embed, axis=0)
498
+ return x_embed, img_size
499
+
500
+ @staticmethod
501
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, rope_scaling_factor: float = 1.0):
502
+
503
+ """
504
+ Precompute the frequency tensor for complex exponentials (cis) with
505
+ given dimensions.
506
+
507
+ This function calculates a frequency tensor with complex exponentials
508
+ using the given dimension 'dim' and the end index 'end'. The 'theta'
509
+ parameter scales the frequencies. The returned tensor contains complex
510
+ values in complex64 data type.
511
+
512
+ Args:
513
+ dim (int): Dimension of the frequency tensor.
514
+ end (int): End index for precomputing frequencies.
515
+ theta (float, optional): Scaling factor for frequency computation.
516
+ Defaults to 10000.0.
517
+
518
+ Returns:
519
+ paddle.Tensor: Precomputed frequency tensor with complex
520
+ exponentials.
521
+ """
522
+ freqs = 1.0 / theta ** (paddle.arange(start=0, end=dim, step=2)[: dim // 2].cast("float32") / dim)
523
+ t = paddle.arange(end=end, dtype=paddle.float32)
524
+ t = t / rope_scaling_factor
525
+ input_0, vec2_0 = TypePromote(t, freqs)
526
+ freqs = paddle.outer(input_0, vec2_0).cast("float32")
527
+ freqs_cis = paddle.complex(
528
+ paddle.ones_like(freqs) * paddle.cos(freqs), paddle.ones_like(freqs) * paddle.sin(freqs)
529
+ )
530
+ return freqs_cis
531
+
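A hedged shape check for the rotary-embedding helpers defined in this file (`precompute_freqs_cis` here and `Attention.apply_rotary_emb` earlier); the import path is assumed from the file location.

# Hedged sketch: freqs_cis has shape (seq_len, head_dim // 2) and is complex-valued;
# apply_rotary_emb rotates query/key without changing their shapes or dtypes.
import paddle
from ppdiffusers.models.dit_llama_t2i import Attention, DiTLLaMAT2IModel  # assumed import path

head_dim, seq_len, n_heads = 64, 16, 8
freqs_cis = DiTLLaMAT2IModel.precompute_freqs_cis(head_dim, seq_len)
xq = paddle.randn([2, seq_len, n_heads, head_dim])
xk = paddle.randn([2, seq_len, n_heads, head_dim])
xq_rot, xk_rot = Attention.apply_rotary_emb(xq, xk, freqs_cis)
print(freqs_cis.shape, xq_rot.shape, xk_rot.shape)   # [16, 32] [2, 16, 8, 64] [2, 16, 8, 64]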
532
+ def forward(
533
+ self,
534
+ hidden_states: paddle.Tensor,
535
+ timestep: paddle.Tensor,
536
+ cap_feats: paddle.Tensor,
537
+ cap_mask: paddle.Tensor,
538
+ return_dict: bool = True,
539
+ ):
540
+ """
541
+ Args:
542
+ hidden_states: (N, C, H, W) tensor of spatial inputs (images or latent
543
+ representations of images)
544
+ timestep: (N,) tensor of diffusion timesteps
545
+ class_labels: (N,) tensor of class labels
546
+ """
547
+ hidden_states = hidden_states.cast(self.dtype)
548
+ timestep = timestep.cast(self.dtype)
549
+
550
+ # 1. Input
551
+ x_is_tensor = isinstance(hidden_states, paddle.Tensor)
552
+ hidden_states, img_size = self.patchify_and_embed(hidden_states)
553
+
554
+ t = self.t_embedder(timestep).cast(self.dtype)
555
+ cap_mask_float = cap_mask.cast("float32").unsqueeze(-1)
556
+ cap_feats_pool = (cap_feats * cap_mask_float).sum(axis=1) / cap_mask_float.sum(axis=1)
557
+ cap_emb = self.cap_embedder(cap_feats_pool.cast(self.dtype))
558
+ adaln_input = t + cap_emb
559
+
560
+ # 2. Blocks
561
+ for i, layer in enumerate(self.layers):
562
+ if self.gradient_checkpointing:
563
+ hidden_states = paddle.distributed.fleet.utils.recompute(
564
+ layer, hidden_states, cap_feats, cap_mask, self.freqs_cis[: hidden_states.shape[1]], adaln_input
565
+ )
566
+ else:
567
+ hidden_states = layer(
568
+ hidden_states,
569
+ cap_feats,
570
+ cap_mask,
571
+ self.freqs_cis[: hidden_states.shape[1]],
572
+ adaln_input,
573
+ )
574
+
575
+ # 3. Output
576
+ hidden_states = self.final_layer(hidden_states, adaln_input)
577
+ output = self.unpatchify(hidden_states, img_size, return_tensor=x_is_tensor)
578
+
579
+ if not return_dict:
580
+ return (output,)
581
+
582
+ return Transformer2DModelOutput(sample=output)
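A small sketch of the token layout that `patchify_and_embed` and `unpatchify` above agree on; the latent size below is an assumed example.

# Minimal sketch: each row of patches is followed by one eol token, and the flattened
# sequence is then padded with the pad token up to max_seq_len.
patch_size = 2
H, W = 64, 64                                   # assumed latent height / width
rows, cols = H // patch_size, W // patch_size   # 32 x 32 patches
seq_len = rows * (cols + 1)                     # one eol token per row
print(rows, cols, seq_len)                      # 32 32 1056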
PaddleMIX/ppdiffusers/ppdiffusers/models/downsampling.py ADDED
@@ -0,0 +1,383 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Optional, Tuple
16
+
17
+ import paddle
18
+
19
+ from .normalization import RMSNorm
20
+ from .upsampling import upfirdn2d_native
21
+
22
+
23
+ class Downsample1D(paddle.nn.Layer):
24
+ """A 1D downsampling layer with an optional convolution.
25
+
26
+ Parameters:
27
+ channels (`int`):
28
+ number of channels in the inputs and outputs.
29
+ use_conv (`bool`, default `False`):
30
+ option to use a convolution.
31
+ out_channels (`int`, optional):
32
+ number of output channels. Defaults to `channels`.
33
+ padding (`int`, default `1`):
34
+ padding for the convolution.
35
+ name (`str`, default `conv`):
36
+ name of the downsampling 1D layer.
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ channels: int,
42
+ use_conv: bool = False,
43
+ out_channels: Optional[int] = None,
44
+ padding: int = 1,
45
+ name: str = "conv",
46
+ ):
47
+ super().__init__()
48
+ self.channels = channels
49
+ self.out_channels = out_channels or channels
50
+ self.use_conv = use_conv
51
+ self.padding = padding
52
+ stride = 2
53
+ self.name = name
54
+ if use_conv:
55
+ self.conv = paddle.nn.Conv1D(
56
+ in_channels=self.channels,
57
+ out_channels=self.out_channels,
58
+ kernel_size=3,
59
+ stride=stride,
60
+ padding=padding,
61
+ )
62
+ else:
63
+ assert self.channels == self.out_channels
64
+ self.conv = paddle.nn.AvgPool1D(kernel_size=stride, stride=stride, exclusive=False)
65
+
66
+ def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
67
+ assert tuple(inputs.shape)[1] == self.channels
68
+ return self.conv(inputs)
69
+
70
+
71
+ class Downsample2D(paddle.nn.Layer):
72
+ """A 2D downsampling layer with an optional convolution.
73
+
74
+ Parameters:
75
+ channels (`int`):
76
+ number of channels in the inputs and outputs.
77
+ use_conv (`bool`, default `False`):
78
+ option to use a convolution.
79
+ out_channels (`int`, optional):
80
+ number of output channels. Defaults to `channels`.
81
+ padding (`int`, default `1`):
82
+ padding for the convolution.
83
+ name (`str`, default `conv`):
84
+ name of the downsampling 2D layer.
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ channels: int,
90
+ use_conv: bool = False,
91
+ out_channels: Optional[int] = None,
92
+ padding: int = 1,
93
+ name: str = "conv",
94
+ kernel_size=3,
95
+ norm_type=None,
96
+ eps=None,
97
+ elementwise_affine=None,
98
+ bias=True,
99
+ ):
100
+ super().__init__()
101
+ self.channels = channels
102
+ self.out_channels = out_channels or channels
103
+ self.use_conv = use_conv
104
+ self.padding = padding
105
+ stride = 2
106
+ self.name = name
107
+ if norm_type == "ln_norm":
108
+ self.norm = paddle.nn.LayerNorm(
109
+ normalized_shape=channels, epsilon=eps, weight_attr=elementwise_affine, bias_attr=elementwise_affine
110
+ )
111
+ elif norm_type == "rms_norm":
112
+ self.norm = RMSNorm(channels, eps, elementwise_affine)
113
+ elif norm_type is None:
114
+ self.norm = None
115
+ else:
116
+ raise ValueError(f"unknown norm_type: {norm_type}")
117
+ if use_conv:
118
+ conv = paddle.nn.Conv2D(
119
+ in_channels=self.channels,
120
+ out_channels=self.out_channels,
121
+ kernel_size=kernel_size,
122
+ stride=stride,
123
+ padding=padding,
124
+ bias_attr=bias,
125
+ )
126
+ else:
127
+ assert self.channels == self.out_channels
128
+ conv = paddle.nn.AvgPool2D(kernel_size=stride, stride=stride, exclusive=False)
129
+ if name == "conv":
130
+ self.Conv2d_0 = conv
131
+ self.conv = conv
132
+ elif name == "Conv2d_0":
133
+ self.conv = conv
134
+ else:
135
+ self.conv = conv
136
+
137
+ def forward(self, hidden_states: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
138
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
139
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
140
+ print("scale", "1.0.0", deprecation_message)
141
+ assert tuple(hidden_states.shape)[1] == self.channels
142
+ if self.norm is not None:
143
+ hidden_states = self.norm(hidden_states.transpose(perm=[0, 2, 3, 1])).transpose(perm=[0, 3, 1, 2])
144
+ if self.use_conv and self.padding == 0:
145
+ pad = 0, 1, 0, 1
146
+ hidden_states = paddle.nn.functional.pad(
147
+ x=hidden_states, pad=pad, mode="constant", value=0, pad_from_left_axis=False
148
+ )
149
+ assert tuple(hidden_states.shape)[1] == self.channels
150
+ hidden_states = self.conv(hidden_states)
151
+ return hidden_states
152
+
153
+
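A hedged usage sketch for `Downsample2D` above (import path assumed from the file location). With the default `padding=1`, the stride-2 convolution halves H and W; with `padding=0`, the forward pads the right and bottom edges by one pixel first, which keeps the output at exactly H // 2 x W // 2.

# Hedged sketch: the default conv path of Downsample2D halves the spatial resolution.
import paddle
from ppdiffusers.models.downsampling import Downsample2D  # assumed import path

down = Downsample2D(channels=16, use_conv=True)   # Conv2D(kernel 3, stride 2, padding 1)
x = paddle.randn([1, 16, 64, 64])
print(down(x).shape)                              # [1, 16, 32, 32]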
154
+ class FirDownsample2D(paddle.nn.Layer):
155
+ """A 2D FIR downsampling layer with an optional convolution.
156
+
157
+ Parameters:
158
+ channels (`int`):
159
+ number of channels in the inputs and outputs.
160
+ use_conv (`bool`, default `False`):
161
+ option to use a convolution.
162
+ out_channels (`int`, optional):
163
+ number of output channels. Defaults to `channels`.
164
+ fir_kernel (`tuple`, default `(1, 3, 3, 1)`):
165
+ kernel for the FIR filter.
166
+ """
167
+
168
+ def __init__(
169
+ self,
170
+ channels: Optional[int] = None,
171
+ out_channels: Optional[int] = None,
172
+ use_conv: bool = False,
173
+ fir_kernel: Tuple[int, int, int, int] = (1, 3, 3, 1),
174
+ ):
175
+ super().__init__()
176
+ out_channels = out_channels if out_channels else channels
177
+ if use_conv:
178
+ self.Conv2d_0 = paddle.nn.Conv2D(
179
+ in_channels=channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1
180
+ )
181
+ self.fir_kernel = fir_kernel
182
+ self.use_conv = use_conv
183
+ self.out_channels = out_channels
184
+
185
+ def _downsample_2d(
186
+ self,
187
+ hidden_states: paddle.Tensor,
188
+ weight: Optional[paddle.Tensor] = None,
189
+ kernel: Optional[paddle.Tensor] = None,
190
+ factor: int = 2,
191
+ gain: float = 1,
192
+ ) -> paddle.Tensor:
193
+ """Fused `Conv2d()` followed by `downsample_2d()`.
194
+ Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
195
+ efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of
196
+ arbitrary order.
197
+
198
+ Args:
199
+ hidden_states (`paddle.Tensor`):
200
+ Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
201
+ weight (`paddle.Tensor`, *optional*):
202
+ Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be
203
+ performed by `inChannels = x.shape[0] // numGroups`.
204
+ kernel (`paddle.Tensor`, *optional*):
205
+ FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which
206
+ corresponds to average pooling.
207
+ factor (`int`, *optional*, default to `2`):
208
+ Integer downsampling factor.
209
+ gain (`float`, *optional*, default to `1.0`):
210
+ Scaling factor for signal magnitude.
211
+
212
+ Returns:
213
+ output (`paddle.Tensor`):
214
+ Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and same
215
+ datatype as `hidden_states`.
216
+ """
217
+ assert isinstance(factor, int) and factor >= 1
218
+ if kernel is None:
219
+ kernel = [1] * factor
220
+ kernel = paddle.to_tensor(data=kernel, dtype="float32")
221
+ if kernel.ndim == 1:
222
+ kernel = paddle.outer(x=kernel, y=kernel)
223
+ kernel /= paddle.sum(x=kernel)
224
+ kernel = kernel * gain
225
+ if self.use_conv:
226
+ _, _, convH, convW = tuple(weight.shape)
227
+ pad_value = tuple(kernel.shape)[0] - factor + (convW - 1)
228
+ stride_value = [factor, factor]
229
+ upfirdn_input = upfirdn2d_native(
230
+ hidden_states,
231
+ paddle.to_tensor(data=kernel, place=hidden_states.place),
232
+ pad=((pad_value + 1) // 2, pad_value // 2),
233
+ )
234
+ output = paddle.nn.functional.conv2d(x=upfirdn_input, weight=weight, stride=stride_value, padding=0)
235
+ else:
236
+ pad_value = tuple(kernel.shape)[0] - factor
237
+ output = upfirdn2d_native(
238
+ hidden_states,
239
+ paddle.to_tensor(data=kernel, place=hidden_states.place),
240
+ down=factor,
241
+ pad=((pad_value + 1) // 2, pad_value // 2),
242
+ )
243
+ return output
244
+
245
+ def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
246
+ if self.use_conv:
247
+ downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel)
248
+ hidden_states = downsample_input + self.Conv2d_0.bias.reshape([1, -1, 1, 1])
249
+ else:
250
+ hidden_states = self._downsample_2d(hidden_states, kernel=self.fir_kernel, factor=2)
251
+ return hidden_states
252
+
253
+
254
+ class KDownsample2D(paddle.nn.Layer):
255
+ """A 2D K-downsampling layer.
256
+
257
+ Parameters:
258
+ pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use.
259
+ """
260
+
261
+ def __init__(self, pad_mode: str = "reflect"):
262
+ super().__init__()
263
+ self.pad_mode = pad_mode
264
+ kernel_1d = paddle.to_tensor(data=[[1 / 8, 3 / 8, 3 / 8, 1 / 8]])
265
+ self.pad = tuple(kernel_1d.shape)[1] // 2 - 1
266
+ self.register_buffer(name="kernel", tensor=kernel_1d.T @ kernel_1d, persistable=False)
267
+
268
+ def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
269
+ inputs = paddle.nn.functional.pad(x=inputs, pad=(self.pad,) * 4, mode=self.pad_mode, pad_from_left_axis=False)
270
+ weight = paddle.zeros(
271
+ shape=[
272
+ tuple(inputs.shape)[1],
273
+ tuple(inputs.shape)[1],
274
+ tuple(self.kernel.shape)[0],
275
+ tuple(self.kernel.shape)[1],
276
+ ],
277
+ dtype=inputs.dtype,
278
+ )
279
+ indices = paddle.arange(end=tuple(inputs.shape)[1])
280
+ kernel = self.kernel.to(weight)[None, :].expand(shape=[tuple(inputs.shape)[1], -1, -1])
281
+ weight[indices, indices] = kernel
282
+ return paddle.nn.functional.conv2d(x=inputs, weight=weight, stride=2)
283
+
284
+
285
+ class CogVideoXDownsample3D(paddle.nn.Layer):
286
+ """
287
+ A 3D downsampling layer used in [CogVideoX]() by Tsinghua University & ZhipuAI.
288
+
289
+ Args:
290
+ in_channels (`int`):
291
+ Number of channels in the input image.
292
+ out_channels (`int`):
293
+ Number of channels produced by the convolution.
294
+ kernel_size (`int`, defaults to `3`):
295
+ Size of the convolving kernel.
296
+ stride (`int`, defaults to `2`):
297
+ Stride of the convolution.
298
+ padding (`int`, defaults to `0`):
299
+ Padding added to all four sides of the input.
300
+ compress_time (`bool`, defaults to `False`):
301
+ Whether or not to compress the time dimension.
302
+ """
303
+
304
+ def __init__(
305
+ self,
306
+ in_channels: int,
307
+ out_channels: int,
308
+ kernel_size: int = 3,
309
+ stride: int = 2,
310
+ padding: int = 0,
311
+ compress_time: bool = False,
312
+ ):
313
+ super().__init__()
314
+ self.conv = paddle.nn.Conv2D(
315
+ in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding
316
+ )
317
+ self.compress_time = compress_time
318
+
319
+ def forward(self, x: paddle.Tensor) -> paddle.Tensor:
320
+ if self.compress_time:
321
+ batch_size, channels, frames, height, width = tuple(x.shape)
322
+ x = x.transpose(perm=[0, 3, 4, 1, 2]).reshape([batch_size * height * width, channels, frames])
323
+ if tuple(x.shape)[-1] % 2 == 1:
324
+ x_first, x_rest = x[..., 0], x[..., 1:]
325
+ if tuple(x_rest.shape)[-1] > 0:
326
+ x_rest = paddle.nn.functional.avg_pool1d(kernel_size=2, stride=2, x=x_rest, exclusive=False)
327
+ x = paddle.concat(x=[x_first[..., None], x_rest], axis=-1)
328
+ x = x.reshape([batch_size, height, width, channels, tuple(x.shape)[-1]]).transpose(
329
+ perm=[0, 3, 4, 1, 2]
330
+ )
331
+ else:
332
+ x = paddle.nn.functional.avg_pool1d(kernel_size=2, stride=2, x=x, exclusive=False)
333
+ x = x.reshape([batch_size, height, width, channels, tuple(x.shape)[-1]]).transpose(
334
+ perm=[0, 3, 4, 1, 2]
335
+ )
336
+ pad = (0, 1, 0, 1, 0, 0)
337
+ x = paddle.nn.functional.pad(x=x, pad=pad, mode="constant", value=0, data_format="NCDHW")
338
+ batch_size, channels, frames, height, width = tuple(x.shape)
339
+ x = x.transpose(perm=[0, 2, 1, 3, 4]).reshape([batch_size * frames, channels, height, width])
340
+ x = self.conv(x)
341
+ x = x.reshape([batch_size, frames, tuple(x.shape)[1], tuple(x.shape)[2], tuple(x.shape)[3]]).transpose(
342
+ perm=[0, 2, 1, 3, 4]
343
+ )
344
+ return x
345
+
346
+
347
+ def downsample_2d(
348
+ hidden_states: paddle.Tensor, kernel: Optional[paddle.Tensor] = None, factor: int = 2, gain: float = 1
349
+ ) -> paddle.Tensor:
350
+ """Downsample2D a batch of 2D images with the given filter.
351
+ Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the
352
+ given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the
353
+ specified `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its
354
+ shape is a multiple of the downsampling factor.
355
+
356
+ Args:
357
+ hidden_states (`paddle.Tensor`):
358
+ Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
359
+ kernel (`paddle.Tensor`, *optional*):
360
+ FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which
361
+ corresponds to average pooling.
362
+ factor (`int`, *optional*, default to `2`):
363
+ Integer downsampling factor.
364
+ gain (`float`, *optional*, default to `1.0`):
365
+ Scaling factor for signal magnitude.
366
+
367
+ Returns:
368
+ output (`paddle.Tensor`):
369
+ Tensor of the shape `[N, C, H // factor, W // factor]`
370
+ """
371
+ assert isinstance(factor, int) and factor >= 1
372
+ if kernel is None:
373
+ kernel = [1] * factor
374
+ kernel = paddle.to_tensor(data=kernel, dtype="float32")
375
+ if kernel.ndim == 1:
376
+ kernel = paddle.outer(x=kernel, y=kernel)
377
+ kernel /= paddle.sum(x=kernel)
378
+ kernel = kernel * gain
379
+ pad_value = tuple(kernel.shape)[0] - factor
380
+ output = upfirdn2d_native(
381
+ hidden_states, kernel.to(device=hidden_states.place), down=factor, pad=((pad_value + 1) // 2, pad_value // 2)
382
+ )
383
+ return output
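A hedged sketch of the module-level `downsample_2d` above (import path assumed). As its docstring notes, the default kernel `[1] * factor` corresponds to average pooling; here only the output shape is checked.

# Hedged sketch: factor-2 downsampling of an 8x8 feature map yields a 4x4 map.
import paddle
from ppdiffusers.models.downsampling import downsample_2d  # assumed import path

x = paddle.randn([1, 3, 8, 8])
out = downsample_2d(x, factor=2)
print(out.shape)   # [1, 3, 4, 4]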
PaddleMIX/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py ADDED
@@ -0,0 +1,158 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Optional
15
+
16
+ import paddle.nn as nn
17
+
18
+ from .transformer_2d import Transformer2DModel, Transformer2DModelOutput
19
+
20
+
21
+ class DualTransformer2DModel(nn.Layer):
22
+ """
23
+ Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference.
24
+
25
+ Parameters:
26
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
27
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
28
+ in_channels (`int`, *optional*):
29
+ Pass if the input is continuous. The number of channels in the input and output.
30
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
31
+ dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
32
+ cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
33
+ sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
34
+ Note that this is fixed at training time as it is used for learning a number of position embeddings. See
35
+ `ImagePositionalEmbeddings`.
36
+ num_vector_embeds (`int`, *optional*):
37
+ Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
38
+ Includes the class for the masked latent pixel.
39
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
40
+ num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
41
+ The number of diffusion steps used during training. Note that this is fixed at training time as it is used
42
+ to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
43
+ up to, but not more than, `num_embeds_ada_norm` steps.
44
+ attention_bias (`bool`, *optional*):
45
+ Configure if the TransformerBlocks' attention should contain a bias parameter.
46
+ """
47
+
48
+ def __init__(
49
+ self,
50
+ num_attention_heads: int = 16,
51
+ attention_head_dim: int = 88,
52
+ in_channels: Optional[int] = None,
53
+ num_layers: int = 1,
54
+ dropout: float = 0.0,
55
+ norm_num_groups: int = 32,
56
+ cross_attention_dim: Optional[int] = None,
57
+ attention_bias: bool = False,
58
+ sample_size: Optional[int] = None,
59
+ num_vector_embeds: Optional[int] = None,
60
+ activation_fn: str = "geglu",
61
+ num_embeds_ada_norm: Optional[int] = None,
62
+ ):
63
+ super().__init__()
64
+ self.transformers = nn.LayerList(
65
+ [
66
+ Transformer2DModel(
67
+ num_attention_heads=num_attention_heads,
68
+ attention_head_dim=attention_head_dim,
69
+ in_channels=in_channels,
70
+ num_layers=num_layers,
71
+ dropout=dropout,
72
+ norm_num_groups=norm_num_groups,
73
+ cross_attention_dim=cross_attention_dim,
74
+ attention_bias=attention_bias,
75
+ sample_size=sample_size,
76
+ num_vector_embeds=num_vector_embeds,
77
+ activation_fn=activation_fn,
78
+ num_embeds_ada_norm=num_embeds_ada_norm,
79
+ )
80
+ for _ in range(2)
81
+ ]
82
+ )
83
+
84
+ # Variables that can be set by a pipeline:
85
+
86
+ # The ratio of transformer1 to transformer2's output states to be combined during inference
87
+ self.mix_ratio = 0.5
88
+
89
+ # The shape of `encoder_hidden_states` is expected to be
90
+ # `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)`
91
+ self.condition_lengths = [77, 257]
92
+
93
+ # Which transformer to use to encode which condition.
94
+ # E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])`
95
+ self.transformer_index_for_condition = [1, 0]
96
+
97
+ def forward(
98
+ self,
99
+ hidden_states,
100
+ encoder_hidden_states,
101
+ timestep=None,
102
+ added_cond_kwargs=None,
103
+ class_labels=None,
104
+ cross_attention_kwargs=None,
105
+ attention_mask=None,
106
+ encoder_attention_mask=None,
107
+ return_dict: bool = True,
108
+ ):
109
+ """
110
+ Args:
111
+ hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`.
112
+ When continuous, `paddle.Tensor` of shape `(batch size, channel, height, width)`): Input
113
+ hidden_states.
114
+ encoder_hidden_states ( `paddle.Tensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
115
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
116
+ self-attention.
117
+ timestep ( `paddle.Tensor`, *optional*):
118
+ Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
119
+ attention_mask (`paddle.Tensor`, *optional*):
120
+ Optional attention mask to be applied in Attention.
121
+ cross_attention_kwargs (`dict`, *optional*):
122
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
123
+ `self.processor` in
124
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
125
+ return_dict (`bool`, *optional*, defaults to `True`):
126
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
127
+
128
+ Returns:
129
+ [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
130
+ [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
131
+ returning a tuple, the first element is the sample tensor.
132
+ """
133
+ input_states = hidden_states
134
+
135
+ encoded_states = []
136
+ tokens_start = 0
137
+ # attention_mask is not used yet
138
+ for i in range(2):
139
+ # for each of the two transformers, pass the corresponding condition tokens
140
+ condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]]
141
+ transformer_index = self.transformer_index_for_condition[i]
142
+ encoded_state = self.transformers[transformer_index](
143
+ input_states,
144
+ encoder_hidden_states=condition_state,
145
+ timestep=timestep,
146
+ cross_attention_kwargs=cross_attention_kwargs,
147
+ return_dict=False,
148
+ )[0]
149
+ encoded_states.append(encoded_state - input_states)
150
+ tokens_start += self.condition_lengths[i]
151
+
152
+ output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio)
153
+ output_states = output_states + input_states
154
+
155
+ if not return_dict:
156
+ return (output_states,)
157
+
158
+ return Transformer2DModelOutput(sample=output_states)
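A minimal sketch of the conditioning split performed in `DualTransformer2DModel.forward` above; it only reproduces the index arithmetic, using the default `condition_lengths` and `transformer_index_for_condition` set in `__init__`.

# Minimal sketch: encoder_hidden_states packs a 77-token and a 257-token condition along
# axis 1; each slice is routed to the *other* transformer via transformer_index_for_condition.
condition_lengths = [77, 257]
transformer_index_for_condition = [1, 0]

tokens_start = 0
for i, length in enumerate(condition_lengths):
    token_range = (tokens_start, tokens_start + length)
    print(f"condition {i}: tokens {token_range} -> transformers[{transformer_index_for_condition[i]}]")
    tokens_start += length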
PaddleMIX/ppdiffusers/ppdiffusers/models/lora.py ADDED
@@ -0,0 +1,462 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ # IMPORTANT: #
17
+ ###################################################################
18
+ # ----------------------------------------------------------------#
19
+ # This file is deprecated and will be removed soon #
20
+ # (as soon as PEFT will become a required dependency for LoRA) #
21
+ # ----------------------------------------------------------------#
22
+ ###################################################################
23
+
24
+ import contextlib
25
+ from typing import Optional, Tuple, Union
26
+
27
+ import paddle
28
+ from paddle import nn
29
+
30
+ from ppdiffusers.transformers import CLIPTextModel, CLIPTextModelWithProjection
31
+
32
+ from ..utils import logging
33
+
34
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
+
36
+
37
+ def text_encoder_attn_modules(text_encoder):
38
+ attn_modules = []
39
+
40
+ if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
41
+ for i, layer in enumerate(text_encoder.text_model.encoder.layers):
42
+ name = f"text_model.encoder.layers.{i}.self_attn"
43
+ mod = layer.self_attn
44
+ attn_modules.append((name, mod))
45
+ else:
46
+ raise ValueError(f"do not know how to get attention modules for: {text_encoder.__class__.__name__}")
47
+
48
+ return attn_modules
49
+
50
+
51
+ def text_encoder_mlp_modules(text_encoder):
52
+ mlp_modules = []
53
+
54
+ if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
55
+ for i, layer in enumerate(text_encoder.text_model.encoder.layers):
56
+ mlp_mod = layer.mlp
57
+ name = f"text_model.encoder.layers.{i}.mlp"
58
+ mlp_modules.append((name, mlp_mod))
59
+ else:
60
+ raise ValueError(f"do not know how to get mlp modules for: {text_encoder.__class__.__name__}")
61
+
62
+ return mlp_modules
63
+
64
+
65
+ def adjust_lora_scale_text_encoder(text_encoder, lora_scale: float = 1.0):
66
+ for _, attn_module in text_encoder_attn_modules(text_encoder):
67
+ if isinstance(attn_module.q_proj, PatchedLoraProjection):
68
+ attn_module.q_proj.lora_scale = lora_scale
69
+ attn_module.k_proj.lora_scale = lora_scale
70
+ attn_module.v_proj.lora_scale = lora_scale
71
+ attn_module.out_proj.lora_scale = lora_scale
72
+
73
+ for _, mlp_module in text_encoder_mlp_modules(text_encoder):
74
+ if isinstance(mlp_module.fc1, PatchedLoraProjection):
75
+ mlp_module.fc1.lora_scale = lora_scale
76
+ mlp_module.fc2.lora_scale = lora_scale
77
+
78
+
79
+ class PatchedLoraProjection(nn.Layer):
80
+ def __init__(self, regular_linear_layer, lora_scale=1, network_alpha=None, rank=4, dtype=None):
81
+ super().__init__()
82
+ from ..models.lora import LoRALinearLayer
83
+
84
+ self.regular_linear_layer = regular_linear_layer
85
+
86
+ if dtype is None:
87
+ dtype = self.regular_linear_layer.weight.dtype
88
+
89
+ self.lora_linear_layer = LoRALinearLayer(
90
+ self.regular_linear_layer.in_features,
91
+ self.regular_linear_layer.out_features,
92
+ network_alpha=network_alpha,
93
+ dtype=dtype,
94
+ rank=rank,
95
+ )
96
+
97
+ self.lora_scale = lora_scale
98
+
99
+ # overwrite Paddle's `state_dict` to be sure that only the 'regular_linear_layer' weights are saved
100
+ # when saving the whole text encoder model and when LoRA is unloaded or fused
101
+ def state_dict(self, destination=None, include_sublayers=True, structured_name_prefix="", use_hook=True):
102
+ if self.lora_linear_layer is None:
103
+ return self.regular_linear_layer.state_dict(
104
+ destination=destination,
105
+ include_sublayers=include_sublayers,
106
+ structured_name_prefix=structured_name_prefix,
107
+ use_hook=use_hook,
108
+ )
109
+
110
+ return super().state_dict(
111
+ destination=destination,
112
+ include_sublayers=include_sublayers,
113
+ structured_name_prefix=structured_name_prefix,
114
+ use_hook=use_hook,
115
+ )
116
+
117
+ def _fuse_lora(self, lora_scale=1.0, safe_fusing=False):
118
+ if self.lora_linear_layer is None:
119
+ return
120
+
121
+ dtype = self.regular_linear_layer.weight.dtype
122
+
123
+ w_orig = self.regular_linear_layer.weight.cast("float32")
124
+ w_up = self.lora_linear_layer.up.weight.cast("float32")
125
+ w_down = self.lora_linear_layer.down.weight.cast("float32")
126
+
127
+ if self.lora_linear_layer.network_alpha is not None:
128
+ w_up = w_up * self.lora_linear_layer.network_alpha / self.lora_linear_layer.rank
129
+
130
+ fused_weight = w_orig + (lora_scale * paddle.matmul(w_down[None, :], w_up[None, :])[0])
131
+
132
+ if safe_fusing and paddle.isnan(fused_weight).any().item():
133
+ raise ValueError(
134
+ "This LoRA weight seems to be broken. "
135
+ f"Encountered NaN values when trying to fuse LoRA weights for {self}."
136
+ "LoRA weights will not be fused."
137
+ )
138
+
139
+ self.regular_linear_layer.weight.copy_(fused_weight.cast(dtype=dtype), False)
140
+
141
+ # we can drop the lora layer now
142
+ self.lora_linear_layer = None
143
+
144
+ # offload the up and down matrices to CPU to not blow the memory
145
+ self.w_up = w_up.cpu()
146
+ self.w_down = w_down.cpu()
147
+ self.lora_scale = lora_scale
148
+
149
+ def _unfuse_lora(self):
150
+ if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None):
151
+ return
152
+
153
+ fused_weight = self.regular_linear_layer.weight
154
+ dtype = fused_weight.dtype
155
+
156
+ w_up = self.w_up.cast("float32")
157
+ w_down = self.w_down.cast("float32")
158
+
159
+ unfused_weight = fused_weight.cast("float32") - (
160
+ self.lora_scale * paddle.matmul(w_down[None, :], w_up[None, :])[0]
161
+ )
162
+ self.regular_linear_layer.weight.copy_(unfused_weight.cast(dtype=dtype), False)
163
+
164
+ self.w_up = None
165
+ self.w_down = None
166
+
167
+ def forward(self, input):
168
+ if self.lora_scale is None:
169
+ self.lora_scale = 1.0
170
+ if self.lora_linear_layer is None:
171
+ return self.regular_linear_layer(input)
172
+ return self.regular_linear_layer(input) + (self.lora_scale * self.lora_linear_layer(input))
173
+
174
+
175
+ class LoRALinearLayer(nn.Layer):
176
+ r"""
177
+ A linear layer that is used with LoRA.
178
+
179
+ Parameters:
180
+ in_features (`int`):
181
+ Number of input features.
182
+ out_features (`int`):
183
+ Number of output features.
184
+ rank (`int`, `optional`, defaults to 4):
185
+ The rank of the LoRA layer.
186
+ network_alpha (`float`, `optional`, defaults to `None`):
187
+ The value of the network alpha used for stable learning and preventing underflow. This value has the same
188
+ meaning as the `--network_alpha` option in the kohya-ss trainer script. See
189
+ https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
190
+ dtype (`paddle.dtype`, `optional`, defaults to `None`):
191
+ The dtype to use for the layer's weights.
192
+ """
193
+
194
+ def __init__(
195
+ self,
196
+ in_features: int,
197
+ out_features: int,
198
+ rank: int = 4,
199
+ network_alpha: Optional[float] = None,
200
+ dtype: Optional[paddle.dtype] = None,
201
+ ):
202
+ super().__init__()
203
+ if dtype is not None:
204
+ ctx = paddle.dtype_guard(dtype)
205
+ else:
206
+ ctx = contextlib.nullcontext()
207
+ with ctx:
208
+ self.down = nn.Linear(in_features, rank, bias_attr=False)
209
+ self.up = nn.Linear(rank, out_features, bias_attr=False)
210
+
211
+ # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
212
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
213
+ self.network_alpha = network_alpha
214
+ self.rank = rank
215
+ self.out_features = out_features
216
+ self.in_features = in_features
217
+
218
+ nn.init.normal_(self.down.weight, std=1 / rank)
219
+ nn.init.zeros_(self.up.weight)
220
+
221
+ def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
222
+ orig_dtype = hidden_states.dtype
223
+ dtype = self.down.weight.dtype
224
+ down_hidden_states = self.down(hidden_states.cast(dtype))
225
+ up_hidden_states = self.up(down_hidden_states)
226
+
227
+ if self.network_alpha is not None:
228
+ up_hidden_states *= self.network_alpha / self.rank
229
+
230
+ return up_hidden_states.cast(orig_dtype)
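For orientation, here is a minimal usage sketch of the `LoRALinearLayer` defined above (assuming Paddle is installed and the class is importable from `ppdiffusers.models.lora`, this file's package path). Because `up` is zero-initialized, a freshly constructed layer contributes a zero update until it is trained.

```python
import paddle
from ppdiffusers.models.lora import LoRALinearLayer  # import path assumed from this file's location

lora = LoRALinearLayer(in_features=768, out_features=768, rank=4, network_alpha=4.0)
x = paddle.randn([2, 77, 768])   # (batch, seq_len, features)
delta = lora(x)                  # down-projection to rank 4, up-projection back, scaled by alpha / rank
print(delta.shape)               # [2, 77, 768]
print(float(delta.abs().max()))  # 0.0 -- `up` starts zero-initialized, so the update is zero before training
```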
231
+
232
+
233
+ class LoRAConv2dLayer(nn.Layer):
234
+ r"""
235
+ A convolutional layer that is used with LoRA.
236
+
237
+ Parameters:
238
+ in_features (`int`):
239
+ Number of input features.
240
+ out_features (`int`):
241
+ Number of output features.
242
+ rank (`int`, `optional`, defaults to 4):
243
+ The rank of the LoRA layer.
244
+ kernel_size (`int` or `tuple` of two `int`, `optional`, defaults to 1):
245
+ The kernel size of the convolution.
246
+ stride (`int` or `tuple` of two `int`, `optional`, defaults to 1):
247
+ The stride of the convolution.
248
+ padding (`int` or `tuple` of two `int` or `str`, `optional`, defaults to 0):
249
+ The padding of the convolution.
250
+ network_alpha (`float`, `optional`, defaults to `None`):
251
+ The value of the network alpha used for stable learning and preventing underflow. This value has the same
252
+ meaning as the `--network_alpha` option in the kohya-ss trainer script. See
253
+ https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
254
+ """
255
+
256
+ def __init__(
257
+ self,
258
+ in_features: int,
259
+ out_features: int,
260
+ rank: int = 4,
261
+ kernel_size: Union[int, Tuple[int, int]] = (1, 1),
262
+ stride: Union[int, Tuple[int, int]] = (1, 1),
263
+ padding: Union[int, Tuple[int, int], str] = 0,
264
+ network_alpha: Optional[float] = None,
265
+ ):
266
+ super().__init__()
267
+
268
+ self.down = nn.Conv2D(
269
+ in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias_attr=False
270
+ )
271
+ # according to the official kohya_ss trainer kernel_size are always fixed for the up layer
272
+ # # see: https://github.com/bmaltais/kohya_ss/blob/2accb1305979ba62f5077a23aabac23b4c37e935/networks/lora_diffusers.py#L129
273
+ self.up = nn.Conv2D(rank, out_features, kernel_size=(1, 1), stride=(1, 1), bias_attr=False)
274
+
275
+ # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
276
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
277
+ self.network_alpha = network_alpha
278
+ self.rank = rank
279
+
280
+ nn.init.normal_(self.down.weight, std=1 / rank)
281
+ nn.init.zeros_(self.up.weight)
282
+
283
+ def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
284
+ orig_dtype = hidden_states.dtype
285
+ dtype = self.down.weight.dtype
286
+
287
+ down_hidden_states = self.down(hidden_states.cast(dtype))
288
+ up_hidden_states = self.up(down_hidden_states)
289
+
290
+ if self.network_alpha is not None:
291
+ up_hidden_states *= self.network_alpha / self.rank
292
+
293
+ return up_hidden_states.cast(orig_dtype)
294
+
295
+
296
+ class LoRACompatibleConv(nn.Conv2D):
297
+ """
298
+ A convolutional layer that can be used with LoRA.
299
+ """
300
+
301
+ def __init__(self, *args, lora_layer: Optional[LoRAConv2dLayer] = None, **kwargs):
302
+ super().__init__(*args, **kwargs)
303
+ self.in_channels = self._in_channels
304
+ self.out_channels = self._out_channels
305
+ self.kernel_size = self._kernel_size
306
+ self.lora_layer = lora_layer
307
+ self.data_format = kwargs.get("data_format", "NCHW")
308
+
309
+ def set_lora_layer(self, lora_layer: Optional[LoRAConv2dLayer]):
310
+ self.lora_layer = lora_layer
311
+
312
+ def _fuse_lora(self, lora_scale: float = 1.0, safe_fusing: bool = False):
313
+ if self.lora_layer is None:
314
+ return
315
+
316
+ dtype = self.weight.dtype
317
+
318
+ w_orig = self.weight.cast("float32")
319
+ w_up = self.lora_layer.up.weight.cast("float32")
320
+ w_down = self.lora_layer.down.weight.cast("float32")
321
+
322
+ if self.lora_layer.network_alpha is not None:
323
+ w_up = w_up * self.lora_layer.network_alpha / self.lora_layer.rank
324
+
325
+ fusion = paddle.matmul(w_up.flatten(start_axis=1), w_down.flatten(start_axis=1))
326
+ fusion = fusion.reshape(w_orig.shape)
327
+ fused_weight = w_orig + (lora_scale * fusion)
328
+
329
+ if safe_fusing and paddle.isnan(fused_weight).any().item():
330
+ raise ValueError(
331
+ "This LoRA weight seems to be broken. "
332
+ f"Encountered NaN values when trying to fuse LoRA weights for {self}. "
333
+ "LoRA weights will not be fused."
334
+ )
335
+
336
+ self.weight.copy_(fused_weight.cast(dtype=dtype), False)
337
+
338
+ # we can drop the lora layer now
339
+ self.lora_layer = None
340
+
341
+ # offload the up and down matrices to CPU to not blow the memory
342
+ self.w_up = w_up.cpu()
343
+ self.w_down = w_down.cpu()
344
+ self._lora_scale = lora_scale
345
+
346
+ def _unfuse_lora(self):
347
+ if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None):
348
+ return
349
+
350
+ fused_weight = self.weight
351
+ dtype = fused_weight.dtype
352
+
353
+ w_up = self.w_up.cast("float32")
354
+ w_down = self.w_down.cast("float32")
355
+
356
+ fusion = paddle.matmul(w_up.flatten(start_axis=1), w_down.flatten(start_axis=1))
357
+ fusion = fusion.reshape(fused_weight.shape)
358
+ unfused_weight = fused_weight.cast("float32") - (self._lora_scale * fusion)
359
+ self.weight.copy_(unfused_weight.cast(dtype=dtype), False)
360
+
361
+ self.w_up = None
362
+ self.w_down = None
363
+
364
+ def forward(self, hidden_states: paddle.Tensor, scale: float = 1.0) -> paddle.Tensor:
365
+ if self.lora_layer is None:
366
+ # use the functional conv2d call here, mirroring upstream diffusers (where it avoids torch.compile graph breaks)
367
+ # see: https://github.com/huggingface/diffusers/pull/4315
368
+ return nn.functional.conv2d(
369
+ hidden_states,
370
+ self.weight,
371
+ self.bias,
372
+ self._stride,
373
+ self._padding,
374
+ self._dilation,
375
+ self._groups,
376
+ data_format=self.data_format,
377
+ )
378
+ else:
379
+ original_outputs = nn.functional.conv2d(
380
+ hidden_states,
381
+ self.weight,
382
+ self.bias,
383
+ self._stride,
384
+ self._padding,
385
+ self._dilation,
386
+ self._groups,
387
+ data_format=self.data_format,
388
+ )
389
+ return original_outputs + (scale * self.lora_layer(hidden_states))
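The conv fusion above works because the LoRA `up` kernel has shape `(out, rank, 1, 1)` and the `down` kernel has shape `(rank, in, kH, kW)`; flattening both to 2-D, multiplying, and reshaping yields a dense `(out, in, kH, kW)` kernel that can be added onto the frozen conv weight. A small self-contained sketch of that identity (plain Paddle, illustrative shapes only):

```python
import paddle

out_c, in_c, rank, k = 8, 4, 2, 3
w_up = paddle.randn([out_c, rank, 1, 1])    # LoRA "up" conv kernel
w_down = paddle.randn([rank, in_c, k, k])   # LoRA "down" conv kernel

fusion = paddle.matmul(w_up.flatten(start_axis=1), w_down.flatten(start_axis=1))
fusion = fusion.reshape([out_c, in_c, k, k])  # dense kernel to add onto the frozen conv weight
print(fusion.shape)                           # [8, 4, 3, 3]
```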
390
+
391
+
392
+ class LoRACompatibleLinear(nn.Linear):
393
+ """
394
+ A Linear layer that can be used with LoRA.
395
+ """
396
+
397
+ def __init__(self, *args, lora_layer: Optional[LoRALinearLayer] = None, **kwargs):
398
+ super().__init__(*args, **kwargs)
399
+ self.lora_layer = lora_layer
400
+
401
+ def set_lora_layer(self, lora_layer: Optional[LoRALinearLayer]):
402
+ self.lora_layer = lora_layer
403
+
404
+ def _fuse_lora(self, lora_scale: float = 1.0, safe_fusing: bool = False):
405
+ if self.lora_layer is None:
406
+ return
407
+
408
+ dtype = self.weight.dtype
409
+
410
+ w_orig = self.weight.cast("float32")
411
+ w_up = self.lora_layer.up.weight.cast("float32")
412
+ w_down = self.lora_layer.down.weight.cast("float32")
413
+
414
+ if self.lora_layer.network_alpha is not None:
415
+ w_up = w_up * self.lora_layer.network_alpha / self.lora_layer.rank
416
+
417
+ fused_weight = w_orig + (lora_scale * paddle.matmul(w_down[None, :], w_up[None, :])[0])
418
+
419
+ if safe_fusing and paddle.isnan(fused_weight).any().item():
420
+ raise ValueError(
421
+ "This LoRA weight seems to be broken. "
422
+ f"Encountered NaN values when trying to fuse LoRA weights for {self}. "
423
+ "LoRA weights will not be fused."
424
+ )
425
+ self.weight.copy_(fused_weight.cast(dtype=dtype), False)
426
+
427
+ # we can drop the lora layer now
428
+ self.lora_layer = None
429
+
430
+ # offload the up and down matrices to CPU to not blow the memory
431
+ self.w_up = w_up.cpu()
432
+ self.w_down = w_down.cpu()
433
+ self._lora_scale = lora_scale
434
+
435
+ def _unfuse_lora(self):
436
+ if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None):
437
+ return
438
+
439
+ fused_weight = self.weight
440
+ dtype = fused_weight.dtype
441
+
442
+ w_up = self.w_up.cast("float32")
443
+ w_down = self.w_down.cast("float32")
444
+
445
+ unfused_weight = fused_weight.cast("float32") - (
446
+ self._lora_scale * paddle.matmul(w_down[None, :], w_up[None, :])[0]
447
+ )
448
+ self.weight.copy_(unfused_weight.cast(dtype=dtype), False)
449
+
450
+ self.w_up = None
451
+ self.w_down = None
452
+
453
+ def forward(self, hidden_states: paddle.Tensor, scale: float = 1.0) -> paddle.Tensor:
454
+ if self.lora_layer is None:
455
+ return nn.functional.linear(
456
+ hidden_states,
457
+ self.weight,
458
+ self.bias,
459
+ )
460
+ else:
461
+ out = super().forward(hidden_states) + (scale * self.lora_layer(hidden_states))
462
+ return out
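To close out this file, a hedged sketch of how the `_fuse_lora` / `_unfuse_lora` pair on `LoRACompatibleLinear` is typically exercised (import path assumed; with a freshly initialized LoRA layer the low-rank update is zero, so the fused and unfused outputs coincide exactly):

```python
import paddle
from ppdiffusers.models.lora import LoRACompatibleLinear, LoRALinearLayer  # assumed import path

linear = LoRACompatibleLinear(64, 64)
linear.set_lora_layer(LoRALinearLayer(64, 64, rank=4))

x = paddle.randn([1, 64])
y_lora = linear(x, scale=1.0)       # base output + scale * LoRA update

linear._fuse_lora(lora_scale=1.0)   # folds scale * (down @ up) into `weight`; `lora_layer` is dropped
y_fused = linear(x)                 # plain linear path now

print(bool(paddle.allclose(y_lora, y_fused)))  # True here: the freshly initialized LoRA update is zero

linear._unfuse_lora()               # subtracts the cached low-rank product from `weight` again
```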
PaddleMIX/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py ADDED
@@ -0,0 +1,462 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+ import paddle.nn.functional as F
17
+ from paddle.distributed.fleet.utils import recompute
18
+
19
+ try:
20
+ from paddle.incubate.nn.memory_efficient_attention import ( # noqa
21
+ memory_efficient_attention,
22
+ )
23
+
24
+ _ppxformers_available = True
25
+ except ImportError:
26
+ _ppxformers_available = False
27
+
28
+ import math
29
+
30
+ from einops import rearrange, repeat
31
+
32
+ from ..utils.initializer_utils import constant_, xavier_uniform_
33
+ from .lvdm_util import (
34
+ GEGLU,
35
+ Normalize,
36
+ conv_nd,
37
+ default,
38
+ exists,
39
+ normalization,
40
+ zero_module,
41
+ )
42
+
43
+
44
+ class FeedForward(paddle.nn.Layer):
45
+ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
46
+ super().__init__()
47
+ inner_dim = int(dim * mult)
48
+ dim_out = default(dim_out, dim)
49
+ project_in = (
50
+ paddle.nn.Sequential(paddle.nn.Linear(in_features=dim, out_features=inner_dim), paddle.nn.GELU())
51
+ if not glu
52
+ else GEGLU(dim, inner_dim)
53
+ )
54
+ self.net = paddle.nn.Sequential(
55
+ project_in, paddle.nn.Dropout(p=dropout), paddle.nn.Linear(in_features=inner_dim, out_features=dim_out)
56
+ )
57
+
58
+ def forward(self, x):
59
+ return self.net(x)
60
+
61
+
62
+ class RelativePosition(paddle.nn.Layer):
63
+ """https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py"""
64
+
65
+ def __init__(self, num_units, max_relative_position):
66
+ super().__init__()
67
+ self.num_units = num_units
68
+ self.max_relative_position = max_relative_position
69
+ self.embeddings_table = paddle.nn.Parameter(paddle.empty(shape=[max_relative_position * 2 + 1, num_units]))
70
+ xavier_uniform_(self.embeddings_table)
71
+
72
+ def forward(self, length_q, length_k):
73
+ range_vec_q = paddle.arange(end=length_q)
74
+ range_vec_k = paddle.arange(end=length_k)
75
+ distance_mat = range_vec_k[(None), :] - range_vec_q[:, (None)]
76
+ distance_mat_clipped = paddle.clip(
77
+ x=distance_mat, min=-self.max_relative_position, max=self.max_relative_position
78
+ )
79
+ final_mat = distance_mat_clipped + self.max_relative_position
80
+ final_mat = final_mat.astype(dtype="int64")
81
+ embeddings = self.embeddings_table[final_mat]
82
+ return embeddings
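The lookup above is driven by a clipped pairwise-offset matrix that is shifted into a non-negative index range before indexing the learned table. A standalone sketch of just that index construction for a short sequence (the embedding table itself is omitted):

```python
import paddle

length_q = length_k = 4
max_relative_position = 2

range_q = paddle.arange(end=length_q)
range_k = paddle.arange(end=length_k)
distance = range_k[None, :] - range_q[:, None]                                # signed offsets j - i
clipped = paddle.clip(x=distance, min=-max_relative_position, max=max_relative_position)
index = (clipped + max_relative_position).astype(dtype="int64")               # shifted into [0, 2 * max + 1)
print(index.numpy())
# [[2 3 4 4]
#  [1 2 3 4]
#  [0 1 2 3]
#  [0 0 1 2]]
```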
83
+
84
+
85
+ class TemporalCrossAttention(paddle.nn.Layer):
86
+ def __init__(
87
+ self,
88
+ query_dim,
89
+ context_dim=None,
90
+ heads=8,
91
+ dim_head=64,
92
+ dropout=0.0,
93
+ use_relative_position=False,
94
+ temporal_length=None,
95
+ **kwargs
96
+ ):
97
+ super().__init__()
98
+ inner_dim = dim_head * heads
99
+ context_dim = default(context_dim, query_dim)
100
+ self.context_dim = context_dim
101
+ self.scale = dim_head**-0.5
102
+ self.heads = heads
103
+ self.temporal_length = temporal_length
104
+ self.use_relative_position = use_relative_position
105
+ self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False)
106
+ self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
107
+ self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
108
+ self.to_out = paddle.nn.Sequential(
109
+ paddle.nn.Linear(in_features=inner_dim, out_features=query_dim), paddle.nn.Dropout(p=dropout)
110
+ )
111
+ if use_relative_position:
112
+ assert temporal_length is not None
113
+ self.relative_position_k = RelativePosition(num_units=dim_head, max_relative_position=temporal_length)
114
+ self.relative_position_v = RelativePosition(num_units=dim_head, max_relative_position=temporal_length)
115
+ constant_(self.to_q.weight, 0)
116
+ constant_(self.to_k.weight, 0)
117
+ constant_(self.to_v.weight, 0)
118
+ constant_(self.to_out[0].weight, 0)
119
+ constant_(self.to_out[0].bias, 0)
120
+
121
+ def forward(self, x, context=None, mask=None):
122
+ nh = self.heads
123
+ out = x
124
+ q = self.to_q(out)
125
+ context = default(context, x)
126
+ k = self.to_k(context)
127
+ v = self.to_v(context)
128
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=nh), (q, k, v))
129
+ sim = paddle.einsum("b i d, b j d -> b i j", q, k) * self.scale
130
+ if self.use_relative_position:
131
+ len_q, len_k, len_v = q.shape[1], k.shape[1], v.shape[1]
132
+ k2 = self.relative_position_k(len_q, len_k)
133
+ sim2 = paddle.einsum("b t d, t s d -> b t s", q, k2) * self.scale
134
+ sim += sim2
135
+ if mask is not None:
136
+ max_neg_value = -1000000000.0
137
+ sim = sim + (1 - mask.astype(dtype="float32")) * max_neg_value
138
+ attn = paddle.nn.functional.softmax(sim, axis=-1)
139
+ out = paddle.einsum("b i j, b j d -> b i d", attn, v)
140
+ if self.use_relative_position:
141
+ v2 = self.relative_position_v(len_q, len_v)
142
+ out2 = paddle.einsum("b t s, t s d -> b t d", attn, v2)
143
+ out += out2
144
+ out = rearrange(out, "(b h) n d -> b n (h d)", h=nh)
145
+ return self.to_out(out)
146
+
147
+
148
+ class CrossAttention(paddle.nn.Layer):
149
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs):
150
+ super().__init__()
151
+ inner_dim = dim_head * heads
152
+ context_dim = default(context_dim, query_dim)
153
+ self.scale = dim_head**-0.5
154
+ self.heads = heads
155
+ self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False)
156
+ self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
157
+ self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
158
+ self.to_out = paddle.nn.Sequential(
159
+ paddle.nn.Linear(in_features=inner_dim, out_features=query_dim), paddle.nn.Dropout(p=dropout)
160
+ )
161
+
162
+ def forward(self, x, context=None, mask=None):
163
+ h = self.heads
164
+ # b = x.shape[0]
165
+ q = self.to_q(x)
166
+ context = default(context, x)
167
+ k = self.to_k(context)
168
+ v = self.to_v(context)
169
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
170
+ sim = paddle.einsum("b i d, b j d -> b i j", q, k) * self.scale
171
+ if exists(mask):
172
+ mask = rearrange(mask, "b ... -> b (...)")
173
+ max_neg_value = -paddle.finfo(sim.dtype).max
174
+ mask = repeat(mask, "b j -> (b h) () j", h=h)
175
+ sim = paddle.masked_fill(sim, ~mask, max_neg_value)
176
+ attn = paddle.nn.functional.softmax(sim, axis=-1)
177
+ out = paddle.einsum("b i j, b j d -> b i d", attn, v)
178
+ out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
179
+ return self.to_out(out)
180
+
181
+
182
+ class MemoryEfficientCrossAttention(paddle.nn.Layer):
183
+ def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs):
184
+ super().__init__()
185
+ # print(
186
+ # f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using {heads} heads."
187
+ # )
188
+ inner_dim = dim_head * heads
189
+ context_dim = default(context_dim, query_dim)
190
+ self.heads = heads
191
+ self.dim_head = dim_head
192
+ self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False)
193
+ self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
194
+ self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False)
195
+ self.to_out = paddle.nn.Sequential(
196
+ paddle.nn.Linear(in_features=inner_dim, out_features=query_dim), paddle.nn.Dropout(p=dropout)
197
+ )
198
+ self.attention_op = "cutlass"
199
+
200
+ def forward(self, x, context=None, mask=None):
201
+ q = self.to_q(x)
202
+ context = default(context, x)
203
+ k = self.to_k(context)
204
+ v = self.to_v(context)
205
+ b, _, _ = q.shape
206
+ q, k, v = map(lambda t: t.reshape([0, 0, self.heads, self.dim_head]), (q, k, v))
207
+ out = F.scaled_dot_product_attention_(
208
+ q,
209
+ k,
210
+ v,
211
+ attn_mask=None,
212
+ dropout_p=0.0,
213
+ attention_op=self.attention_op,
214
+ training=True,
215
+ )
216
+ if exists(mask):
217
+ raise NotImplementedError
218
+ out = out.reshape([0, 0, self.heads * self.dim_head])
219
+ return self.to_out(out)
220
+
221
+
222
+ class BasicTransformerBlockST(paddle.nn.Layer):
223
+ """
224
+ if no context is given to forward function, cross-attention defaults to self-attention
225
+ """
226
+
227
+ def __init__(
228
+ self,
229
+ dim,
230
+ n_heads,
231
+ d_head,
232
+ dropout=0.0,
233
+ context_dim=None,
234
+ gated_ff=True,
235
+ checkpoint=True,
236
+ temporal_length=None,
237
+ use_relative_position=True,
238
+ **kwargs
239
+ ):
240
+ super().__init__()
241
+ if _ppxformers_available:
242
+ self.attn1 = MemoryEfficientCrossAttention(
243
+ query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, **kwargs
244
+ )
245
+ self.attn2 = MemoryEfficientCrossAttention(
246
+ query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout, **kwargs
247
+ )
248
+ else:
249
+ self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, **kwargs)
250
+ self.attn2 = CrossAttention(
251
+ query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout, **kwargs
252
+ )
253
+
254
+ self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
255
+ self.norm1 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None)
256
+ self.norm2 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None)
257
+ self.norm3 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None)
258
+ self.checkpoint = checkpoint
259
+ self.attn1_tmp = TemporalCrossAttention(
260
+ query_dim=dim,
261
+ heads=n_heads,
262
+ dim_head=d_head,
263
+ dropout=dropout,
264
+ temporal_length=temporal_length,
265
+ use_relative_position=use_relative_position,
266
+ **kwargs,
267
+ )
268
+ self.attn2_tmp = TemporalCrossAttention(
269
+ query_dim=dim,
270
+ heads=n_heads,
271
+ dim_head=d_head,
272
+ dropout=dropout,
273
+ context_dim=None,
274
+ temporal_length=temporal_length,
275
+ use_relative_position=use_relative_position,
276
+ **kwargs,
277
+ )
278
+ self.norm4 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None)
279
+ self.norm5 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None)
280
+
281
+ def forward(self, x, context=None, **kwargs):
282
+ if self.checkpoint:
283
+ return recompute(self._forward, x, context)
284
+ else:
285
+ return self._forward(x, context)
286
+
287
+ def _forward(self, x, context=None, mask=None):
288
+ assert x.dim() == 5, f"x shape = {x.shape}"
289
+ b, c, t, h, w = x.shape
290
+ x = rearrange(x, "b c t h w -> (b t) (h w) c")
291
+ x = self.attn1(self.norm1(x)) + x
292
+ x = rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
293
+ x = rearrange(x, "b c t h w -> (b h w) t c")
294
+ x = self.attn1_tmp(self.norm4(x), mask=mask) + x
295
+ x = rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w)
296
+ x = rearrange(x, "b c t h w -> (b t) (h w) c")
297
+ if context is not None:
298
+ context_ = []
299
+ for i in range(context.shape[0]):
300
+ context_.append(context[i].unsqueeze(axis=0).tile(repeat_times=[t, 1, 1]))
301
+ context_ = paddle.concat(x=context_, axis=0)
302
+ else:
303
+ context_ = None
304
+ x = self.attn2(self.norm2(x), context=context_) + x
305
+ x = rearrange(x, "(b t) (h w) c -> b c t h w", b=b, h=h)
306
+ x = rearrange(x, "b c t h w -> (b h w) t c")
307
+ x = self.attn2_tmp(self.norm5(x), context=None, mask=mask) + x
308
+ x = self.ff(self.norm3(x)) + x
309
+ x = rearrange(x, "(b h w) t c -> b c t h w", b=b, h=h, w=w)
310
+ return x
311
+
312
+
313
+ class SpatialTemporalTransformer(paddle.nn.Layer):
314
+ """
315
+ Transformer block for video-like data (5D tensor).
316
+ First, project the input (aka embedding) with NO reshape.
317
+ Then apply standard transformer action.
318
+ The 5D -> 3D reshape operation will be done in the specific attention module.
319
+ """
320
+
321
+ def __init__(
322
+ self,
323
+ in_channels,
324
+ n_heads,
325
+ d_head,
326
+ depth=1,
327
+ dropout=0.0,
328
+ context_dim=None,
329
+ temporal_length=None,
330
+ use_relative_position=True,
331
+ **kwargs
332
+ ):
333
+ super().__init__()
334
+ self.in_channels = in_channels
335
+ inner_dim = n_heads * d_head
336
+ self.norm = Normalize(in_channels)
337
+ self.proj_in = paddle.nn.Conv3D(
338
+ in_channels=in_channels, out_channels=inner_dim, kernel_size=1, stride=1, padding=0
339
+ )
340
+ self.transformer_blocks = paddle.nn.LayerList(
341
+ sublayers=[
342
+ BasicTransformerBlockST(
343
+ inner_dim,
344
+ n_heads,
345
+ d_head,
346
+ dropout=dropout,
347
+ context_dim=context_dim,
348
+ temporal_length=temporal_length,
349
+ use_relative_position=use_relative_position,
350
+ **kwargs,
351
+ )
352
+ for d in range(depth)
353
+ ]
354
+ )
355
+ self.proj_out = zero_module(
356
+ paddle.nn.Conv3D(in_channels=inner_dim, out_channels=in_channels, kernel_size=1, stride=1, padding=0)
357
+ )
358
+
359
+ def forward(self, x, context=None, **kwargs):
360
+ assert x.dim() == 5, f"x shape = {x.shape}"
361
+ x_in = x
362
+ x = self.norm(x)
363
+ x = self.proj_in(x)
364
+ for block in self.transformer_blocks:
365
+ x = block(x, context=context, **kwargs)
366
+ x = self.proj_out(x)
367
+ return x + x_in
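As the docstring says, the block keeps the 5D layout end to end. A hedged instantiation sketch (hyperparameters are illustrative, the module path is assumed from this file's location, and whether the memory-efficient attention or recompute paths are taken depends on the local Paddle build):

```python
import paddle
from ppdiffusers.models.lvdm_attention_temporal import SpatialTemporalTransformer  # assumed import path

block = SpatialTemporalTransformer(in_channels=32, n_heads=4, d_head=8, depth=1, temporal_length=16)
x = paddle.randn([1, 32, 16, 8, 8])   # (b, c, t, h, w)
out = block(x)                        # spatial attention, temporal attention, feed-forward, residual
print(out.shape)                      # [1, 32, 16, 8, 8] -- same shape as the input
```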
368
+
369
+
370
+ class STAttentionBlock(paddle.nn.Layer):
371
+ def __init__(
372
+ self,
373
+ channels,
374
+ num_heads=1,
375
+ num_head_channels=-1,
376
+ use_checkpoint=False,
377
+ temporal_length=16,
378
+ use_relative_position=False,
379
+ ):
380
+ super().__init__()
381
+ if num_head_channels == -1:
382
+ self.num_heads = num_heads
383
+ else:
384
+ assert (
385
+ channels % num_head_channels == 0
386
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
387
+ self.num_heads = channels // num_head_channels
388
+ self.use_checkpoint = use_checkpoint
389
+ self.temporal_length = temporal_length
390
+ self.use_relative_position = use_relative_position
391
+ self.norm_s = normalization(channels)
392
+ self.norm_t = normalization(channels)
393
+ self.qkv_s = conv_nd(1, channels, channels * 3, 1)
394
+ self.qkv_t = conv_nd(1, channels, channels * 3, 1)
395
+ self.attention_s = QKVAttention(self.num_heads)
396
+ self.attention_t = QKVAttention(self.num_heads)
397
+ if use_relative_position:
398
+ self.relative_position_k = RelativePosition(
399
+ num_units=channels // self.num_heads, max_relative_position=temporal_length
400
+ )
401
+ self.relative_position_v = RelativePosition(
402
+ num_units=channels // self.num_heads, max_relative_position=temporal_length
403
+ )
404
+ self.proj_out_s = zero_module(conv_nd(1, channels, channels, 1))
405
+ self.proj_out_t = zero_module(conv_nd(1, channels, channels, 1))
406
+
407
+ def forward(self, x, mask=None):
408
+ b, c, t, h, w = x.shape
409
+ out = rearrange(x, "b c t h w -> (b t) c (h w)")
410
+ qkv = self.qkv_s(self.norm_s(out))
411
+ out = self.attention_s(qkv)
412
+ out = self.proj_out_s(out)
413
+ out = rearrange(out, "(b t) c (h w) -> b c t h w", b=b, h=h)
414
+ x += out
415
+ out = rearrange(x, "b c t h w -> (b h w) c t")
416
+ qkv = self.qkv_t(self.norm_t(out))
417
+ if self.use_relative_position:
418
+ len_q = qkv.shape[-1]
419
+ len_k, len_v = len_q, len_q
420
+ k_rp = self.relative_position_k(len_q, len_k)
421
+ v_rp = self.relative_position_v(len_q, len_v)
422
+ out = self.attention_t(qkv, rp=(k_rp, v_rp), mask=mask)
423
+ else:
424
+ out = self.attention_t(qkv, rp=None, mask=mask)
425
+ out = self.proj_out_t(out)
426
+ out = rearrange(out, "(b h w) c t -> b c t h w", b=b, h=h, w=w)
427
+ return x + out
428
+
429
+
430
+ class QKVAttention(paddle.nn.Layer):
431
+ def __init__(self, n_heads):
432
+ super().__init__()
433
+ self.n_heads = n_heads
434
+
435
+ def forward(self, qkv, rp=None, mask=None):
436
+ bs, width, length = qkv.shape
437
+ assert width % (3 * self.n_heads) == 0
438
+ ch = width // (3 * self.n_heads)
439
+ q, k, v = qkv.chunk(chunks=3, axis=1)
440
+ scale = 1 / math.sqrt(math.sqrt(ch))
441
+ weight = paddle.einsum(
442
+ "bct,bcs->bts",
443
+ (q * scale).reshape([bs * self.n_heads, ch, length]),
444
+ (k * scale).reshape([bs * self.n_heads, ch, length]),
445
+ )
446
+ if rp is not None:
447
+ k_rp, v_rp = rp
448
+ weight2 = paddle.einsum("bct,tsc->bst", (q * scale).reshape([bs * self.n_heads, ch, length]), k_rp)
449
+ weight += weight2
450
+ if mask is not None:
451
+ INF = -100000000.0
452
+ weight = weight.astype(dtype="float32")
+ weight = paddle.where(mask == 0, paddle.full_like(weight, INF), weight)  # fill masked (mask == 0) positions with INF, matching the masking convention used above
453
+ weight = paddle.nn.functional.softmax(x=weight.astype(dtype="float32"), axis=-1).astype(weight.dtype)
454
+ a = paddle.einsum("bts,bcs->bct", weight, v.reshape([bs * self.n_heads, ch, length]))
455
+ if rp is not None:
456
+ x = paddle.einsum("bts,tsc->btc", weight, v_rp)
457
+ perm_3 = list(range(x.ndim))
458
+ perm_3[1] = 2
459
+ perm_3[2] = 1
460
+ a2 = x.transpose(perm=perm_3)
461
+ a += a2
462
+ return a.reshape([bs, -1, length])
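`QKVAttention` expects the query, key, and value packed along the channel axis, exactly as produced by the 1x1 `qkv_s` / `qkv_t` convolutions in `STAttentionBlock`. A minimal sketch of that input contract (import path assumed):

```python
import paddle
from ppdiffusers.models.lvdm_attention_temporal import QKVAttention  # assumed import path

n_heads, ch, length = 2, 8, 16                       # heads, per-head channels, sequence length
attn = QKVAttention(n_heads)
qkv = paddle.randn([4, 3 * n_heads * ch, length])    # (batch, 3 * heads * ch, length)
out = attn(qkv)
print(out.shape)                                     # [4, 16, 16] -- (batch, heads * ch, length)
```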
PaddleMIX/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py ADDED
@@ -0,0 +1,713 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from abc import abstractmethod
16
+ from dataclasses import dataclass
17
+
18
+ import paddle
19
+ from einops import rearrange
20
+ from paddle.distributed.fleet.utils import recompute
21
+
22
+ from ..configuration_utils import ConfigMixin, register_to_config
23
+ from ..utils import BaseOutput
24
+ from .lvdm_attention_temporal import SpatialTemporalTransformer, STAttentionBlock
25
+ from .lvdm_util import (
26
+ avg_pool_nd,
27
+ conv_nd,
28
+ linear,
29
+ nonlinearity,
30
+ normalization,
31
+ timestep_embedding,
32
+ zero_module,
33
+ )
34
+ from .modeling_utils import ModelMixin
35
+
36
+
37
+ @dataclass
38
+ class LVDMUNet3DModelOutput(BaseOutput):
39
+ """
40
+ Args:
41
+ sample (`paddle.Tensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
42
+ Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
43
+ """
44
+
45
+ sample: paddle.Tensor
46
+
47
+
48
+ def convert_module_to_f16(x):
49
+ pass
50
+
51
+
52
+ def convert_module_to_f32(x):
53
+ pass
54
+
55
+
56
+ class TimestepBlock(paddle.nn.Layer):
57
+ """
58
+ Any module where forward() takes timestep embeddings as a second argument.
59
+ """
60
+
61
+ @abstractmethod
62
+ def forward(self, x, emb):
63
+ """
64
+ Apply the module to `x` given `emb` timestep embeddings.
65
+ """
66
+
67
+
68
+ class TimestepEmbedSequential(paddle.nn.Sequential, TimestepBlock):
69
+ """
70
+ A sequential module that passes timestep embeddings to the children that
71
+ support it as an extra input.
72
+ """
73
+
74
+ def forward(self, x, emb, context=None, **kwargs):
75
+ for layer in self:
76
+ if isinstance(layer, TimestepBlock):
77
+ x = layer(x, emb, **kwargs)
78
+ # elif isinstance(layer, STTransformerClass):
79
+ elif isinstance(layer, SpatialTemporalTransformer):
80
+ x = layer(x, context, **kwargs)
81
+ else:
82
+ x = layer(x)
83
+ return x
84
+
85
+
86
+ class Upsample(paddle.nn.Layer):
87
+ """
88
+ An upsampling layer with an optional convolution.
89
+ :param channels: channels in the inputs and outputs.
90
+ :param use_conv: a bool determining if a convolution is applied.
91
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
92
+ upsampling occurs in the inner-two dimensions.
93
+ """
94
+
95
+ def __init__(self, channels, use_conv, dims=2, out_channels=None, kernel_size_t=3, padding_t=1):
96
+ super().__init__()
97
+ self.channels = channels
98
+ self.out_channels = out_channels or channels
99
+ self.use_conv = use_conv
100
+ self.dims = dims
101
+ if use_conv:
102
+ self.conv = conv_nd(
103
+ dims, self.channels, self.out_channels, (kernel_size_t, 3, 3), padding=(padding_t, 1, 1)
104
+ )
105
+
106
+ def forward(self, x):
107
+ assert x.shape[1] == self.channels
108
+ if self.dims == 3:
109
+ x = paddle.nn.functional.interpolate(
110
+ x=x, size=(x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest", data_format="NCDHW"
111
+ )
112
+ else:
113
+ x = paddle.nn.functional.interpolate(x=x, scale_factor=2, mode="nearest")
114
+ if self.use_conv:
115
+ x = self.conv(x)
116
+ return x
117
+
118
+
119
+ class Downsample(paddle.nn.Layer):
120
+ """
121
+ A downsampling layer with an optional convolution.
122
+ :param channels: channels in the inputs and outputs.
123
+ :param use_conv: a bool determining if a convolution is applied.
124
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
125
+ downsampling occurs in the inner-two dimensions.
126
+ """
127
+
128
+ def __init__(self, channels, use_conv, dims=2, out_channels=None, kernel_size_t=3, padding_t=1):
129
+ super().__init__()
130
+ self.channels = channels
131
+ self.out_channels = out_channels or channels
132
+ self.use_conv = use_conv
133
+ self.dims = dims
134
+ stride = 2 if dims != 3 else (1, 2, 2)
135
+ if use_conv:
136
+ self.op = conv_nd(
137
+ dims, self.channels, self.out_channels, (kernel_size_t, 3, 3), stride=stride, padding=(padding_t, 1, 1)
138
+ )
139
+ else:
140
+ assert self.channels == self.out_channels
141
+ self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
142
+
143
+ def forward(self, x):
144
+ assert x.shape[1] == self.channels
145
+ return self.op(x)
146
+
147
+
148
+ class ResBlock(TimestepBlock):
149
+ """
150
+ A residual block that can optionally change the number of channels.
151
+ :param channels: the number of input channels.
152
+ :param emb_channels: the number of timestep embedding channels.
153
+ :param dropout: the rate of dropout.
154
+ :param out_channels: if specified, the number of out channels.
155
+ :param use_conv: if True and out_channels is specified, use a spatial
156
+ convolution instead of a smaller 1x1 convolution to change the
157
+ channels in the skip connection.
158
+ :param dims: determines if the signal is 1D, 2D, or 3D.
159
+ :param use_checkpoint: if True, use gradient checkpointing on this module.
160
+ :param up: if True, use this block for upsampling.
161
+ :param down: if True, use this block for downsampling.
162
+ """
163
+
164
+ def __init__(
165
+ self,
166
+ channels,
167
+ emb_channels,
168
+ dropout,
169
+ out_channels=None,
170
+ use_conv=False,
171
+ use_scale_shift_norm=False,
172
+ dims=2,
173
+ use_checkpoint=False,
174
+ up=False,
175
+ down=False,
176
+ kernel_size_t=3,
177
+ padding_t=1,
178
+ nonlinearity_type="silu",
179
+ **kwargs
180
+ ):
181
+ super().__init__()
182
+ self.channels = channels
183
+ self.emb_channels = emb_channels
184
+ self.dropout = dropout
185
+ self.out_channels = out_channels or channels
186
+ self.use_conv = use_conv
187
+ self.use_checkpoint = use_checkpoint
188
+ self.use_scale_shift_norm = use_scale_shift_norm
189
+ self.nonlinearity_type = nonlinearity_type
190
+ self.in_layers = paddle.nn.Sequential(
191
+ normalization(channels),
192
+ nonlinearity(nonlinearity_type),
193
+ conv_nd(dims, channels, self.out_channels, (kernel_size_t, 3, 3), padding=(padding_t, 1, 1)),
194
+ )
195
+ self.updown = up or down
196
+ if up:
197
+ self.h_upd = Upsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t)
198
+ self.x_upd = Upsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t)
199
+ elif down:
200
+ self.h_upd = Downsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t)
201
+ self.x_upd = Downsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t)
202
+ else:
203
+ self.h_upd = self.x_upd = paddle.nn.Identity()
204
+ self.emb_layers = paddle.nn.Sequential(
205
+ nonlinearity(nonlinearity_type),
206
+ linear(emb_channels, 2 * self.out_channels if use_scale_shift_norm else self.out_channels),
207
+ )
208
+ self.out_layers = paddle.nn.Sequential(
209
+ normalization(self.out_channels),
210
+ nonlinearity(nonlinearity_type),
211
+ paddle.nn.Dropout(p=dropout),
212
+ zero_module(
213
+ conv_nd(dims, self.out_channels, self.out_channels, (kernel_size_t, 3, 3), padding=(padding_t, 1, 1))
214
+ ),
215
+ )
216
+ if self.out_channels == channels:
217
+ self.skip_connection = paddle.nn.Identity()
218
+ elif use_conv:
219
+ self.skip_connection = conv_nd(
220
+ dims, channels, self.out_channels, (kernel_size_t, 3, 3), padding=(padding_t, 1, 1)
221
+ )
222
+ else:
223
+ self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
224
+
225
+ def forward(self, x, emb, **kwargs):
226
+ """
227
+ Apply the block to a Tensor, conditioned on a timestep embedding.
228
+ :param x: an [N x C x ...] Tensor of features.
229
+ :param emb: an [N x emb_channels] Tensor of timestep embeddings.
230
+ :return: an [N x C x ...] Tensor of outputs.
231
+ """
232
+ if self.use_checkpoint:
233
+ return recompute(self._forward, x, emb)
234
+ else:
235
+ return self._forward(x, emb)
236
+
237
+ def _forward(self, x, emb):
238
+ if self.updown:
239
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
240
+ h = in_rest(x)
241
+ h = self.h_upd(h)
242
+ x = self.x_upd(x)
243
+ h = in_conv(h)
244
+ else:
245
+ h = self.in_layers(x)
246
+ emb_out = self.emb_layers(emb).astype(h.dtype)
247
+ if emb_out.dim() == 3:
248
+ emb_out = rearrange(emb_out, "b t c -> b c t")
249
+ while len(emb_out.shape) < h.dim():
250
+ emb_out = emb_out[..., None]
251
+ if self.use_scale_shift_norm:
252
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
253
+ scale, shift = paddle.chunk(x=emb_out, chunks=2, axis=1)
254
+ h = out_norm(h) * (1 + scale) + shift
255
+ h = out_rest(h)
256
+ else:
257
+ h = h + emb_out
258
+ h = self.out_layers(h)
259
+ out = self.skip_connection(x) + h
260
+ return out
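When `use_scale_shift_norm=True`, the projected timestep embedding is split into a scale and a shift that modulate the normalized features (FiLM-style conditioning), which is what the `out_norm(h) * (1 + scale) + shift` line above implements. A standalone sketch of just that step, assuming the features have already been normalized:

```python
import paddle

channels = 8
h = paddle.randn([2, channels, 4, 16, 16])   # normalized features: (b, c, t, h, w)
emb = paddle.randn([2, 2 * channels])        # emb_layers output: 2 * out_channels when scale-shift norm is on
emb = emb[..., None, None, None]             # broadcast over (t, h, w), as in `_forward`
scale, shift = paddle.chunk(x=emb, chunks=2, axis=1)
h = h * (1 + scale) + shift                  # FiLM-style conditioning on the timestep embedding
print(h.shape)                               # [2, 8, 4, 16, 16]
```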
261
+
262
+
263
+ # def make_spatialtemporal_transformer(module_name='attention_temporal',
264
+ # class_name='SpatialTemporalTransformer'):
265
+ # module = __import__(f'.lvdm_attention_temporal', fromlist=[
266
+ # class_name])
267
+ # global STTransformerClass
268
+ # STTransformerClass = getattr(module, class_name)
269
+ # return STTransformerClass
270
+
271
+
272
+ def make_spatialtemporal_transformer(module_name="attention_temporal", class_name="SpatialTemporalTransformer"):
273
+ # Todo: Support loading more types of transformers
274
+ assert module_name == "attention_temporal" and class_name == "SpatialTemporalTransformer"
275
+ return SpatialTemporalTransformer
276
+
277
+
278
+ class LVDMUNet3DModel(ModelMixin, ConfigMixin):
279
+ """
280
+ The full UNet model with attention and timestep embedding.
281
+ :param in_channels: channels in the input Tensor.
282
+ :param model_channels: base channel count for the model.
283
+ :param out_channels: channels in the output Tensor.
284
+ :param num_res_blocks: number of residual blocks per downsample.
285
+ :param attention_resolutions: a collection of downsample rates at which
286
+ attention will take place. May be a set, list, or tuple.
287
+ For example, if this contains 4, then at 4x downsampling, attention
288
+ will be used.
289
+ :param dropout: the dropout probability.
290
+ :param channel_mult: channel multiplier for each level of the UNet.
291
+ :param conv_resample: if True, use learned convolutions for upsampling and
292
+ downsampling.
293
+ :param dims: determines if the signal is 1D, 2D, or 3D.
294
+ :param num_classes: if specified (as an int), then this model will be
295
+ class-conditional with `num_classes` classes.
296
+ :param use_checkpoint: use gradient checkpointing to reduce memory usage.
297
+ :param num_heads: the number of attention heads in each attention layer.
298
+ :param num_heads_channels: if specified, ignore num_heads and instead use
299
+ a fixed channel width per attention head.
300
+ :param num_heads_upsample: works with num_heads to set a different number
301
+ of heads for upsampling. Deprecated.
302
+ :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
303
+ :param resblock_updown: use residual blocks for up/downsampling.
304
+ """
305
+
306
+ @register_to_config
307
+ def __init__(
308
+ self,
309
+ image_size,
310
+ in_channels,
311
+ model_channels,
312
+ out_channels,
313
+ num_res_blocks,
314
+ attention_resolutions,
315
+ dropout=0,
316
+ channel_mult=(1, 2, 4, 8),
317
+ conv_resample=True,
318
+ dims=3,
319
+ num_classes=None,
320
+ use_checkpoint=False,
321
+ use_fp16=False,
322
+ num_heads=-1,
323
+ num_head_channels=-1,
324
+ num_heads_upsample=-1,
325
+ use_scale_shift_norm=False,
326
+ resblock_updown=False,
327
+ transformer_depth=1,
328
+ context_dim=None,
329
+ legacy=True,
330
+ kernel_size_t=1,
331
+ padding_t=1,
332
+ use_temporal_transformer=False,
333
+ temporal_length=None,
334
+ use_relative_position=False,
335
+ nonlinearity_type="silu",
336
+ ST_transformer_module="attention_temporal",
337
+ ST_transformer_class="SpatialTemporalTransformer",
338
+ **kwargs
339
+ ):
340
+ super().__init__()
341
+ if use_temporal_transformer:
342
+ assert (
343
+ context_dim is not None
344
+ ), "Fool!! You forgot to include the dimension of your cross-attention conditioning..."
345
+ if context_dim is not None:
346
+ assert (
347
+ use_temporal_transformer
348
+ ), "Fool!! You forgot to use the temporal transformer for your cross-attention conditioning..."
349
+ from omegaconf.listconfig import ListConfig
350
+
351
+ if type(context_dim) == ListConfig:
352
+ context_dim = list(context_dim)
353
+ if num_heads_upsample == -1:
354
+ num_heads_upsample = num_heads
355
+ if num_heads == -1:
356
+ assert num_head_channels != -1, "Either num_heads or num_head_channels has to be set"
357
+ if num_head_channels == -1:
358
+ assert num_heads != -1, "Either num_heads or num_head_channels has to be set"
359
+ self.image_size = image_size
360
+ self.in_channels = in_channels
361
+ self.model_channels = model_channels
362
+ self.out_channels = out_channels
363
+ self.num_res_blocks = num_res_blocks
364
+ self.attention_resolutions = attention_resolutions
365
+ self.dropout = dropout
366
+ self.channel_mult = channel_mult
367
+ self.conv_resample = conv_resample
368
+ self.num_classes = num_classes
369
+ self.use_checkpoint = use_checkpoint
370
+ # Todo: support customized self.dtype
371
+ # self.dtype = 'float16' if use_fp16 else 'float32'
372
+ self.num_heads = num_heads
373
+ self.num_head_channels = num_head_channels
374
+ self.num_heads_upsample = num_heads_upsample
375
+ self.use_relative_position = use_relative_position
376
+ self.temporal_length = temporal_length
377
+ self.nonlinearity_type = nonlinearity_type
378
+ time_embed_dim = model_channels * 4
379
+ self.time_embed_dim = time_embed_dim
380
+ self.time_embed = paddle.nn.Sequential(
381
+ linear(model_channels, time_embed_dim),
382
+ nonlinearity(nonlinearity_type),
383
+ linear(time_embed_dim, time_embed_dim),
384
+ )
385
+ if self.num_classes is not None:
386
+ self.label_emb = paddle.nn.Embedding(num_classes, time_embed_dim)
387
+ STTransformerClass = make_spatialtemporal_transformer(
388
+ module_name=ST_transformer_module, class_name=ST_transformer_class
389
+ )
390
+ self.input_blocks = paddle.nn.LayerList(
391
+ sublayers=[
392
+ TimestepEmbedSequential(
393
+ conv_nd(dims, in_channels, model_channels, (kernel_size_t, 3, 3), padding=(padding_t, 1, 1))
394
+ )
395
+ ]
396
+ )
397
+ self._feature_size = model_channels
398
+ input_block_chans = [model_channels]
399
+ ch = model_channels
400
+ ds = 1
401
+ for level, mult in enumerate(channel_mult):
402
+ for _ in range(num_res_blocks):
403
+ layers = [
404
+ ResBlock(
405
+ ch,
406
+ time_embed_dim,
407
+ dropout,
408
+ out_channels=mult * model_channels,
409
+ dims=dims,
410
+ use_checkpoint=use_checkpoint,
411
+ use_scale_shift_norm=use_scale_shift_norm,
412
+ kernel_size_t=kernel_size_t,
413
+ padding_t=padding_t,
414
+ nonlinearity_type=nonlinearity_type,
415
+ **kwargs,
416
+ )
417
+ ]
418
+ ch = mult * model_channels
419
+ if ds in attention_resolutions:
420
+ if num_head_channels == -1:
421
+ dim_head = ch // num_heads
422
+ else:
423
+ num_heads = ch // num_head_channels
424
+ dim_head = num_head_channels
425
+ if legacy:
426
+ dim_head = ch // num_heads if use_temporal_transformer else num_head_channels
427
+ layers.append(
428
+ STAttentionBlock(
429
+ ch,
430
+ use_checkpoint=use_checkpoint,
431
+ num_heads=num_heads,
432
+ num_head_channels=dim_head,
433
+ temporal_length=temporal_length,
434
+ use_relative_position=use_relative_position,
435
+ )
436
+ if not use_temporal_transformer
437
+ else STTransformerClass(
438
+ ch,
439
+ num_heads,
440
+ dim_head,
441
+ depth=transformer_depth,
442
+ context_dim=context_dim,
443
+ temporal_length=temporal_length,
444
+ use_relative_position=use_relative_position,
445
+ **kwargs,
446
+ )
447
+ )
448
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
449
+ self._feature_size += ch
450
+ input_block_chans.append(ch)
451
+ if level != len(channel_mult) - 1:
452
+ out_ch = ch
453
+ self.input_blocks.append(
454
+ TimestepEmbedSequential(
455
+ ResBlock(
456
+ ch,
457
+ time_embed_dim,
458
+ dropout,
459
+ out_channels=out_ch,
460
+ dims=dims,
461
+ use_checkpoint=use_checkpoint,
462
+ use_scale_shift_norm=use_scale_shift_norm,
463
+ down=True,
464
+ kernel_size_t=kernel_size_t,
465
+ padding_t=padding_t,
466
+ nonlinearity_type=nonlinearity_type,
467
+ **kwargs,
468
+ )
469
+ if resblock_updown
470
+ else Downsample(
471
+ ch,
472
+ conv_resample,
473
+ dims=dims,
474
+ out_channels=out_ch,
475
+ kernel_size_t=kernel_size_t,
476
+ padding_t=padding_t,
477
+ )
478
+ )
479
+ )
480
+ ch = out_ch
481
+ input_block_chans.append(ch)
482
+ ds *= 2
483
+ self._feature_size += ch
484
+ if num_head_channels == -1:
485
+ dim_head = ch // num_heads
486
+ else:
487
+ num_heads = ch // num_head_channels
488
+ dim_head = num_head_channels
489
+ if legacy:
490
+ dim_head = ch // num_heads if use_temporal_transformer else num_head_channels
491
+ self.middle_block = TimestepEmbedSequential(
492
+ ResBlock(
493
+ ch,
494
+ time_embed_dim,
495
+ dropout,
496
+ dims=dims,
497
+ use_checkpoint=use_checkpoint,
498
+ use_scale_shift_norm=use_scale_shift_norm,
499
+ kernel_size_t=kernel_size_t,
500
+ padding_t=padding_t,
501
+ nonlinearity_type=nonlinearity_type,
502
+ **kwargs,
503
+ ),
504
+ STAttentionBlock(
505
+ ch,
506
+ use_checkpoint=use_checkpoint,
507
+ num_heads=num_heads,
508
+ num_head_channels=dim_head,
509
+ temporal_length=temporal_length,
510
+ use_relative_position=use_relative_position,
511
+ )
512
+ if not use_temporal_transformer
513
+ else STTransformerClass(
514
+ ch,
515
+ num_heads,
516
+ dim_head,
517
+ depth=transformer_depth,
518
+ context_dim=context_dim,
519
+ temporal_length=temporal_length,
520
+ use_relative_position=use_relative_position,
521
+ **kwargs,
522
+ ),
523
+ ResBlock(
524
+ ch,
525
+ time_embed_dim,
526
+ dropout,
527
+ dims=dims,
528
+ use_checkpoint=use_checkpoint,
529
+ use_scale_shift_norm=use_scale_shift_norm,
530
+ kernel_size_t=kernel_size_t,
531
+ padding_t=padding_t,
532
+ nonlinearity_type=nonlinearity_type,
533
+ **kwargs,
534
+ ),
535
+ )
536
+ self._feature_size += ch
537
+ self.output_blocks = paddle.nn.LayerList(sublayers=[])
538
+ for level, mult in list(enumerate(channel_mult))[::-1]:
539
+ for i in range(num_res_blocks + 1):
540
+ ich = input_block_chans.pop()
541
+ layers = [
542
+ ResBlock(
543
+ ch + ich,
544
+ time_embed_dim,
545
+ dropout,
546
+ out_channels=model_channels * mult,
547
+ dims=dims,
548
+ use_checkpoint=use_checkpoint,
549
+ use_scale_shift_norm=use_scale_shift_norm,
550
+ kernel_size_t=kernel_size_t,
551
+ padding_t=padding_t,
552
+ nonlinearity_type=nonlinearity_type,
553
+ **kwargs,
554
+ )
555
+ ]
556
+ ch = model_channels * mult
557
+ if ds in attention_resolutions:
558
+ if num_head_channels == -1:
559
+ dim_head = ch // num_heads
560
+ else:
561
+ num_heads = ch // num_head_channels
562
+ dim_head = num_head_channels
563
+ if legacy:
564
+ dim_head = ch // num_heads if use_temporal_transformer else num_head_channels
565
+ layers.append(
566
+ STAttentionBlock(
567
+ ch,
568
+ use_checkpoint=use_checkpoint,
569
+ num_heads=num_heads,
570
+ num_head_channels=dim_head,
571
+ temporal_length=temporal_length,
572
+ use_relative_position=use_relative_position,
573
+ )
574
+ if not use_temporal_transformer
575
+ else STTransformerClass(
576
+ ch,
577
+ num_heads,
578
+ dim_head,
579
+ depth=transformer_depth,
580
+ context_dim=context_dim,
581
+ temporal_length=temporal_length,
582
+ use_relative_position=use_relative_position,
583
+ **kwargs,
584
+ )
585
+ )
586
+ if level and i == num_res_blocks:
587
+ out_ch = ch
588
+ layers.append(
589
+ ResBlock(
590
+ ch,
591
+ time_embed_dim,
592
+ dropout,
593
+ out_channels=out_ch,
594
+ dims=dims,
595
+ use_checkpoint=use_checkpoint,
596
+ use_scale_shift_norm=use_scale_shift_norm,
597
+ up=True,
598
+ kernel_size_t=kernel_size_t,
599
+ padding_t=padding_t,
600
+ nonlinearity_type=nonlinearity_type,
601
+ **kwargs,
602
+ )
603
+ if resblock_updown
604
+ else Upsample(
605
+ ch,
606
+ conv_resample,
607
+ dims=dims,
608
+ out_channels=out_ch,
609
+ kernel_size_t=kernel_size_t,
610
+ padding_t=padding_t,
611
+ )
612
+ )
613
+ ds //= 2
614
+ self.output_blocks.append(TimestepEmbedSequential(*layers))
615
+ self._feature_size += ch
616
+ self.out = paddle.nn.Sequential(
617
+ normalization(ch),
618
+ nonlinearity(nonlinearity_type),
619
+ zero_module(conv_nd(dims, model_channels, out_channels, (kernel_size_t, 3, 3), padding=(padding_t, 1, 1))),
620
+ )
621
+
622
+ def convert_to_fp16(self):
623
+ """
624
+ Convert the torso of the model to float16.
625
+ """
626
+ self.input_blocks.apply(fn=convert_module_to_f16)
627
+ self.middle_block.apply(fn=convert_module_to_f16)
628
+ self.output_blocks.apply(fn=convert_module_to_f16)
629
+
630
+ def convert_to_fp32(self):
631
+ """
632
+ Convert the torso of the model to float32.
633
+ """
634
+ self.input_blocks.apply(fn=convert_module_to_f32)
635
+ self.middle_block.apply(fn=convert_module_to_f32)
636
+ self.output_blocks.apply(fn=convert_module_to_f32)
637
+
638
+ def forward(self, x, timesteps=None, time_emb_replace=None, context=None, y=None, **kwargs):
639
+ """
640
+ Apply the model to an input batch.
641
+ :param x: an [N x C x ...] Tensor of inputs.
642
+ :param timesteps: a 1-D batch of timesteps.
643
+ :param context: conditioning plugged in via crossattn
644
+ :param y: an [N] Tensor of labels, if class-conditional.
645
+ :return: an [N x C x ...] Tensor of outputs.
646
+ """
647
+ # Fix 0D tensor bug
648
+ if timesteps.ndim == 0:
649
+ timesteps = timesteps.unsqueeze(0)
650
+ hs = []
651
+ if time_emb_replace is None:
652
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
653
+ emb = self.time_embed(t_emb)
654
+ else:
655
+ emb = time_emb_replace
656
+ if y is not None:
657
+ assert y.shape == (x.shape[0],)
658
+ emb = emb + self.label_emb(y)
659
+ h = x.astype(self.dtype)
660
+ for module in self.input_blocks:
661
+ h = module(h, emb, context, **kwargs)
662
+ hs.append(h)
663
+ h = self.middle_block(h, emb, context, **kwargs)
664
+ for module in self.output_blocks:
665
+ h = paddle.concat(x=[h, hs.pop()], axis=1)
666
+ h = module(h, emb, context, **kwargs)
667
+ h = h.astype(x.dtype)
668
+ h = self.out(h)
669
+
670
+ return LVDMUNet3DModelOutput(sample=h)
671
+
672
+
673
+ class FrameInterpPredUNet(LVDMUNet3DModel):
674
+ """
675
+ A UNet for unconditional generation, frame prediction, and interpolation.
676
+ May need an input `mask` to indicate the conditioning frames, as well as a noise level `s` for condition augmentation.
677
+ """
678
+
679
+ def __init__(self, image_size, in_channels, cond_aug_mode=None, *args, **kwargs):
680
+ super().__init__(image_size, in_channels, *args, **kwargs)
681
+ if cond_aug_mode == "time_embed":
682
+ self.time_embed_cond = paddle.nn.Sequential(
683
+ linear(self.model_channels, self.time_embed_dim),
684
+ nonlinearity(self.nonlinearity_type),
685
+ linear(self.time_embed_dim, self.time_embed_dim),
686
+ )
687
+ elif cond_aug_mode == "learned_embed":
688
+ pass
689
+
690
+ def forward(self, x, timesteps, context=None, y=None, s=None, mask=None, **kwargs):
691
+ # Fix 0D tensor bug
692
+ if timesteps.ndim == 0:
693
+ timesteps = timesteps.unsqueeze(0)
694
+ if s is not None:
695
+ s_emb = timestep_embedding(s, self.model_channels, repeat_only=False)
696
+ s_emb = self.time_embed_cond(s_emb)
697
+ t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
698
+ emb = self.time_embed(t_emb)
699
+ assert emb.dim() == 2
700
+ mask_ = mask[:, :, :, (0), (0)]
701
+ t = mask.shape[2]
702
+ emb_mix = (
703
+ emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) * (1 - mask_)
704
+ + s_emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) * mask_
705
+ )
706
+ assert emb_mix.dim() == 3
707
+ emb_mix = rearrange(emb_mix, "b c t -> b t c")
708
+ time_emb_replace = emb_mix
709
+ timesteps = None
710
+ else:
711
+ time_emb_replace = None
712
+ timesteps = timesteps
713
+ return super().forward(x, timesteps, time_emb_replace=time_emb_replace, context=context, y=y, **kwargs)
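A hedged, small-scale instantiation sketch of the 3D UNet defined in this file (the hyperparameters below are illustrative, not a released configuration; `kernel_size_t=3` with `padding_t=1` keeps the temporal length unchanged through every 3D convolution):

```python
import paddle
from ppdiffusers.models.lvdm_unet_3d import LVDMUNet3DModel  # assumed import path

unet = LVDMUNet3DModel(
    image_size=32, in_channels=4, model_channels=32, out_channels=4,
    num_res_blocks=1, attention_resolutions=(4,), channel_mult=(1, 2, 4),
    num_heads=4, kernel_size_t=3, padding_t=1, temporal_length=16,
)
latents = paddle.randn([1, 4, 16, 32, 32])   # (b, c, t, h, w) video latents
t = paddle.to_tensor([500])                  # one diffusion timestep per batch element
out = unet(latents, timesteps=t)
print(out.sample.shape)                      # [1, 4, 16, 32, 32]
```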
PaddleMIX/ppdiffusers/ppdiffusers/models/lvdm_util.py ADDED
@@ -0,0 +1,296 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ import random
17
+ from inspect import isfunction
18
+
19
+ import numpy as np
20
+ import paddle
21
+ from einops import repeat
22
+
23
+
24
+ def make_interp_mask_with_bothsidescond(t, device, n_interp1, n_interp2):
25
+ """1: cond frames
26
+ 0: generated frames
27
+ """
28
+ mask = paddle.zeros(shape=[t])
29
+ mask[:n_interp1] = 1
30
+ mask[t - n_interp2 :] = 1
31
+ return mask
32
+
33
+
34
+ def make_interp_mask_with_framestride(t, device, frame_stride):
35
+ """1: cond frames
36
+ 0: generated frames
37
+ """
38
+ mask = paddle.zeros(shape=[t])
39
+ for i in range(0, t, frame_stride):
40
+ mask[i] = 1
41
+ return mask
42
+
43
+
44
+ def random_temporal_masking(
45
+ input_shape, p_interp, p_pred, device, n_interp1=1, n_interp2=1, n_prevs=[1], interp_frame_stride=None
46
+ ):
47
+ """return mask for masking input, where 1 indicates given real image as condition,
48
+ 0 indicates noisy samples.
49
+ """
50
+ if p_pred == 0.0:
51
+ n_prevs = None
52
+ b, c, t, h, w = input_shape
53
+ mask = paddle.zeros(shape=[b, t])
54
+ for i in range(b):
55
+ r = random.random()
56
+ if r < p_interp:
57
+ if interp_frame_stride is not None:
58
+ mask[i] = make_interp_mask_with_framestride(t, device, interp_frame_stride)
59
+ else:
60
+ mask[i] = make_interp_mask_with_bothsidescond(t, device, n_interp1, n_interp2)
61
+ elif p_interp <= r < p_interp + p_pred:
62
+ n_pred = random.choice(n_prevs)
63
+ mask[i, :n_pred] = 1
64
+ else:
65
+ pass
66
+ mask = mask.unsqueeze(axis=1).unsqueeze(axis=3).unsqueeze(axis=4)
67
+ mask = mask.tile(repeat_times=[1, 1, 1, h, w])
68
+ return mask
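For intuition, the two temporal mask patterns produced above look as follows on a single clip; the snippet is a standalone illustration (the frame count and number of conditioning frames are arbitrary choices), not part of this commit.

```python
# Illustration of the temporal masks above (1 = conditioning frame, 0 = generated frame).
import paddle

t = 8
interp = paddle.zeros([t])
interp[:1] = 1             # both-sides interpolation: the first frame is given ...
interp[t - 1:] = 1         # ... and so is the last frame
pred = paddle.zeros([t])
pred[:2] = 1               # frame prediction: the first two frames are given

print(interp.numpy())      # [1. 0. 0. 0. 0. 0. 0. 1.]
print(pred.numpy())        # [1. 1. 0. 0. 0. 0. 0. 0.]
```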
69
+
70
+
71
+ def make_beta_schedule(schedule, n_timestep, linear_start=0.0001, linear_end=0.02, cosine_s=0.008):
72
+ if schedule == "linear":
73
+ betas = (
74
+ paddle.linspace(start=linear_start**0.5, stop=linear_end**0.5, num=n_timestep).astype("float64") ** 2
75
+ )
76
+ elif schedule == "cosine":
77
+ timesteps = paddle.arange(end=n_timestep + 1).astype("float64") / n_timestep + cosine_s
78
+ alphas = timesteps / (1 + cosine_s) * np.pi / 2
79
+ alphas = paddle.cos(x=alphas).pow(y=2)
80
+ alphas = alphas / alphas[0]
81
+ betas = 1 - alphas[1:] / alphas[:-1]
82
+ betas = paddle.clip(betas, min=0, max=0.999)
83
+ elif schedule == "sqrt_linear":
84
+ betas = paddle.linspace(start=linear_start, stop=linear_end, num=n_timestep).astype("float64")
85
+ elif schedule == "sqrt":
86
+ betas = paddle.linspace(start=linear_start, stop=linear_end, num=n_timestep).astype("float64") ** 0.5
87
+ else:
88
+ raise ValueError(f"schedule '{schedule}' unknown.")
89
+ return betas.numpy()
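As a quick check of the `linear` branch above, the standalone sketch below reproduces the schedule (betas spaced linearly in square-root space, then squared) and derives the cumulative alphas used downstream; the hyperparameters mirror the function defaults.

```python
# Standalone reproduction of the "linear" beta schedule and its cumulative alphas.
import paddle

linear_start, linear_end, n_timestep = 1e-4, 2e-2, 1000
betas = paddle.linspace(linear_start**0.5, linear_end**0.5, n_timestep).astype("float64") ** 2
alphas_cumprod = paddle.cumprod(1.0 - betas, dim=0)
print(float(betas[0]), float(betas[-1]))   # 1e-04 and 2e-02, the schedule endpoints
print(float(alphas_cumprod[-1]))           # close to 0: nearly pure noise at the last timestep
```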
90
+
91
+
92
+ def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
93
+ if ddim_discr_method == "uniform":
94
+ c = num_ddpm_timesteps // num_ddim_timesteps
95
+ ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
96
+ elif ddim_discr_method == "quad":
97
+ ddim_timesteps = (np.linspace(0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps) ** 2).astype(int)
98
+ else:
99
+ raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
100
+ steps_out = ddim_timesteps + 1
101
+ if verbose:
102
+ print(f"Selected timesteps for ddim sampler: {steps_out}")
103
+ return steps_out
104
+
105
+
106
+ def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
107
+ alphas = alphacums[ddim_timesteps]
108
+ alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
109
+ sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
110
+ if verbose:
111
+ print(f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}")
112
+ print(
113
+ f"For the chosen value of eta, which is {eta}, this results in the following sigma_t schedule for ddim sampler {sigmas}"
114
+ )
115
+ return sigmas, alphas, alphas_prev
116
+
117
+
118
+ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
119
+ """
120
+ Create a beta schedule that discretizes the given alpha_t_bar function,
121
+ which defines the cumulative product of (1-beta) over time from t = [0,1].
122
+ :param num_diffusion_timesteps: the number of betas to produce.
123
+ :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
124
+ produces the cumulative product of (1-beta) up to that
125
+ part of the diffusion process.
126
+ :param max_beta: the maximum beta to use; use values lower than 1 to
127
+ prevent singularities.
128
+ """
129
+ betas = []
130
+ for i in range(num_diffusion_timesteps):
131
+ t1 = i / num_diffusion_timesteps
132
+ t2 = (i + 1) / num_diffusion_timesteps
133
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
134
+ return np.array(betas)
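A typical `alpha_bar` passed to this helper is a squared-cosine schedule; the short standalone check below derives betas directly from that definition (a tiny `T` is used just so the values can be printed).

```python
# Standalone check: betas from a squared-cosine alpha_bar, mirroring the helper above.
import math
import numpy as np

def alpha_bar(t):
    # cumulative product of (1 - beta) as a smooth function of t in [0, 1]
    return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

T = 8
betas = np.array([min(1 - alpha_bar((i + 1) / T) / alpha_bar(i / T), 0.999) for i in range(T)])
print(betas.round(4))   # small near t=0, growing (and capped at 0.999) toward t=T
```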
135
+
136
+
137
+ def extract_into_tensor(a, t, x_shape):
138
+ b, *_ = t.shape
139
+ out = a.take_along_axis(axis=-1, indices=t)
140
+ return out.reshape([b, *((1,) * (len(x_shape) - 1))])
141
+
142
+
143
+ def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
144
+ """
145
+ Create sinusoidal timestep embeddings.
146
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
147
+ These may be fractional.
148
+ :param dim: the dimension of the output.
149
+ :param max_period: controls the minimum frequency of the embeddings.
150
+ :return: an [N x dim] Tensor of positional embeddings.
151
+ """
152
+ if not repeat_only:
153
+ half = dim // 2
154
+ freqs = paddle.exp(
155
+ x=(-math.log(max_period) * paddle.arange(start=0, end=half).astype("float32") / half).astype("float32")
156
+ )
157
+ args = timesteps[:, (None)].astype(dtype="float32") * freqs[None]
158
+ embedding = paddle.concat(x=[paddle.cos(x=args), paddle.sin(x=args)], axis=-1)
159
+ if dim % 2:
160
+ embedding = paddle.concat(x=[embedding, paddle.zeros_like(x=embedding[:, :1])], axis=-1)
161
+ else:
162
+ embedding = repeat(timesteps, "b -> b d", d=dim)
163
+ return embedding
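The sinusoidal embedding above can be reproduced in a few lines; the sketch below mirrors the math for a small even `dim` (all values are illustrative).

```python
# Standalone reproduction of the sinusoidal timestep embedding for an even dim.
import math
import paddle

timesteps = paddle.to_tensor([0.0, 10.0, 999.0])
dim, max_period = 8, 10000
half = dim // 2
freqs = paddle.exp(-math.log(max_period) * paddle.arange(0, half).astype("float32") / half)
args = timesteps[:, None].astype("float32") * freqs[None]
emb = paddle.concat([paddle.cos(args), paddle.sin(args)], axis=-1)
print(emb.shape)   # [3, 8]: cosine features followed by sine features
```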
164
+
165
+
166
+ def zero_module(module):
167
+ """
168
+ Zero out the parameters of a module and return it.
169
+ """
170
+ for p in module.parameters():
171
+ p.detach().zero_()
172
+ return module
173
+
174
+
175
+ def scale_module(module, scale):
176
+ """
177
+ Scale the parameters of a module and return it.
178
+ """
179
+ for p in module.parameters():
180
+ p.detach().scale_(scale=scale)
181
+ return module
182
+
183
+
184
+ def mean_flat(tensor):
185
+ """
186
+ Take the mean over all non-batch dimensions.
187
+ """
188
+ return tensor.mean(axis=list(range(1, len(tensor.shape))))
189
+
190
+
191
+ def normalization(channels):
192
+ """
193
+ Make a standard normalization layer.
194
+ :param channels: number of input channels.
195
+ :return: an nn.Module for normalization.
196
+ """
197
+ return GroupNorm32(32, channels)
198
+
199
+
200
+ def Normalize(in_channels):
201
+ return paddle.nn.GroupNorm(
202
+ num_groups=32, num_channels=in_channels, epsilon=1e-06, weight_attr=None, bias_attr=None
203
+ )
204
+
205
+
206
+ def identity(*args, **kwargs):
207
+ return paddle.nn.Identity()
208
+
209
+
210
+ def nonlinearity(type="silu"):
211
+ if type == "silu":
212
+ return paddle.nn.Silu()
213
+ elif type == "leaky_relu":
214
+ return paddle.nn.LeakyReLU()
215
+
216
+
217
+ class GEGLU(paddle.nn.Layer):
218
+ def __init__(self, dim_in, dim_out):
219
+ super().__init__()
220
+ self.proj = paddle.nn.Linear(in_features=dim_in, out_features=dim_out * 2)
221
+
222
+ def forward(self, x):
223
+ x, gate = self.proj(x).chunk(chunks=2, axis=-1)
224
+ return x * paddle.nn.functional.gelu(x=gate)
225
+
226
+
227
+ class SiLU(paddle.nn.Layer):
228
+ def forward(self, x):
229
+ return x * paddle.nn.functional.sigmoid(x=x)
230
+
231
+
232
+ class GroupNorm32(paddle.nn.GroupNorm):
233
+ def forward(self, x):
234
+ return super().forward(x.astype(dtype="float32")).astype(x.dtype)
235
+
236
+
237
+ def conv_nd(dims, *args, **kwargs):
238
+ """
239
+ Create a 1D, 2D, or 3D convolution module.
240
+ """
241
+ if dims == 1:
242
+ return paddle.nn.Conv1D(*args, **kwargs)
243
+ elif dims == 2:
244
+ return paddle.nn.Conv2D(*args, **kwargs)
245
+ elif dims == 3:
246
+ return paddle.nn.Conv3D(*args, **kwargs)
247
+ raise ValueError(f"unsupported dimensions: {dims}")
248
+
249
+
250
+ def linear(*args, **kwargs):
251
+ """
252
+ Create a linear module.
253
+ """
254
+ return paddle.nn.Linear(*args, **kwargs)
255
+
256
+
257
+ def avg_pool_nd(dims, *args, **kwargs):
258
+ """
259
+ Create a 1D, 2D, or 3D average pooling module.
260
+ """
261
+ if dims == 1:
262
+ return paddle.nn.AvgPool1D(*args, **kwargs, exclusive=False)
263
+ elif dims == 2:
264
+ return paddle.nn.AvgPool2D(*args, **kwargs, exclusive=False)
265
+ elif dims == 3:
266
+ return paddle.nn.AvgPool3D(*args, **kwargs, exclusive=False)
267
+ raise ValueError(f"unsupported dimensions: {dims}")
268
+
269
+
270
+ def noise_like(shape, device, repeat=False):
271
+ repeat_noise = lambda: paddle.randn(shape=(1, *shape[1:])).tile(
272
+ repeat_times=[shape[0], *((1,) * (len(shape) - 1))]
273
+ )
274
+ noise = lambda: paddle.randn(shape=shape)
275
+ return repeat_noise() if repeat else noise()
276
+
277
+
278
+ def init_(tensor):
279
+ dim = tensor.shape[-1]
280
+ std = 1 / math.sqrt(dim)
281
+ tensor.uniform_(min=-std, max=std)
282
+ return tensor
283
+
284
+
285
+ def exists(val):
286
+ return val is not None
287
+
288
+
289
+ def uniq(arr):
290
+ return {el: True for el in arr}.keys()
291
+
292
+
293
+ def default(val, d):
294
+ if exists(val):
295
+ return val
296
+ return d() if isfunction(d) else d
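Putting a few of these helpers together, the sketch below builds a tiny norm → activation → 3D-conv block for a video tensor. It assumes the file is importable as `ppdiffusers.models.lvdm_util` (its path in this repo) and uses toy tensor sizes; it is not part of the commit.

```python
# Quick sanity check of the dimension-dispatch helpers above, with toy sizes.
import paddle
from ppdiffusers.models.lvdm_util import conv_nd, nonlinearity, normalization

block = paddle.nn.Sequential(
    normalization(64),                             # GroupNorm32: computes in float32, casts back
    nonlinearity("silu"),
    conv_nd(3, 64, 64, kernel_size=3, padding=1),  # dims=3 -> Conv3D for video tensors
)
x = paddle.randn([1, 64, 4, 32, 32])               # [batch, channels, frames, height, width]
print(block(x).shape)                              # [1, 64, 4, 32, 32]
```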
PaddleMIX/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py ADDED
@@ -0,0 +1,117 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import paddle.nn as nn
17
+
18
+ from ..utils import logging
19
+ from ..utils.import_utils import is_torch_available
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+ if is_torch_available():
24
+ import torch
25
+
26
+
27
+ def convert_pytorch_state_dict_to_paddle(self: nn.Layer, pt_state_dict, sub_layer=None):
28
+ # Step 1: Find Linear layers whose weights need to be transposed
29
+ linear_need_transpose = []
30
+ for k, v in self.named_sublayers(include_self=True):
31
+ if isinstance(v, nn.Linear):
32
+ if sub_layer is not None and sub_layer not in k:
33
+ continue
34
+ linear_need_transpose.append(k + ".weight")
35
+
36
+ ignore_keys = ["position_ids", ".num_batches_tracked"]
37
+ ptname2pdname = {
38
+ # torch.nn.BatchNorm2d -> paddle.nn.BatchNorm2D
39
+ ".running_var": "._variance",
40
+ ".running_mean": "._mean",
41
+ }
42
+ # Some parameter names must be renamed to match Paddle naming conventions
43
+ keys = list(pt_state_dict.keys())
44
+ for pt_key in keys:
45
+ pt_tensor = pt_state_dict.pop(pt_key)
46
+ # only convert sub_layer state dict
47
+ if sub_layer is not None and sub_layer not in pt_key:
48
+ continue
49
+ # (0) ignore_keys
50
+ if any(i in pt_key for i in ignore_keys):
51
+ continue
52
+ # (1) transpose linear
53
+ if pt_key in linear_need_transpose and pt_tensor.ndim == 2:
54
+ pt_tensor = pt_tensor.T
55
+ # (2) 0d tensor -> 1d tensor
56
+ # if pt_tensor.ndim == 0:
57
+ # pt_tensor = pt_tensor.reshape((1,))
58
+ # (3) name mapping
59
+ for old_key, new_key in ptname2pdname.items():
60
+ pt_key = pt_key.replace(old_key, new_key)
61
+
62
+ pt_state_dict[pt_key] = pt_tensor
63
+ return pt_state_dict
64
+
65
+
66
+ def convert_paddle_state_dict_to_pytorch(self: nn.Layer, pd_state_dict):
67
+ # Step 1: Find Linear layers whose weights need to be transposed
68
+ linear_need_transpose = []
69
+ for k, v in self.named_sublayers(include_self=True):
70
+ if isinstance(v, nn.Linear):
71
+ linear_need_transpose.append(k + ".weight")
72
+
73
+ ignore_keys = ["position_ids"]
74
+ ptname2pdname = {
75
+ # torch.nn.BatchNorm2d -> paddle.nn.BatchNorm2D
76
+ ".running_var": "._variance",
77
+ ".running_mean": "._mean",
78
+ }
79
+ keys = list(pd_state_dict.keys())
80
+ detect_bfloat16 = False
81
+ for pd_key in keys:
82
+ pd_tensor = pd_state_dict.pop(pd_key)
83
+ # (0) ignore_keys
84
+ if any(i in pd_key for i in ignore_keys):
85
+ continue
86
+ # (1) transpose linear
87
+ if pd_key in linear_need_transpose and pd_tensor.ndim == 2:
88
+ pd_tensor = pd_tensor.T
89
+ # TODO maybe not true
90
+ # (2) 1d tensor -> 0d tensor
91
+ if pd_tensor.ndim == 1:
92
+ pd_tensor = pd_tensor.squeeze()
93
+ # (3) name mapping
94
+ for old_key, new_key in ptname2pdname.items():
95
+ pd_key = pd_key.replace(new_key, old_key)
96
+
97
+ pd_tensor = np.ascontiguousarray(pd_tensor)
98
+
99
+ if is_torch_available():
100
+ if pd_tensor.dtype in ["uint16", np.uint16]:
101
+ pd_tensor = pd_tensor.astype(np.float32)
102
+ pd_state_dict[pd_key] = torch.from_numpy(pd_tensor).to(torch.bfloat16)
103
+ else:
104
+ pd_state_dict[pd_key] = torch.from_numpy(pd_tensor)
105
+ else:
106
+ if pd_tensor.dtype in ["uint16", np.uint16]:
107
+ pd_tensor = pd_tensor.astype(np.float16)
108
+ detect_bfloat16 = True
109
+ pd_state_dict[pd_key] = pd_tensor
110
+
111
+ if detect_bfloat16:
112
+ logger.warning(
113
+ "PyTorch is not installed, so we cannot save as `bfloat16` tensor. "
114
+ "To ensure the model can still be loaded, we will save it as `float16` tensor instead. "
115
+ "Please note that this may affect the precision of the saved model."
116
+ )
117
+ return pd_state_dict
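The central convention in both converters above is the Linear weight transpose: PyTorch stores `nn.Linear` weights as `[out_features, in_features]`, while Paddle stores them as `[in_features, out_features]`. The minimal standalone check below (toy shapes, no torch required) shows why the transpose is necessary; it is illustrative only.

```python
# Why Linear weights are transposed above: a torch-style [out, in] weight must be
# transposed to Paddle's [in, out] layout to produce the same outputs.
import numpy as np
import paddle

pt_weight = np.random.randn(4, 3).astype("float32")    # torch-style weight, shape [out=4, in=3]
layer = paddle.nn.Linear(3, 4)                          # paddle weight has shape [in=3, out=4]
layer.weight.set_value(paddle.to_tensor(pt_weight.T))   # transpose when importing
layer.bias.set_value(paddle.zeros([4]))

x = np.random.randn(2, 3).astype("float32")
np.testing.assert_allclose(
    x @ pt_weight.T,                                    # reference: torch-style matmul
    layer(paddle.to_tensor(x)).numpy(),                 # paddle layer with transposed weight
    rtol=1e-5, atol=1e-6,
)
```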
PaddleMIX/ppdiffusers/ppdiffusers/models/modeling_utils.py ADDED
@@ -0,0 +1,1356 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import gc
18
+ import json
19
+ import os
20
+ from collections import OrderedDict
21
+ from contextlib import ExitStack
22
+ from functools import partial
23
+ from typing import Any, Callable, List, Optional, Union
24
+
25
+ import numpy as np
26
+ from aistudio_sdk.hub import create_repo as aistudio_create_repo
27
+ from huggingface_hub import create_repo
28
+ from paddle import nn
29
+ from tqdm import tqdm
30
+
31
+ from ..utils import (
32
+ CONFIG_NAME,
33
+ DIFFUSERS_CACHE,
34
+ FROM_AISTUDIO,
35
+ FROM_DIFFUSERS,
36
+ FROM_HF_HUB,
37
+ HF_HUB_OFFLINE,
38
+ LOW_CPU_MEM_USAGE_DEFAULT,
39
+ MIN_PEFT_VERSION,
40
+ PADDLE_SAFETENSORS_WEIGHTS_NAME,
41
+ PADDLE_SAFETENSORS_WEIGHTS_NAME_INDEX_NAME,
42
+ PADDLE_WEIGHTS_NAME,
43
+ PADDLE_WEIGHTS_NAME_INDEX_NAME,
44
+ PPDIFFUSERS_CACHE,
45
+ TO_DIFFUSERS,
46
+ TORCH_SAFETENSORS_WEIGHTS_NAME,
47
+ TORCH_SAFETENSORS_WEIGHTS_NAME_INDEX_NAME,
48
+ TORCH_WEIGHTS_NAME,
49
+ TORCH_WEIGHTS_NAME_INDEX_NAME,
50
+ _add_variant,
51
+ _get_model_file,
52
+ check_peft_version,
53
+ deprecate,
54
+ get_checkpoint_shard_files,
55
+ is_paddle_available,
56
+ is_paddle_version,
57
+ is_paddlenlp_available,
58
+ is_safetensors_available,
59
+ is_torch_available,
60
+ logging,
61
+ smart_load,
62
+ )
63
+ from ..version import VERSION as __version__
64
+ from .modeling_pytorch_paddle_utils import (
65
+ convert_paddle_state_dict_to_pytorch,
66
+ convert_pytorch_state_dict_to_paddle,
67
+ )
68
+
69
+ logger = logging.get_logger(__name__)
70
+
71
+ if is_torch_available():
72
+ import torch
73
+
74
+ if is_safetensors_available():
75
+ from safetensors import safe_open
76
+ from safetensors.numpy import save_file as np_safe_save_file
77
+
78
+ if is_torch_available():
79
+ from safetensors.torch import save_file as torch_safe_save_file
80
+
81
+ if is_paddle_available():
82
+ import paddle
83
+
84
+ if is_paddlenlp_available():
85
+ try:
86
+ from paddlenlp.transformers.model_utils import no_init_weights
87
+ except ImportError:
88
+ from ..utils.paddle_utils import no_init_weights
89
+ from paddlenlp.transformers.model_utils import shard_checkpoint
90
+
91
+
92
+ def faster_set_state_dict(model, state_dict):
93
+ # the state_dict will be destroyed (consumed in place).
94
+ with paddle.no_grad():
95
+ for k, v in model.state_dict(use_hook=False).items():
96
+ if k in state_dict:
97
+ v_new = state_dict.pop(k)
98
+ # with device_guard(): do not use a device guard here
99
+ if isinstance(v_new, np.ndarray):
100
+ v_new = paddle.Tensor(v_new, zero_copy=True)
101
+ if v.dtype != v_new.dtype:
102
+ v_new = v_new.cast(v.dtype)
103
+ v.copy_(v_new, False)
104
+ else:
105
+ if (hasattr(v, "_is_initialized") and not v._is_initialized()) or "undefined" in str(v.place):
106
+ v.initialize()
107
+ # logger.warning(f"key {k} is not in state_dict. And it is lazy tensor. We will initialize it.")
108
+
109
+
110
+ class ContextManagers:
111
+ """
112
+ Wrapper for `contextlib.ExitStack` which enters a collection of context managers. Adaptation of `ContextManagers`
113
+ in the `fastcore` library.
114
+ """
115
+
116
+ def __init__(self, context_managers):
117
+ self.context_managers = context_managers
118
+ self.stack = ExitStack()
119
+
120
+ def __enter__(self):
121
+ for context_manager in self.context_managers:
122
+ self.stack.enter_context(context_manager)
123
+
124
+ def __exit__(self, *args, **kwargs):
125
+ self.stack.__exit__(*args, **kwargs)
126
+
127
+
128
+ def get_parameter_device(parameter: nn.Layer):
129
+ try:
130
+ # TODO https://github.com/huggingface/diffusers/compare/v0.15.0...v0.16.0#diff-6a3b9a08c1d37dbc341131632415fea800af242a84fb31f1bcd40d725e2eeeebR64
131
+ return next(parameter.named_parameters())[1].place
132
+ except StopIteration:
133
+ try:
134
+ return next(parameter.named_buffers())[1].place
135
+ except StopIteration:
136
+ return paddle.get_device()
137
+
138
+
139
+ def get_parameter_dtype(parameter: nn.Layer) -> paddle.dtype:
140
+ try:
141
+ # TODO https://github.com/huggingface/diffusers/compare/v0.15.0...v0.16.0#diff-6a3b9a08c1d37dbc341131632415fea800af242a84fb31f1bcd40d725e2eeeebR80
142
+ return next(parameter.named_parameters())[1].dtype
143
+ except StopIteration:
144
+ try:
145
+ return next(parameter.named_buffers())[1].dtype
146
+ except StopIteration:
147
+ return parameter._dtype
148
+
149
+
150
+ def load_state_dict(
151
+ checkpoint_file: Union[str, os.PathLike], state_dict, tensor_parallel_split_mapping=None, ignore_keys=None
152
+ ):
153
+ """
154
+ Reads a checkpoint file (Paddle, PyTorch, or safetensors), fills `state_dict` in place, and returns the detected data format.
155
+ """
156
+ if tensor_parallel_split_mapping is None:
157
+ tensor_parallel_split_mapping = {}
158
+ data_format = "pd"
159
+ if checkpoint_file.endswith(".safetensors") and is_safetensors_available():
160
+ # Check format of the archive
161
+ with safe_open(checkpoint_file, framework="np") as f:
162
+ metadata = f.metadata()
163
+ if metadata is None:
164
+ metadata = {}
165
+ if metadata.get("format", "pt") not in ["pt", "pd", "np"]:
166
+ raise OSError(
167
+ f"The safetensors archive passed at {checkpoint_file} does not contain valid metadata. Make sure "
168
+ "you save your model with the `save_pretrained` method."
169
+ )
170
+ data_format = metadata.get("format", "pt")
171
+ with safe_open(checkpoint_file, framework="np") as f:
172
+ for key in f.keys():
173
+ need_continue = False
174
+ if ignore_keys is not None:
175
+ for ik in ignore_keys:
176
+ if key.startswith(ik):
177
+ logger.info("Deleting key {} from state_dict.".format(key))
178
+ need_continue = True
179
+ break
180
+ if need_continue:
181
+ continue
182
+ if key in tensor_parallel_split_mapping:
183
+ py_safe_slice_ = f.get_slice(key)
184
+ weight = tensor_parallel_split_mapping[key](py_safe_slice_)
185
+ else:
186
+ weight = f.get_tensor(key)
187
+ state_dict[key] = paddle.Tensor(weight, zero_copy=True)
188
+
189
+ else:
190
+ if any(checkpoint_file.endswith(suffix) for suffix in [".pt", ".pth", ".bin", ".ckpt"]):
191
+ data_format = "pt"
192
+
193
+ tmp_state_dict = smart_load(checkpoint_file, return_numpy=True)
194
+ for key in list(tmp_state_dict.keys()):
195
+ need_continue = False
196
+ if ignore_keys is not None:
197
+ for ik in ignore_keys:
198
+ if key.startswith(ik):
199
+ logger.info("Deleting key {} from state_dict.".format(key))
200
+ need_continue = True
201
+ break
202
+ if need_continue:
203
+ continue
204
+ # with device_guard():
205
+ t = tmp_state_dict.pop(key)
206
+ if key in tensor_parallel_split_mapping:
207
+ t = tensor_parallel_split_mapping[key](t)
208
+ if isinstance(t, dict):
209
+ if len(t) == 0:
210
+ state_dict[key] = {}
211
+ else:
212
+ state_dict[key] = paddle.Tensor(t, zero_copy=True)
213
+
214
+ return data_format
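The safetensors branch above relies on the `format` metadata tag written at save time. The standalone sketch below (a toy one-tensor checkpoint; the file name `toy.safetensors` is just an example) shows the round-trip this loader expects; it is not part of the commit.

```python
# Round-trip the safetensors loader expects: save with a "format" metadata tag,
# reopen with framework="np", and wrap each array as a zero-copy paddle.Tensor.
import numpy as np
import paddle
from safetensors import safe_open
from safetensors.numpy import save_file

save_file({"w": np.ones((2, 2), dtype="float32")}, "toy.safetensors", metadata={"format": "pd"})

state_dict = {}
with safe_open("toy.safetensors", framework="np") as f:
    assert f.metadata().get("format") == "pd"          # the tag checked by load_state_dict
    for key in f.keys():
        state_dict[key] = paddle.Tensor(f.get_tensor(key), zero_copy=True)
print(state_dict["w"].shape)   # [2, 2]
```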
215
+
216
+
217
+ class ModelMixin(nn.Layer):
218
+ r"""
219
+ Base class for all models.
220
+
221
+ [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and
222
+ saving models.
223
+
224
+ - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`].
225
+ """
226
+
227
+ config_name = CONFIG_NAME
228
+ _automatically_saved_args = ["_ppdiffusers_version", "_class_name", "_name_or_path"]
229
+ _supports_gradient_checkpointing = False
230
+ _keys_to_ignore_on_load_unexpected = None
231
+ _pp_peft_config_loaded = False
232
+
233
+ def __init__(self):
234
+ super().__init__()
235
+
236
+ def __getattr__(self, name: str) -> Any:
237
+ """The only reason we overwrite `getattr` here is to gracefully deprecate accessing
238
+ config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite
239
+ __getattr__ here in addition so that we don't trigger `nn.Layer`'s __getattr__':
240
+ https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
241
+ """
242
+
243
+ is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
244
+ is_attribute = name in self.__dict__
245
+
246
+ if is_in_config and not is_attribute:
247
+ deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'."
248
+ deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3)
249
+ return self._internal_dict[name]
250
+
251
+ # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
252
+ return super().__getattr__(name)
253
+
254
+ @property
255
+ def is_gradient_checkpointing(self) -> bool:
256
+ """
257
+ Whether gradient checkpointing is activated for this model or not.
258
+ """
259
+ return any(
260
+ hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing
261
+ for m in self.sublayers(include_self=True)
262
+ )
263
+
264
+ def enable_gradient_checkpointing(self) -> None:
265
+ """
266
+ Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or
267
+ *checkpoint activations* in other frameworks).
268
+ """
269
+ if not self._supports_gradient_checkpointing:
270
+ raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
271
+ self.apply(partial(self._set_gradient_checkpointing, value=True))
272
+
273
+ def disable_gradient_checkpointing(self) -> None:
274
+ """
275
+ Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or
276
+ *checkpoint activations* in other frameworks).
277
+ """
278
+ if self._supports_gradient_checkpointing:
279
+ self.apply(partial(self._set_gradient_checkpointing, value=False))
280
+
281
+ def set_use_memory_efficient_attention_xformers(self, valid: bool, attention_op: Optional[str] = None) -> None:
282
+ # Recursively walk through all the children.
283
+ # Any children which exposes the set_use_memory_efficient_attention_xformers method
284
+ # gets the message
285
+ def fn_recursive_set_mem_eff(module: nn.Layer):
286
+ if hasattr(module, "set_use_memory_efficient_attention_xformers"):
287
+ module.set_use_memory_efficient_attention_xformers(valid, attention_op)
288
+
289
+ for child in module.children():
290
+ fn_recursive_set_mem_eff(child)
291
+
292
+ for module in self.children():
293
+ if isinstance(module, nn.Layer):
294
+ fn_recursive_set_mem_eff(module)
295
+
296
+ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[str] = None) -> None:
297
+ r"""
298
+ Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
299
+
300
+ When this option is enabled, you should observe lower GPU memory usage and a potential speed up during
301
+ inference. Speed up during training is not guaranteed.
302
+
303
+ <Tip warning={true}>
304
+
305
+ ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes
306
+ precedence.
307
+
308
+ </Tip>
309
+
310
+ Parameters:
311
+ attention_op (`str`, *optional*):
312
+ Override the default `None`
313
+
314
+ Examples:
315
+
316
+ ```py
317
+ >>> import paddle
318
+ >>> from ppdiffusers import UNet2DConditionModel
319
+
320
+ >>> model = UNet2DConditionModel.from_pretrained(
321
+ ... "stabilityai/stable-diffusion-2-1", subfolder="unet", paddle_dtype=paddle.float16
322
+ ... )
323
+ >>> model.enable_xformers_memory_efficient_attention(attention_op="auto")
324
+ ```
325
+ """
326
+ self.set_use_memory_efficient_attention_xformers(True, attention_op)
327
+
328
+ def disable_xformers_memory_efficient_attention(self) -> None:
329
+ r"""
330
+ Disable memory efficient attention as implemented in xformers.
331
+ """
332
+ self.set_use_memory_efficient_attention_xformers(False)
333
+
334
+ def add_adapter(self, adapter_config, adapter_name: str = "default") -> None:
335
+ r"""
336
+ Adds a new adapter to the current model for training. If no adapter name is passed, a default name is assigned
337
+ to the adapter to follow the convention of the PEFT library.
338
+
339
+ If you are not familiar with adapters and PEFT methods, we invite you to read more about them in the PEFT
340
+ [documentation](https://huggingface.co/docs/peft).
341
+
342
+ Args:
343
+ adapter_config (`[~peft.PeftConfig]`):
344
+ The configuration of the adapter to add; supported adapters are non-prefix tuning and adaption prompt
345
+ methods.
346
+ adapter_name (`str`, *optional*, defaults to `"default"`):
347
+ The name of the adapter to add. If no name is passed, a default name is assigned to the adapter.
348
+ """
349
+ check_peft_version(min_version=MIN_PEFT_VERSION)
350
+
351
+ from ppdiffusers.peft import PeftConfig, inject_adapter_in_model
352
+
353
+ if not self._pp_peft_config_loaded:
354
+ self._pp_peft_config_loaded = True
355
+ elif adapter_name in self.peft_config:
356
+ raise ValueError(f"Adapter with name {adapter_name} already exists. Please use a different name.")
357
+
358
+ if not isinstance(adapter_config, PeftConfig):
359
+ raise ValueError(
360
+ f"adapter_config should be an instance of PeftConfig. Got {type(adapter_config)} instead."
361
+ )
362
+
363
+ # Unlike transformers, here we don't need to retrieve the name_or_path of the unet as the loading logic is
364
+ # handled by the `load_lora_layers` or `LoraLoaderMixin`. Therefore we set it to `None` here.
365
+ adapter_config.base_model_name_or_path = None
366
+ inject_adapter_in_model(adapter_config, self, adapter_name)
367
+ self.set_adapter(adapter_name)
368
+
369
+ def set_adapter(self, adapter_name: Union[str, List[str]]) -> None:
370
+ """
371
+ Sets a specific adapter by forcing the model to only use that adapter and disables the other adapters.
372
+
373
+ If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
374
+ official documentation: https://huggingface.co/docs/peft
375
+
376
+ Args:
377
+ adapter_name (Union[str, List[str]])):
378
+ The list of adapters to set or the adapter name in case of single adapter.
379
+ """
380
+ check_peft_version(min_version=MIN_PEFT_VERSION)
381
+
382
+ if not self._pp_peft_config_loaded:
383
+ raise ValueError("No adapter loaded. Please load an adapter first.")
384
+
385
+ if isinstance(adapter_name, str):
386
+ adapter_name = [adapter_name]
387
+
388
+ missing = set(adapter_name) - set(self.peft_config)
389
+ if len(missing) > 0:
390
+ raise ValueError(
391
+ f"Following adapter(s) could not be found: {', '.join(missing)}. Make sure you are passing the correct adapter name(s)."
392
+ f" current loaded adapters are: {list(self.peft_config.keys())}"
393
+ )
394
+
395
+ from ppdiffusers.peft.tuners.tuners_utils import BaseTunerLayer
396
+
397
+ _adapters_has_been_set = False
398
+
399
+ for _, module in self.named_sublayers(include_self=True):
400
+ if isinstance(module, BaseTunerLayer):
401
+ if hasattr(module, "set_adapter"):
402
+ module.set_adapter(adapter_name)
403
+ # Previous versions of PEFT does not support multi-adapter inference
404
+ elif not hasattr(module, "set_adapter") and len(adapter_name) != 1:
405
+ raise ValueError(
406
+ "You are trying to set multiple adapters and you have a PEFT version that does not support multi-adapter inference. Please upgrade to the latest version of PEFT."
407
+ " `pip install -U peft` or `pip install -U git+https://github.com/huggingface/peft.git`"
408
+ )
409
+ else:
410
+ module.active_adapter = adapter_name
411
+ _adapters_has_been_set = True
412
+
413
+ if not _adapters_has_been_set:
414
+ raise ValueError(
415
+ "Did not succeed in setting the adapter. Please make sure you are using a model that supports adapters."
416
+ )
417
+
418
+ def disable_adapters(self) -> None:
419
+ r"""
420
+ Disable all adapters attached to the model and fallback to inference with the base model only.
421
+
422
+ If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
423
+ official documentation: https://huggingface.co/docs/peft
424
+ """
425
+ check_peft_version(min_version=MIN_PEFT_VERSION)
426
+
427
+ if not self._pp_peft_config_loaded:
428
+ raise ValueError("No adapter loaded. Please load an adapter first.")
429
+
430
+ from ppdiffusers.peft.tuners.tuners_utils import BaseTunerLayer
431
+
432
+ for _, module in self.named_sublayers(include_self=True):
433
+ if isinstance(module, BaseTunerLayer):
434
+ if hasattr(module, "enable_adapters"):
435
+ module.enable_adapters(enabled=False)
436
+ else:
437
+ # support for older PEFT versions
438
+ module.disable_adapters = True
439
+
440
+ def enable_adapters(self) -> None:
441
+ """
442
+ Enable adapters that are attached to the model. The model will use `self.active_adapters()` to retrieve the
443
+ list of adapters to enable.
444
+
445
+ If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
446
+ official documentation: https://huggingface.co/docs/peft
447
+ """
448
+ check_peft_version(min_version=MIN_PEFT_VERSION)
449
+
450
+ if not self._pp_peft_config_loaded:
451
+ raise ValueError("No adapter loaded. Please load an adapter first.")
452
+
453
+ from ppdiffusers.peft.tuners.tuners_utils import BaseTunerLayer
454
+
455
+ for _, module in self.named_sublayers(include_self=True):
456
+ if isinstance(module, BaseTunerLayer):
457
+ if hasattr(module, "enable_adapters"):
458
+ module.enable_adapters(enabled=True)
459
+ else:
460
+ # support for older PEFT versions
461
+ module.disable_adapters = False
462
+
463
+ def active_adapters(self) -> List[str]:
464
+ """
465
+ Gets the current list of active adapters of the model.
466
+
467
+ If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT
468
+ official documentation: https://huggingface.co/docs/peft
469
+ """
470
+ check_peft_version(min_version=MIN_PEFT_VERSION)
471
+
472
+ if not self._pp_peft_config_loaded:
473
+ raise ValueError("No adapter loaded. Please load an adapter first.")
474
+
475
+ from ppdiffusers.peft.tuners.tuners_utils import BaseTunerLayer
476
+
477
+ for _, module in self.named_sublayers(include_self=True):
478
+ if isinstance(module, BaseTunerLayer):
479
+ return module.active_adapter
480
+
481
+ def save_pretrained(
482
+ self,
483
+ save_directory: Union[str, os.PathLike],
484
+ is_main_process: bool = True,
485
+ save_function: Optional[Callable] = None,
486
+ max_shard_size: Union[int, str] = "10GB",
487
+ safe_serialization: bool = True,
488
+ variant: Optional[str] = None,
489
+ push_to_hub: bool = False,
490
+ save_to_aistudio: bool = False,
491
+ to_diffusers: Optional[bool] = None,
492
+ **kwargs,
493
+ ):
494
+ """
495
+ Save a model and its configuration file to a directory so that it can be reloaded using the
496
+ [`~models.ModelMixin.from_pretrained`] class method.
497
+
498
+ Arguments:
499
+ save_directory (`str` or `os.PathLike`):
500
+ Directory to save a model and its configuration file to. Will be created if it doesn't exist.
501
+ is_main_process (`bool`, *optional*, defaults to `True`):
502
+ Whether the process calling this is the main process or not. Useful during distributed training and you
503
+ need to call this function on all processes. In this case, set `is_main_process=True` only on the main
504
+ process to avoid race conditions.
505
+ save_function (`Callable`):
506
+ The function to use to save the state dictionary. Useful during distributed training when you need to
507
+ replace `torch.save` with another method. Can be configured with the environment variable
508
+ `DIFFUSERS_SAVE_MODE`.
509
+ safe_serialization (`bool`, *optional*, defaults to `True`):
510
+ Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
511
+ variant (`str`, *optional*):
512
+ If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
513
+ push_to_hub (`bool`, *optional*, defaults to `False`):
514
+ Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
515
+ repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
516
+ namespace).
517
+ kwargs (`Dict[str, Any]`, *optional*):
518
+ Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
519
+ """
520
+ # distributed kwargs
521
+ merge_tensor_parallel = kwargs.get("merge_tensor_parallel", False)
522
+ tensor_parallel_degree = kwargs.pop("tensor_parallel_degree", 1)
523
+
524
+ if to_diffusers is None:
525
+ to_diffusers = TO_DIFFUSERS
526
+
527
+ if os.path.isfile(save_directory):
528
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
529
+ return
530
+
531
+ os.makedirs(save_directory, exist_ok=True)
532
+
533
+ # create repo
534
+ commit_message = kwargs.pop("commit_message", None)
535
+ private = kwargs.pop("private", False)
536
+ create_pr = kwargs.pop("create_pr", False)
537
+ token = kwargs.pop("token", None)
538
+ token_kwargs = {}
539
+ if token is not None:
540
+ token_kwargs["token"] = token
541
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
542
+ license = kwargs.pop("license", "creativeml-openrail-m")
543
+ exist_ok = kwargs.pop("exist_ok", True)
544
+
545
+ if push_to_hub:
546
+ repo_id = create_repo(repo_id, exist_ok=True, private=private, **token_kwargs).repo_id
547
+
548
+ if save_to_aistudio:
549
+ assert "/" in repo_id, "Please specify the repo id in format of `user_id/repo_name`"
550
+ res = aistudio_create_repo(repo_id=repo_id, private=private, license=license, **token_kwargs)
551
+ if "error_code" in res:
552
+ if res["error_code"] == 10003 and exist_ok:
553
+ logger.info(
554
+ f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False"
555
+ )
556
+ else:
557
+ logger.error(
558
+ f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}"
559
+ )
560
+ else:
561
+ logger.info(f"Successfully created repo {repo_id}")
562
+
563
+ # Only save the model itself if we are using distributed training
564
+ model_to_save = self
565
+
566
+ # Attach architecture to the config
567
+ # Save the config
568
+ if is_main_process:
569
+ model_to_save.save_config(save_directory, to_diffusers=to_diffusers)
570
+
571
+ # Save the model
572
+ state_dict = model_to_save.state_dict()
573
+ if tensor_parallel_degree > 1:
574
+ if merge_tensor_parallel:
575
+ config_to_save = model_to_save._internal_dict
576
+ state_dict = model_to_save.merge_tensor_parallel(state_dict, config_to_save)
577
+ tensor_parallel_degree = 1
578
+ if paddle.distributed.fleet.get_hybrid_communicate_group().get_model_parallel_rank() != 0:
579
+ logger.info("Saving with merge_tensor_parallel; ranks with tensor_parallel_rank > 0 do not need to save")
580
+ return
581
+
582
+ if to_diffusers:
583
+ if not is_torch_available() and not safe_serialization:
584
+ safe_serialization = True
585
+ logger.warning(
586
+ "PyTorch is not installed, and `safe_serialization` is currently set to `False`. "
587
+ "To ensure proper model saving, we will automatically set `safe_serialization=True`. "
588
+ "If you want to keep `safe_serialization=False`, please make sure PyTorch is installed."
589
+ )
590
+ if safe_serialization:
591
+ save_index_file = TORCH_SAFETENSORS_WEIGHTS_NAME_INDEX_NAME
592
+ weights_name = TORCH_SAFETENSORS_WEIGHTS_NAME
593
+ if is_torch_available():
594
+ save_function = partial(torch_safe_save_file, metadata={"format": "pt"})
595
+ else:
596
+ save_function = partial(np_safe_save_file, metadata={"format": "pt"})
597
+ else:
598
+ save_index_file = TORCH_WEIGHTS_NAME_INDEX_NAME
599
+ weights_name = TORCH_WEIGHTS_NAME
600
+ save_function = torch.save
601
+ else:
602
+ if safe_serialization:
603
+ save_index_file = PADDLE_SAFETENSORS_WEIGHTS_NAME_INDEX_NAME
604
+ weights_name = PADDLE_SAFETENSORS_WEIGHTS_NAME
605
+ save_function = partial(np_safe_save_file, metadata={"format": "pd"})
606
+ else:
607
+ save_index_file = PADDLE_WEIGHTS_NAME_INDEX_NAME
608
+ weights_name = PADDLE_WEIGHTS_NAME
609
+ save_function = paddle.save
610
+
611
+ weights_name = _add_variant(weights_name, variant)
612
+
613
+ # Save model
614
+ shards, index = shard_checkpoint(
615
+ state_dict,
616
+ max_shard_size=max_shard_size,
617
+ weights_name=weights_name,
618
+ )
619
+ # Save the model
620
+ for shard_file, shard in shards.items():
621
+ for k in list(shard.keys()):
622
+ if isinstance(shard[k], paddle.Tensor):
623
+ shard[k] = np.ascontiguousarray(shard.pop(k).cpu().numpy())
624
+ if to_diffusers:
625
+ convert_paddle_state_dict_to_pytorch(self, shard)
626
+ save_function(shard, os.path.join(save_directory, shard_file))
627
+
628
+ # Save the model
629
+ if index is None:
630
+ logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}")
631
+
632
+ else:
633
+ save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant))
634
+ # Save the index as well
635
+ with open(save_index_file, "w", encoding="utf-8") as f:
636
+ content = json.dumps(index, indent=2) + "\n"
637
+ f.write(content)
638
+ logger.info(
639
+ f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
640
+ f"split into {len(shards)} checkpoint shards. You can find where each parameter has been saved in the "
641
+ f"index located at {save_index_file}."
642
+ )
643
+ # upload to aistudio or huggingface hub
644
+ if save_to_aistudio:
645
+ self._upload_folder_aistudio(
646
+ save_directory,
647
+ repo_id,
648
+ commit_message=commit_message,
649
+ **token_kwargs,
650
+ )
651
+ if push_to_hub:
652
+ self._upload_folder(
653
+ save_directory,
654
+ repo_id,
655
+ commit_message=commit_message,
656
+ create_pr=create_pr,
657
+ **token_kwargs,
658
+ )
659
+
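A typical call to the method above, shown as a hypothetical usage sketch: the repo id mirrors the docstring example later in this file, while the output directory and sharding settings are placeholders.

```python
# Hypothetical usage of save_pretrained: reload a UNet and re-save it locally as
# variant-tagged safetensors weights (paths and sizes are placeholders).
from ppdiffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
unet.save_pretrained(
    "./my_unet",
    safe_serialization=True,   # safetensors shards, plus an index file when sharding kicks in
    max_shard_size="5GB",
    variant="fp16",            # weight filenames get a ".fp16." variant infix
)
```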
660
+ @classmethod
661
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
662
+ r"""
663
+ Instantiate a pretrained Paddle model from a pretrained model configuration.
664
+
665
+ The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To
666
+ train the model, set it back in training mode with `model.train()`.
667
+
668
+ Parameters:
669
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
670
+ Can be either:
671
+
672
+ - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
673
+ the Hub.
674
+ - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
675
+ with [`~ModelMixin.save_pretrained`].
676
+
677
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
678
+ Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
679
+ is not used.
680
+ paddle_dtype (`str` or `paddle.dtype`, *optional*):
681
+ Override the default `paddle.dtype` and load the model with another dtype. If `"auto"` is passed, the
682
+ dtype is automatically derived from the model's weights.
683
+ force_download (`bool`, *optional*, defaults to `False`):
684
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
685
+ cached versions if they exist.
686
+ resume_download (`bool`, *optional*, defaults to `False`):
687
+ Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
688
+ incompletely downloaded files are deleted.
689
+ proxies (`Dict[str, str]`, *optional*):
690
+ A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
691
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
692
+ output_loading_info (`bool`, *optional*, defaults to `False`):
693
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
694
+ local_files_only(`bool`, *optional*, defaults to `False`):
695
+ Whether to only load local model weights and configuration files or not. If set to `True`, the model
696
+ won't be downloaded from the Hub.
697
+ use_auth_token (`str` or *bool*, *optional*):
698
+ The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
699
+ `diffusers-cli login` (stored in `~/.huggingface`) is used.
700
+ revision (`str`, *optional*, defaults to `"main"`):
701
+ The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
702
+ allowed by Git.
703
+ from_flax (`bool`, *optional*, defaults to `False`):
704
+ Load the model weights from a Flax checkpoint save file.
705
+ subfolder (`str`, *optional*, defaults to `""`):
706
+ The subfolder location of a model file within a larger model repository on the Hub or locally.
707
+ mirror (`str`, *optional*):
708
+ Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
709
+ guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
710
+ information.
711
+ device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
712
+ A map that specifies where each submodule should go. It doesn't need to be defined for each
713
+ parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
714
+ same device.
715
+
716
+ Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
717
+ more information about each option see [designing a device
718
+ map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
719
+ max_memory (`Dict`, *optional*):
720
+ A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
721
+ each GPU and the available CPU RAM if unset.
722
+ offload_folder (`str` or `os.PathLike`, *optional*):
723
+ The path to offload weights if `device_map` contains the value `"disk"`.
724
+ offload_state_dict (`bool`, *optional*):
725
+ If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
726
+ the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
727
+ when there is some disk offload.
728
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if PaddlePaddle version >= 2.5.0 else `False`):
729
+ Speed up model loading by only loading the pretrained weights and not initializing the weights. This also
730
+ tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
731
+ Only supported for PaddlePaddle >= 2.5.0. If you are using an older version of PaddlePaddle, setting this
732
+ argument to `True` will raise an error.
733
+ variant (`str`, *optional*):
734
+ Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when
735
+ loading `from_flax`.
736
+ use_safetensors (`bool`, *optional*, defaults to `None`):
737
+ If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the
738
+ `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors`
739
+ weights. If set to `False`, `safetensors` weights are not loaded.
740
+
741
+ <Tip>
742
+
743
+ To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
744
+ `huggingface-cli login`. You can also activate the special
745
+ ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
746
+ firewalled environment.
747
+
748
+ </Tip>
749
+
750
+ Example:
751
+
752
+ ```py
753
+ from ppdiffusers import UNet2DConditionModel
754
+
755
+ unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
756
+ ```
757
+
758
+ If you get the error message below, you need to finetune the weights for your downstream task:
759
+
760
+ ```bash
761
+ Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
762
+ - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
763
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
764
+ ```
765
+ """
766
+ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
767
+ from_aistudio = kwargs.pop("from_aistudio", FROM_AISTUDIO)
768
+ cache_dir = kwargs.pop("cache_dir", None)
769
+ if cache_dir is None:
770
+ if from_aistudio:
771
+ cache_dir = None # TODO, check aistudio cache
772
+ elif from_hf_hub:
773
+ cache_dir = DIFFUSERS_CACHE
774
+ else:
775
+ cache_dir = PPDIFFUSERS_CACHE
776
+ ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
777
+ force_download = kwargs.pop("force_download", False)
778
+ from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS)
779
+ resume_download = kwargs.pop("resume_download", False)
780
+ proxies = kwargs.pop("proxies", None)
781
+ output_loading_info = kwargs.pop("output_loading_info", False)
782
+ local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
783
+ use_auth_token = kwargs.pop("use_auth_token", None)
784
+ revision = kwargs.pop("revision", None)
785
+ paddle_dtype = kwargs.pop("paddle_dtype", None)
786
+ subfolder = kwargs.pop("subfolder", "")
787
+ if subfolder is None:
788
+ subfolder = ""
789
+ device_map = kwargs.pop("device_map", None)
790
+ max_memory = kwargs.pop("max_memory", None)
791
+ offload_folder = kwargs.pop("offload_folder", None)
792
+ offload_state_dict = kwargs.pop("offload_state_dict", False)
793
+ low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT)
794
+ variant = kwargs.pop("variant", None)
795
+ use_safetensors = kwargs.pop("use_safetensors", None)
796
+ ignore_keys = kwargs.pop("ignore_keys", [])
797
+
798
+ # distributed kwargs
799
+ tensor_parallel_degree = kwargs.pop("tensor_parallel_degree", 1)
800
+
801
+ if use_safetensors is None:
802
+ use_safetensors = True
803
+
804
+ if low_cpu_mem_usage and (not is_paddle_version(">=", "2.5.0") and not is_paddle_version("==", "0.0.0")):
805
+ raise NotImplementedError(
806
+ "Low memory initialization requires paddlepaddle-gpu >= 2.5.0. Please either update your PaddlePaddle version or set"
807
+ " `low_cpu_mem_usage=False`."
808
+ )
809
+
810
+ # Load config if we don't provide a configuration
811
+ config_path = pretrained_model_name_or_path
812
+
813
+ user_agent = {
814
+ "ppdiffusers": __version__,
815
+ "file_type": "model",
816
+ "framework": "pytorch" if from_diffusers else "paddle",
817
+ }
818
+
819
+ # load config
820
+ config, unused_kwargs, commit_hash, config_file = cls.load_config(
821
+ config_path,
822
+ cache_dir=cache_dir,
823
+ return_unused_kwargs=True,
824
+ return_commit_hash=True,
825
+ return_config_file=True,
826
+ force_download=force_download,
827
+ resume_download=resume_download,
828
+ proxies=proxies,
829
+ local_files_only=local_files_only,
830
+ use_auth_token=use_auth_token,
831
+ revision=revision,
832
+ subfolder=subfolder,
833
+ device_map=device_map,
834
+ max_memory=max_memory,
835
+ offload_folder=offload_folder,
836
+ offload_state_dict=offload_state_dict,
837
+ user_agent=user_agent,
838
+ from_hf_hub=from_hf_hub,
839
+ from_aistudio=from_aistudio,
840
+ **kwargs,
841
+ )
842
+ index_file = None
843
+
844
+ variant_list = [variant]
845
+ if None not in variant_list:
846
+ variant_list.append(None)
847
+ if "fp16" not in variant_list:
848
+ variant_list.append("fp16")
849
+ if "fp32" not in variant_list:
850
+ variant_list.append("fp32")
851
+ for v_index, variant in enumerate(variant_list):
852
+ try:
853
+ if use_safetensors:
854
+ try:
855
+ # is sharded model
856
+ index_file = _get_model_file(
857
+ pretrained_model_name_or_path,
858
+ weights_name=_add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME_INDEX_NAME, variant)
859
+ if from_diffusers
860
+ else _add_variant(PADDLE_SAFETENSORS_WEIGHTS_NAME_INDEX_NAME, variant),
861
+ cache_dir=cache_dir,
862
+ force_download=force_download,
863
+ resume_download=resume_download,
864
+ proxies=proxies,
865
+ local_files_only=local_files_only,
866
+ use_auth_token=use_auth_token,
867
+ revision=revision,
868
+ subfolder=subfolder,
869
+ user_agent=user_agent,
870
+ commit_hash=commit_hash,
871
+ from_hf_hub=from_hf_hub,
872
+ from_aistudio=from_aistudio,
873
+ )
874
+ except Exception:
875
+ index_file = None
876
+ if index_file is None:
877
+ # is sharded model
878
+ try:
879
+ index_file = _get_model_file(
880
+ pretrained_model_name_or_path,
881
+ weights_name=_add_variant(TORCH_WEIGHTS_NAME_INDEX_NAME, variant)
882
+ if from_diffusers
883
+ else _add_variant(PADDLE_WEIGHTS_NAME_INDEX_NAME, variant),
884
+ cache_dir=cache_dir,
885
+ force_download=force_download,
886
+ resume_download=resume_download,
887
+ proxies=proxies,
888
+ local_files_only=local_files_only,
889
+ use_auth_token=use_auth_token,
890
+ revision=revision,
891
+ subfolder=subfolder,
892
+ user_agent=user_agent,
893
+ commit_hash=commit_hash,
894
+ from_hf_hub=from_hf_hub,
895
+ from_aistudio=from_aistudio,
896
+ )
897
+ except Exception:
898
+ index_file = None
899
+ is_sharded = index_file is not None
900
+
901
+ if is_sharded:
902
+ resolved_model_files, sharded_metadata = get_checkpoint_shard_files(
903
+ pretrained_model_name_or_path,
904
+ index_filename=index_file,
905
+ cache_dir=cache_dir,
906
+ force_download=force_download,
907
+ resume_download=resume_download,
908
+ proxies=proxies,
909
+ local_files_only=local_files_only,
910
+ use_auth_token=use_auth_token,
911
+ revision=revision,
912
+ subfolder=subfolder,
913
+ user_agent=user_agent,
914
+ commit_hash=commit_hash,
915
+ from_hf_hub=from_hf_hub,
916
+ from_aistudio=from_aistudio,
917
+ )
918
+ if not isinstance(resolved_model_files, list):
919
+ resolved_model_files = [resolved_model_files]
920
+ else:
921
+ # load model
922
+ model_file = None
923
+ if use_safetensors:
924
+ try:
925
+ model_file = _get_model_file(
926
+ pretrained_model_name_or_path,
927
+ weights_name=_add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, variant)
928
+ if from_diffusers
929
+ else _add_variant(PADDLE_SAFETENSORS_WEIGHTS_NAME, variant),
930
+ cache_dir=cache_dir,
931
+ force_download=force_download,
932
+ resume_download=resume_download,
933
+ proxies=proxies,
934
+ local_files_only=local_files_only,
935
+ use_auth_token=use_auth_token,
936
+ revision=revision,
937
+ subfolder=subfolder,
938
+ user_agent=user_agent,
939
+ commit_hash=commit_hash,
940
+ from_hf_hub=from_hf_hub,
941
+ from_aistudio=from_aistudio,
942
+ )
943
+ except Exception:
944
+ model_file = None
945
+ pass
946
+ if model_file is None:
947
+ model_file = _get_model_file(
948
+ pretrained_model_name_or_path,
949
+ weights_name=_add_variant(TORCH_WEIGHTS_NAME, variant)
950
+ if from_diffusers
951
+ else _add_variant(PADDLE_WEIGHTS_NAME, variant),
952
+ cache_dir=cache_dir,
953
+ force_download=force_download,
954
+ resume_download=resume_download,
955
+ proxies=proxies,
956
+ local_files_only=local_files_only,
957
+ use_auth_token=use_auth_token,
958
+ revision=revision,
959
+ subfolder=subfolder,
960
+ user_agent=user_agent,
961
+ commit_hash=commit_hash,
962
+ from_hf_hub=from_hf_hub,
963
+ from_aistudio=from_aistudio,
964
+ )
965
+ resolved_model_files = [model_file]
966
+ except Exception as e: # NOQA
967
+ logger.warning(
968
+ f"Unable to load the `variant={variant}` of the model from `{pretrained_model_name_or_path}`! "
969
+ "Please make sure the specified variant exists and is correct."
970
+ )
971
+ resolved_model_files = []
972
+ if len(resolved_model_files) > 0:
973
+ if v_index > 0:
974
+ name = (
975
+ ", ".join([config_file, index_file] + resolved_model_files)
976
+ if index_file is not None
977
+ else ", ".join(resolved_model_files)
978
+ )
979
+ logger.warning(
980
+ f"Proceeding to load the `variant={variant}` of the model with the resolved model files: {name}. "
981
+ "Please note that this might not be the desired variant."
982
+ )
983
+ break
984
+ variant_str = ", ".join(map(lambda x: "`" + str(x) + "`", variant_list))
985
+ assert len(resolved_model_files) > 0, (
986
+ f"We are attempting to load the variant in [{variant_str}]. "
987
+ f"But unfortunately, no model files were found in the path {pretrained_model_name_or_path}. "
988
+ "Please check if the provided path is correct and ensure that it contains the necessary model files. "
989
+ "If the issue persists, consider redownloading the model files or contacting the model provider for assistance."
990
+ )
991
+ init_contexts = []
992
+
993
+ dtype = paddle.float32 if paddle_dtype is None else paddle_dtype
994
+ init_contexts.append(paddle.dtype_guard(dtype))
995
+
996
+ if low_cpu_mem_usage:
997
+ # Instantiate model.
998
+ init_contexts.append(no_init_weights(_enable=True))
999
+ if hasattr(paddle, "LazyGuard"):
1000
+ init_contexts.append(paddle.LazyGuard())
1001
+
1002
+ with ContextManagers(init_contexts):
1003
+ model = cls.from_config(config, **unused_kwargs)
1004
+
1005
+ # (westfish) 2024/04/01:
1006
+ # Tensor parallel is only supported for models that inherit from `ConversionMixin`
1007
+ if tensor_parallel_degree > 1:
1008
+ from paddlenlp.transformers.conversion_utils import ConversionMixin
1009
+
1010
+ if not issubclass(cls, ConversionMixin):
1011
+ raise NotImplementedError(
1012
+ "Tensor parallel is only supported for models that inherit from `ConversionMixin`."
1013
+ )
1014
+ if len(resolved_model_files) > 1:
1015
+ raise NotImplementedError("Tensor parallel is not supported for multiple shards yet.")
1016
+ tmp_state_dict = smart_load(resolved_model_files[0], return_numpy=True)
1017
+ tensor_parallel_split_mapping = cls.get_tensor_parallel_convert_actions(config, tmp_state_dict.keys())
1018
+ else:
1019
+ tensor_parallel_split_mapping = None
1020
+
1021
+ model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
1022
+ model,
1023
+ resolved_model_files,
1024
+ pretrained_model_name_or_path,
1025
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
1026
+ ignore_keys=ignore_keys,
1027
+ from_diffusers=from_diffusers,
1028
+ tensor_parallel_split_mapping=tensor_parallel_split_mapping,
1029
+ tensor_parallel_degree=tensor_parallel_degree,
1030
+ )
1031
+
1032
+ loading_info = {
1033
+ "missing_keys": missing_keys,
1034
+ "unexpected_keys": unexpected_keys,
1035
+ "mismatched_keys": mismatched_keys,
1036
+ "error_msgs": error_msgs,
1037
+ }
1038
+
1039
+ if paddle_dtype is not None:
1040
+ model = model.to(dtype=paddle_dtype)
1041
+
1042
+ model.register_to_config(_name_or_path=pretrained_model_name_or_path)
1043
+
1044
+ # Set model in evaluation mode to deactivate DropOut modules by default
1045
+ model.eval()
1046
+ if output_loading_info:
1047
+ return model, loading_info
1048
+
1049
+ return model
1050
+
1051
+ @classmethod
1052
+ def custom_modify_weight(cls, model_to_load, state_dict):
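+ # Hook for subclasses: adjust state_dict in place right before it is loaded into model_to_load; the base implementation is a no-op.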
1053
+ pass
1054
+
1055
+ @classmethod
1056
+ def _load_pretrained_model(
1057
+ cls,
1058
+ model: "ModelMixin",
1059
+ resolved_model_files,
1060
+ pretrained_model_name_or_path: Union[str, os.PathLike],
1061
+ ignore_mismatched_sizes: bool = False,
1062
+ ignore_keys=None,
1063
+ from_diffusers=False,
1064
+ tensor_parallel_split_mapping=None,
1065
+ tensor_parallel_degree=1,
1066
+ ):
1067
+ state_dict = OrderedDict()
1068
+ model_state_dict = model.state_dict()
1069
+ loaded_keys = []
1070
+ expected_keys = list(model_state_dict.keys())
1071
+ error_msgs = []
1072
+ mismatched_keys = []
1073
+
1074
+ if len(resolved_model_files) > 1:
1075
+ resolved_model_files = tqdm(resolved_model_files, desc="Loading checkpoint shards")
1076
+ if tensor_parallel_degree > 1:
1077
+ raise NotImplementedError("Tensor parallel is not supported for multiple shards yet.")
1078
+
1079
+ # load shard state dict
1080
+ for shard_file in resolved_model_files:
1081
+ data_format = load_state_dict(
1082
+ shard_file,
1083
+ state_dict, # inplace update state_dict
1084
+ tensor_parallel_split_mapping=tensor_parallel_split_mapping,
1085
+ ignore_keys=ignore_keys,
1086
+ )
1087
+ # NOTE: newly added to support loading old-format (deprecated) state_dicts
1088
+ model._update_deprecated_state_dict(state_dict)
1089
+ # NOTE: convert old model state dict!
1090
+ model._convert_deprecated_attention_blocks(state_dict)
1091
+
1092
+ # NOTE: convert torch model state dict!
1093
+ if from_diffusers or data_format in ["pt"]:
1094
+ convert_pytorch_state_dict_to_paddle(model, state_dict)
1095
+
1096
+ original_loaded_keys = list(state_dict.keys())
1097
+ loaded_keys.extend(original_loaded_keys)
1098
+
1099
+ # Make sure we are able to load base models as well as derived models (with heads)
1100
+ model_to_load = model
1101
+
1102
+ def _find_mismatched_keys(
1103
+ state_dict,
1104
+ model_state_dict,
1105
+ loaded_keys,
1106
+ ignore_mismatched_sizes,
1107
+ ):
1108
+ mismatched_keys = []
1109
+ for checkpoint_key in loaded_keys:
1110
+ model_key = checkpoint_key
1111
+
1112
+ if model_key in model_state_dict and list(state_dict[checkpoint_key].shape) != list(
1113
+ model_state_dict[model_key].shape
1114
+ ):
1115
+ mismatched_keys.append(
1116
+ (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
1117
+ )
1118
+ del state_dict[checkpoint_key]
1119
+ if ignore_mismatched_sizes:
1120
+ mismatched_keys = []
1121
+ return mismatched_keys
1122
+
1123
+ if state_dict is not None and len(state_dict) > 0:
1124
+ _mismatched_keys = _find_mismatched_keys(
1125
+ state_dict,
1126
+ model_state_dict,
1127
+ original_loaded_keys,
1128
+ ignore_mismatched_sizes,
1129
+ )
1130
+ mismatched_keys.extend(_mismatched_keys)
1131
+ for key_name, loaded_shape, model_shape in _mismatched_keys:
1132
+ error_msgs.append(
1133
+ f"Error: size mismatch for {key_name}, the checkpoint provides shape {loaded_shape} but the expected shape is {model_shape}."
1134
+ )
1135
+ cls.custom_modify_weight(model_to_load, state_dict)
1136
+ faster_set_state_dict(model_to_load, state_dict)
1137
+
1138
+ missing_keys = sorted(list(set(expected_keys) - set(loaded_keys)))
1139
+ unexpected_keys = sorted(list(set(loaded_keys) - set(expected_keys)))
1140
+
1141
+ if len(error_msgs) > 0:
1142
+ error_msg = "\n\t".join(error_msgs)
1143
+ if "size mismatch" in error_msg:
1144
+ error_msg += (
1145
+ "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
1146
+ )
1147
+ raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
1148
+
1149
+ if len(unexpected_keys) > 0:
1150
+ logger.warning(
1151
+ f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
1152
+ f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
1153
+ f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
1154
+ " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
1155
+ " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
1156
+ f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
1157
+ " identical (initializing a BertForSequenceClassification model from a"
1158
+ " BertForSequenceClassification model)."
1159
+ )
1160
+ else:
1161
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
1162
+ if len(missing_keys) > 0:
1163
+ logger.warning(
1164
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
1165
+ f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
1166
+ " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
1167
+ )
1168
+ elif len(mismatched_keys) == 0:
1169
+ logger.info(
1170
+ f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
1171
+ f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
1172
+ f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
1173
+ " without further training."
1174
+ )
1175
+ if len(mismatched_keys) > 0:
1176
+ mismatched_warning = "\n".join(
1177
+ [
1178
+ f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
1179
+ for key, shape1, shape2 in mismatched_keys
1180
+ ]
1181
+ )
1182
+ logger.warning(
1183
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
1184
+ f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
1185
+ f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
1186
+ " able to use it for predictions and inference."
1187
+ )
1188
+ del state_dict
1189
+ gc.collect()
1190
+ return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
1191
+
1192
+ @property
1193
+ def device(self):
1194
+ """
1195
+ `paddle.place`: The device on which the module is (assuming that all the module parameters are on the same
1196
+ device).
1197
+ """
1198
+ return get_parameter_device(self)
1199
+
1200
+ @property
1201
+ def dtype(self) -> paddle.dtype:
1202
+ """
1203
+ `paddle.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
1204
+ """
1205
+ return get_parameter_dtype(self)
1206
+
1207
+ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
1208
+ """
1209
+ Get number of (trainable or non-embedding) parameters in the module.
1210
+
1211
+ Args:
1212
+ only_trainable (`bool`, *optional*, defaults to `False`):
1213
+ Whether or not to return only the number of trainable parameters.
1214
+
1215
+ exclude_embeddings (`bool`, *optional*, defaults to `False`):
1216
+ Whether or not to return only the number of non-embedding parameters.
1217
+
1218
+ Returns:
1219
+ `int`: The number of parameters.
1220
+
1221
+ Example:
1222
+ ```py
1223
+ from ppdiffusers import UNet2DConditionModel
1224
+ model_id = "runwayml/stable-diffusion-v1-5"
1225
+ unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
1226
+ unet.num_parameters(only_trainable=True)
1227
+ 859520964
1228
+ ```
1229
+ """
1230
+
1231
+ if exclude_embeddings:
1232
+ embedding_param_names = [
1233
+ f"{name}.weight"
1234
+ for name, module_type in self.named_sublayers(include_self=True)
1235
+ if isinstance(module_type, nn.Embedding)
1236
+ ]
1237
+ non_embedding_parameters = [
1238
+ parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
1239
+ ]
1240
+ return sum(p.numel() for p in non_embedding_parameters if not p.stop_gradient or not only_trainable)
1241
+ else:
1242
+ return sum(p.numel() for p in self.parameters() if not p.stop_gradient or not only_trainable)
1243
+
1244
+ def _convert_deprecated_attention_blocks(self, state_dict: OrderedDict) -> None:
1245
+ deprecated_attention_block_paths = []
1246
+
1247
+ def recursive_find_attn_block(name, module):
1248
+ if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
1249
+ deprecated_attention_block_paths.append(name)
1250
+
1251
+ for sub_name, sub_module in module.named_children():
1252
+ sub_name = sub_name if name == "" else f"{name}.{sub_name}"
1253
+ recursive_find_attn_block(sub_name, sub_module)
1254
+
1255
+ recursive_find_attn_block("", self)
1256
+
1257
+ # NOTE: we have to check if the deprecated parameters are in the state dict
1258
+ # because it is possible we are loading from a state dict that was already
1259
+ # converted
1260
+
1261
+ for path in deprecated_attention_block_paths:
1262
+ # group_norm path stays the same
1263
+
1264
+ # query -> to_q
1265
+ if f"{path}.query.weight" in state_dict:
1266
+ state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
1267
+ if f"{path}.query.bias" in state_dict:
1268
+ state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias")
1269
+
1270
+ # key -> to_k
1271
+ if f"{path}.key.weight" in state_dict:
1272
+ state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight")
1273
+ if f"{path}.key.bias" in state_dict:
1274
+ state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias")
1275
+
1276
+ # value -> to_v
1277
+ if f"{path}.value.weight" in state_dict:
1278
+ state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight")
1279
+ if f"{path}.value.bias" in state_dict:
1280
+ state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias")
1281
+
1282
+ # proj_attn -> to_out.0
1283
+ if f"{path}.proj_attn.weight" in state_dict:
1284
+ state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight")
1285
+ if f"{path}.proj_attn.bias" in state_dict:
1286
+ state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")
1287
+
1288
+ def _temp_convert_self_to_deprecated_attention_blocks(self) -> None:
1289
+ deprecated_attention_block_modules = []
1290
+
1291
+ def recursive_find_attn_block(module):
1292
+ if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
1293
+ deprecated_attention_block_modules.append(module)
1294
+
1295
+ for sub_module in module.children():
1296
+ recursive_find_attn_block(sub_module)
1297
+
1298
+ recursive_find_attn_block(self)
1299
+
1300
+ for module in deprecated_attention_block_modules:
1301
+ module.query = module.to_q
1302
+ module.key = module.to_k
1303
+ module.value = module.to_v
1304
+ module.proj_attn = module.to_out[0]
1305
+
1306
+ # We don't _have_ to delete the old attributes, but it's helpful to ensure
1307
+ # that _all_ the weights are loaded into the new attributes and we're not
1308
+ # making an incorrect assumption that this model should be converted when
1309
+ # it really shouldn't be.
1310
+ del module.to_q
1311
+ del module.to_k
1312
+ del module.to_v
1313
+ del module.to_out
1314
+
1315
+ def _undo_temp_convert_self_to_deprecated_attention_blocks(self) -> None:
1316
+ deprecated_attention_block_modules = []
1317
+
1318
+ def recursive_find_attn_block(module) -> None:
1319
+ if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
1320
+ deprecated_attention_block_modules.append(module)
1321
+
1322
+ for sub_module in module.children():
1323
+ recursive_find_attn_block(sub_module)
1324
+
1325
+ recursive_find_attn_block(self)
1326
+
1327
+ for module in deprecated_attention_block_modules:
1328
+ module.to_q = module.query
1329
+ module.to_k = module.key
1330
+ module.to_v = module.value
1331
+ module.to_out = nn.LayerList([module.proj_attn, nn.Dropout(module.dropout)])
1332
+
1333
+ del module.query
1334
+ del module.key
1335
+ del module.value
1336
+ del module.proj_attn
1337
+
1338
+ @classmethod
1339
+ def _update_deprecated_state_dict(cls, state_dict=None, loaded_keys=None, model=None):
1340
+ if state_dict is None:
1341
+ return loaded_keys
1342
+ _deprecated_dict = getattr(cls, "_deprecated_dict", None)
1343
+ from_deprecated_state_dict = _deprecated_dict is not None and any(
1344
+ cls._deprecated_dict.get("key", "NONE") in all_key for all_key in state_dict.keys()
1345
+ )
1346
+ if from_deprecated_state_dict:
1347
+ logger.warning(
1348
+ "Loading from deprecated state_dict, please load new state_dict via setting `use_safetensors=True`."
1349
+ )
1350
+ for name in list(state_dict.keys()):
1351
+ deprecated_name = name
1352
+ for old_name, new_name in cls._deprecated_dict.get("name_mapping", {}).items():
1353
+ name = name.replace(old_name, new_name)
1354
+ state_dict[name] = state_dict.pop(deprecated_name)
1355
+ loaded_keys = list(state_dict.keys())
1356
+ return loaded_keys
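
The loading path above resolves, in order, a sharded safetensors index, a sharded paddle/torch index, a single safetensors file, and finally a single weights file, retrying across variants before giving up. A minimal sketch of how this surfaces through `from_pretrained` (the model id and subfolder reuse the docstring example above; the flags are optional):

```python
import paddle
from ppdiffusers import UNet2DConditionModel

# `use_safetensors` steers the file-resolution fallbacks implemented above;
# `output_loading_info=True` returns the missing/unexpected/mismatched-key report
# assembled by `_load_pretrained_model`.
unet, loading_info = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    subfolder="unet",
    paddle_dtype=paddle.float16,
    use_safetensors=True,
    output_loading_info=True,
)
print(loading_info["missing_keys"], loading_info["unexpected_keys"])
```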
PaddleMIX/ppdiffusers/ppdiffusers/models/modelscope_gaussion_sdedit.py ADDED
@@ -0,0 +1,451 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ import random
17
+
18
+ import paddle
19
+ from tqdm.auto import trange
20
+
21
+
22
+ def _logsnr_cosine(n, logsnr_min=-15, logsnr_max=15):
23
+ t_min = math.atan(math.exp(-0.5 * logsnr_min))
24
+ t_max = math.atan(math.exp(-0.5 * logsnr_max))
25
+ t = paddle.linspace(1, 0, n)
26
+ logsnrs = -2 * paddle.log(paddle.tan(t_min + t * (t_max - t_min)))
27
+ return logsnrs
28
+
29
+
30
+ def _logsnr_cosine_shifted(n, logsnr_min=-15, logsnr_max=15, scale=2):
31
+ logsnrs = _logsnr_cosine(n, logsnr_min, logsnr_max)
32
+ logsnrs += 2 * math.log(1 / scale)
33
+ return logsnrs
34
+
35
+
36
+ def logsnrs_to_sigmas(logsnrs):
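+ # With a variance-preserving schedule (alpha^2 + sigma^2 = 1) and logSNR = log(alpha^2 / sigma^2), sigma = sqrt(sigmoid(-logSNR)).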
37
+ return paddle.sqrt(paddle.nn.functional.sigmoid(-logsnrs))
38
+
39
+
40
+ def _logsnr_cosine_interp(n, logsnr_min=-15, logsnr_max=15, scale_min=2, scale_max=4):
41
+ t = paddle.linspace(1, 0, n)
42
+ logsnrs_min = _logsnr_cosine_shifted(n, logsnr_min, logsnr_max, scale_min)
43
+ logsnrs_max = _logsnr_cosine_shifted(n, logsnr_min, logsnr_max, scale_max)
44
+ logsnrs = t * logsnrs_min + (1 - t) * logsnrs_max
45
+ return logsnrs
46
+
47
+
48
+ def logsnr_cosine_interp_schedule(n, logsnr_min=-15, logsnr_max=15, scale_min=2, scale_max=4):
49
+ return logsnrs_to_sigmas(_logsnr_cosine_interp(n, logsnr_min, logsnr_max, scale_min, scale_max))
50
+
51
+
52
+ def noise_schedule(schedule="logsnr_cosine_interp", n=1000, zero_terminal_snr=False, **kwargs):
53
+ # compute sigmas
54
+ sigmas = {"logsnr_cosine_interp": logsnr_cosine_interp_schedule}[schedule](n, **kwargs)
55
+
56
+ # post-processing: optionally rescale so the largest sigma reaches 1.0 (zero terminal SNR)
57
+ if zero_terminal_snr and sigmas.max() != 1.0:
58
+ scale = (1.0 - sigmas.min()) / (sigmas.max() - sigmas.min())
59
+ sigmas = sigmas.min() + scale * (sigmas - sigmas.min())
60
+ return sigmas
61
+
62
+
63
+ def _i(tensor, t, x):
64
+ r"""Index tensor using t and format the output according to x."""
65
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1)
66
+ if tensor.place != x.place:
67
+ tensor = paddle.to_tensor(tensor, place=x.place)
68
+ return tensor[t].reshape(shape).astype(x.dtype)
69
+
70
+
71
+ def get_scalings(sigma):
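+ # c_in = 1 / sqrt(sigma^2 + 1) rescales the noisy input before it is fed to the denoiser; c_out is unused by the samplers in this file.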
72
+ c_out = -sigma
73
+ c_in = 1 / (sigma**2 + 1.0**2) ** 0.5
74
+ return c_out, c_in
75
+
76
+
77
+ def karras_schedule(n, sigma_min=0.002, sigma_max=80.0, rho=7.0):
78
+ ramp = paddle.linspace(1, 0, n)
79
+ min_inv_rho = sigma_min ** (1 / rho)
80
+ max_inv_rho = sigma_max ** (1 / rho)
81
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
82
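+ # Map the Karras (variance-exploding) sigmas into the variance-preserving range: sigma / sqrt(1 + sigma^2).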
+ sigmas = paddle.sqrt(sigmas**2 / (1 + sigmas**2))
83
+ return sigmas
84
+
85
+
86
+ @paddle.no_grad()
87
+ def sample_heun(noise, model, sigmas, s_churn=0.0, s_tmin=0.0, s_tmax=float("inf"), s_noise=1.0, show_progress=True):
88
+ """
89
+ Implements Algorithm 2 (Heun steps) from Karras et al. (2022).
90
+ """
91
+ x = noise * sigmas[0]
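+ # Per step: optionally "churn" extra noise into the sample (gamma > 0 lifts sigma to sigma_hat),
+ # take an Euler step toward the denoised estimate, then apply Heun's second-order correction
+ # unless the next sigma is zero.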
92
+ for i in trange(len(sigmas) - 1, disable=not show_progress):
93
+ gamma = 0.0
94
+ if s_tmin <= sigmas[i] <= s_tmax and sigmas[i] < float("inf"):
95
+ gamma = min(s_churn / (len(sigmas) - 1), 2**0.5 - 1)
96
+ eps = paddle.randn(shape=x.shape, dtype=x.dtype) * s_noise
97
+ sigma_hat = sigmas[i] * (gamma + 1)
98
+ if gamma > 0:
99
+ x = x + eps * (sigma_hat**2 - sigmas[i] ** 2) ** 0.5
100
+ if sigmas[i] == float("inf"):
101
+ # Euler method
102
+ denoised = model(noise, sigma_hat)
103
+ x = denoised + sigmas[i + 1] * (gamma + 1) * noise
104
+ else:
105
+ _, c_in = get_scalings(sigma_hat)
106
+ denoised = model(x * c_in, sigma_hat)
107
+ d = (x - denoised) / sigma_hat
108
+ dt = sigmas[i + 1] - sigma_hat
109
+ if sigmas[i + 1] == 0:
110
+ # Euler method
111
+ x = x + d * dt
112
+ else:
113
+ # Heun's method
114
+ x_2 = x + d * dt
115
+ _, c_in = get_scalings(sigmas[i + 1])
116
+ denoised_2 = model(x_2 * c_in, sigmas[i + 1])
117
+ d_2 = (x_2 - denoised_2) / sigmas[i + 1]
118
+ d_prime = (d + d_2) / 2
119
+ x = x + d_prime * dt
120
+ return x
121
+
122
+
123
+ class BatchedBrownianTree:
124
+ """
125
+ A wrapper around paddlesde.BrownianTree that enables batches of entropy.
126
+ """
127
+
128
+ def __init__(self, x, t0, t1, seed=None, **kwargs):
129
+ import paddlesde
130
+
131
+ t0, t1, self.sign = self.sort(t0, t1)
132
+ w0 = kwargs.get("w0", paddle.zeros_like(x))
133
+ if seed is None:
134
+ seed = paddle.randint(0, 2**31 - 1, []).item()
135
+ self.batched = True
136
+ try:
137
+ assert len(seed) == x.shape[0]
138
+ w0 = w0[0]
139
+ except TypeError:
140
+ seed = [seed]
141
+ self.batched = False
142
+ self.trees = [paddlesde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed]
143
+
144
+ @staticmethod
145
+ def sort(a, b):
146
+ return (a, b, 1) if a < b else (b, a, -1)
147
+
148
+ def __call__(self, t0, t1):
149
+ t0, t1, sign = self.sort(t0, t1)
150
+ w = paddle.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign)
151
+ return w if self.batched else w[0]
152
+
153
+
154
+ class BrownianTreeNoiseSampler:
155
+ """
156
+ A noise sampler backed by a paddlesde.BrownianTree.
157
+
158
+ Args:
159
+ x (Tensor): The tensor whose shape, device and dtype to use to generate
160
+ random samples.
161
+ sigma_min (float): The low end of the valid interval.
162
+ sigma_max (float): The high end of the valid interval.
163
+ seed (int or List[int]): The random seed. If a list of seeds is
164
+ supplied instead of a single integer, then the noise sampler will
165
+ use one BrownianTree per batch item, each with its own seed.
166
+ transform (callable): A function that maps sigma to the sampler's
167
+ internal timestep.
168
+ """
169
+
170
+ def __init__(self, x, sigma_min, sigma_max, seed=None, transform=lambda x: x):
171
+ self.transform = transform
172
+ t0 = self.transform(paddle.to_tensor(sigma_min))
173
+ t1 = self.transform(paddle.to_tensor(sigma_max))
174
+ self.tree = BatchedBrownianTree(x, t0, t1, seed)
175
+
176
+ def __call__(self, sigma, sigma_next):
177
+ t0 = self.transform(paddle.to_tensor(sigma))
178
+ t1 = self.transform(paddle.to_tensor(sigma_next))
179
+ return self.tree(t0, t1) / (t1 - t0).abs().sqrt()
180
+
181
+
182
+ @paddle.no_grad()
183
+ def sample_dpmpp_2m_sde(noise, model, sigmas, eta=1.0, s_noise=1.0, solver_type="midpoint", show_progress=True):
184
+ """
185
+ DPM-Solver++ (2M) SDE.
186
+ """
187
+ assert solver_type in {"heun", "midpoint"}
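+ # Multistep exponential integrator in log-sigma time: each update combines the current and
+ # previous denoised estimates ("heun" or "midpoint" correction), and `eta` scales the
+ # Brownian-tree noise injected per step.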
188
+
189
+ x = noise * sigmas[0]
190
+ sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas[sigmas < float("inf")].max()
191
+ noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max)
192
+ old_denoised = None
193
+ h_last = None
194
+
195
+ for i in trange(len(sigmas) - 1, disable=not show_progress):
196
+ if sigmas[i] == float("inf"):
197
+ # Euler method
198
+ denoised = model(noise, sigmas[i])
199
+ x = denoised + sigmas[i + 1] * noise
200
+ else:
201
+ _, c_in = get_scalings(sigmas[i])
202
+ denoised = model(x * c_in, sigmas[i])
203
+ if sigmas[i + 1] == 0:
204
+ # Denoising step
205
+ x = denoised
206
+ else:
207
+ # DPM-Solver++(2M) SDE
208
+ t, s = -sigmas[i].log(), -sigmas[i + 1].log()
209
+ h = s - t
210
+ eta_h = eta * h
211
+
212
+ x = sigmas[i + 1] / sigmas[i] * (-eta_h).exp() * x + (-h - eta_h).expm1().neg() * denoised
213
+ if old_denoised is not None:
214
+ r = h_last / h
215
+ if solver_type == "heun":
216
+ x = x + ((-h - eta_h).expm1().neg() / (-h - eta_h) + 1) * (1 / r) * (denoised - old_denoised)
217
+ elif solver_type == "midpoint":
218
+ x = x + 0.5 * (-h - eta_h).expm1().neg() * (1 / r) * (denoised - old_denoised)
219
+
220
+ x = (
221
+ x
222
+ + noise_sampler(sigmas[i], sigmas[i + 1])
223
+ * sigmas[i + 1]
224
+ * (-2 * eta_h).expm1().neg().sqrt()
225
+ * s_noise
226
+ )
227
+
228
+ old_denoised = denoised
229
+ h_last = h
230
+ return x
231
+
232
+
233
+ class GaussianDiffusion_SDEdit(object):
234
+ def __init__(self, sigmas, prediction_type="eps"):
235
+ assert prediction_type in {"x0", "eps", "v"}
236
+ self.sigmas = sigmas
237
+ self.alphas = paddle.sqrt(1 - sigmas**2)
238
+ self.num_timesteps = len(sigmas)
239
+ self.prediction_type = prediction_type
240
+
241
+ def diffuse(self, x0, t, noise=None):
242
+ noise = paddle.randn(shape=x0.shape, dtype=x0.dtype) if noise is None else noise
243
+ xt = _i(self.alphas, t, x0) * x0 + _i(self.sigmas, t, x0) * noise
244
+ return xt
245
+
246
+ def denoise(
247
+ self, xt, t, s, model, model_kwargs={}, guide_scale=None, guide_rescale=None, clamp=None, percentile=None
248
+ ):
249
+ s = t - 1 if s is None else s
250
+
251
+ # hyperparams
252
+ sigmas = _i(self.sigmas, t, xt)
253
+ alphas = _i(self.alphas, t, xt)
254
+ alphas_s = _i(self.alphas, s.clip(0), xt)
255
+ alphas_s[s < 0] = 1.0
256
+ sigmas_s = paddle.sqrt(1 - alphas_s**2)
257
+
258
+ # precompute the coefficients of the posterior q(x_s | x_t, x_0)
259
+ betas = 1 - (alphas / alphas_s) ** 2
260
+ coef1 = betas * alphas_s / sigmas**2
261
+ coef2 = (alphas * sigmas_s**2) / (alphas_s * sigmas**2)
262
+ var = betas * (sigmas_s / sigmas) ** 2
263
+ log_var = paddle.log(var).clip_(-20, 20)
264
+
265
+ # prediction
266
+ if guide_scale is None:
267
+ assert isinstance(model_kwargs, dict)
268
+ out = model(xt, t=t, **model_kwargs).sample
269
+ else:
270
+ # classifier-free guidance
271
+ assert isinstance(model_kwargs, list) and len(model_kwargs) == 2
272
+ y_out = model(xt, t=t, **model_kwargs[0]).sample
273
+ if guide_scale == 1.0:
274
+ out = y_out
275
+ else:
276
+ u_out = model(xt, t=t, **model_kwargs[1]).sample
277
+ out = u_out + guide_scale * (y_out - u_out)
278
+
279
+ if guide_rescale is not None:
280
+ assert 0 <= guide_rescale <= 1
281
+ ratio = (
282
+ paddle.std(y_out.flatten(1), axis=1) / (paddle.std(out.flatten(1), axis=1) + 1e-12) # noqa
283
+ ).reshape(list((-1,) + (1,) * (y_out.ndim - 1)))
284
+ out *= guide_rescale * ratio + (1 - guide_rescale) * 1.0
285
+
286
+ # compute x0
287
+ if self.prediction_type == "x0":
288
+ x0 = out
289
+ elif self.prediction_type == "eps":
290
+ x0 = (xt - sigmas * out) / alphas
291
+ elif self.prediction_type == "v":
292
+ x0 = alphas * xt - sigmas * out
293
+ else:
294
+ raise NotImplementedError(f"prediction_type {self.prediction_type} not implemented")
295
+
296
+ # restrict the range of x0
297
+ if percentile is not None:
298
+ assert 0 < percentile <= 1
299
+ s = paddle.quantile(x0.flatten(1).abs(), percentile, axis=1).clip_(1.0).reshape([-1, 1, 1, 1])
300
+ x0 = paddle.minimum(s, paddle.maximum(-s, x0)) / s
301
+ elif clamp is not None:
302
+ x0 = x0.clip_(-clamp, clamp)
303
+
304
+ # recompute eps using the restricted x0
305
+ eps = (xt - alphas * x0) / sigmas
306
+
307
+ # compute mu (mean of posterior distribution) using the restricted x0
308
+ mu = coef1 * x0 + coef2 * xt
309
+ return mu, var, log_var, x0, eps
310
+
311
+ @paddle.no_grad()
312
+ def sample(
313
+ self,
314
+ noise,
315
+ model,
316
+ model_kwargs={},
317
+ condition_fn=None,
318
+ guide_scale=None,
319
+ guide_rescale=None,
320
+ clamp=None,
321
+ percentile=None,
322
+ solver="euler_a",
323
+ steps=20,
324
+ t_max=None,
325
+ t_min=None,
326
+ discretization=None,
327
+ discard_penultimate_step=None,
328
+ return_intermediate=None,
329
+ show_progress=False,
330
+ seed=-1,
331
+ **kwargs
332
+ ):
333
+ # sanity check
334
+ assert isinstance(steps, (int, paddle.Tensor))
335
+ assert t_max is None or (0 < t_max <= self.num_timesteps - 1)
336
+ assert t_min is None or (0 <= t_min < self.num_timesteps - 1)
337
+ assert discretization in (None, "leading", "linspace", "trailing")
338
+ assert discard_penultimate_step in (None, True, False)
339
+ assert return_intermediate in (None, "x0", "xt")
340
+
341
+ # function of diffusion solver
342
+ solver_fn = {"heun": sample_heun, "dpmpp_2m_sde": sample_dpmpp_2m_sde}[solver]
343
+
344
+ # options
345
+ schedule = "karras" if "karras" in solver else None
346
+ discretization = discretization or "linspace"
347
+ seed = seed if seed >= 0 else random.randint(0, 2**31)
348
+
349
+ if isinstance(steps, paddle.Tensor):
350
+ discard_penultimate_step = False
351
+ if discard_penultimate_step is None:
352
+ discard_penultimate_step = (
353
+ True
354
+ if solver
355
+ in (
356
+ "dpm2",
357
+ "dpm2_ancestral",
358
+ "dpmpp_2m_sde",
359
+ "dpm2_karras",
360
+ "dpm2_ancestral_karras",
361
+ "dpmpp_2m_sde_karras",
362
+ )
363
+ else False
364
+ )
365
+
366
+ # function for denoising xt to get x0
367
+ intermediates = []
368
+
369
+ def model_fn(xt, sigma):
370
+ # denoising
371
+ t = self._sigma_to_t(sigma).tile(len(xt)).round().astype("int64")
372
+ x0 = self.denoise(xt, t, None, model, model_kwargs, guide_scale, guide_rescale, clamp, percentile)[-2]
373
+
374
+ # collect intermediate outputs
375
+ if return_intermediate == "xt":
376
+ intermediates.append(xt)
377
+ elif return_intermediate == "x0":
378
+ intermediates.append(x0)
379
+ return x0
380
+
381
+ # get timesteps
382
+ if isinstance(steps, int):
383
+ steps += 1 if discard_penultimate_step else 0
384
+ t_max = self.num_timesteps - 1 if t_max is None else t_max
385
+ t_min = 0 if t_min is None else t_min
386
+
387
+ # discretize timesteps
388
+ if discretization == "leading":
389
+ steps = paddle.arange(t_min, t_max + 1, (t_max - t_min + 1) / steps).flip(0)
390
+ elif discretization == "linspace":
391
+ steps = paddle.linspace(t_max, t_min, steps)
392
+ elif discretization == "trailing":
393
+ steps = paddle.arange(t_max, t_min - 1, -((t_max - t_min + 1) / steps))
394
+ else:
395
+ raise NotImplementedError(f"{discretization} discretization not implemented")
396
+ steps = steps.clip_(t_min, t_max)
397
+ steps = paddle.to_tensor(steps, dtype=paddle.float32, place=noise.place)
398
+
399
+ # get sigmas
400
+ sigmas = self._t_to_sigma(steps)
401
+ sigmas = paddle.concat([sigmas, paddle.zeros([1]).astype(sigmas.dtype)])
402
+ if schedule == "karras":
403
+ if sigmas[0] == float("inf"):
404
+ sigmas = karras_schedule(
405
+ n=len(steps) - 1,
406
+ sigma_min=sigmas[sigmas > 0].min().item(),
407
+ sigma_max=sigmas[sigmas < float("inf")].max().item(),
408
+ rho=7.0,
409
+ ).to(sigmas)
410
+ sigmas = paddle.concat(
411
+ [paddle.full([1], float("inf"), dtype=sigmas.dtype), sigmas, paddle.zeros([1]).astype(sigmas.dtype)]
412
+ )
413
+ else:
414
+ sigmas = karras_schedule(
415
+ n=len(steps), sigma_min=sigmas[sigmas > 0].min().item(), sigma_max=sigmas.max().item(), rho=7.0
416
+ ).to(sigmas)
417
+ sigmas = paddle.concat([sigmas, paddle.zeros([1]).astype(sigmas.dtype)])
418
+ if discard_penultimate_step:
419
+ sigmas = paddle.concat([sigmas[:-2], sigmas[-1:]])
420
+
421
+ # sampling
422
+ x0 = solver_fn(noise, model_fn, sigmas, show_progress=show_progress, **kwargs)
423
+ return (x0, intermediates) if return_intermediate is not None else x0
424
+
425
+ def _sigma_to_t(self, sigma):
426
+ if sigma == float("inf"):
427
+ t = paddle.full_like(sigma, len(self.sigmas) - 1)
428
+ else:
429
+ log_sigmas = paddle.sqrt(self.sigmas**2 / (1 - self.sigmas**2)).log().astype(sigma.dtype) # noqa
430
+ log_sigma = sigma.log()
431
+ dists = log_sigma - log_sigmas[:, None]
432
+
433
+ low_idx = dists.greater_equal(paddle.to_tensor(0, dtype=dists.dtype)).astype(dists.dtype)
434
+ low_idx = paddle.cumsum(low_idx, axis=0).argmax(axis=0).clip_(max=log_sigmas.shape[0] - 2)
435
+ high_idx = low_idx + 1
436
+ low, high = log_sigmas[low_idx], log_sigmas[high_idx]
437
+ w = (low - log_sigma) / (low - high)
438
+ w = w.clip_(0, 1)
439
+ t = (1 - w) * low_idx + w * high_idx
440
+ t = t.reshape(sigma.shape)
441
+ if t.ndim == 0:
442
+ t = t.unsqueeze(0)
443
+ return t
444
+
445
+ def _t_to_sigma(self, t):
446
+ t = t.astype("float32")
447
+ low_idx, high_idx, w = t.floor().astype("int64"), t.ceil().astype("int64"), t.frac()
448
+ log_sigmas = paddle.sqrt(self.sigmas**2 / (1 - self.sigmas**2)).log().astype(t.dtype) # noqa
449
+ log_sigma = (1 - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx]
450
+ log_sigma[paddle.isnan(log_sigma) | paddle.isinf(log_sigma)] = float("inf")
451
+ return log_sigma.exp()
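
Taken together, `noise_schedule` produces the sigma table and `GaussianDiffusion_SDEdit` consumes it for forward diffusion and sampling. A minimal sketch under two assumptions: the import path simply mirrors this file's location, and `unet` stands in for any hypothetical model whose forward returns an output with a `.sample` field (such as the ST-UNet below):

```python
import paddle
from ppdiffusers.models.modelscope_gaussion_sdedit import (
    GaussianDiffusion_SDEdit,
    noise_schedule,
)

sigmas = noise_schedule("logsnr_cosine_interp", n=1000, zero_terminal_snr=True)
diffusion = GaussianDiffusion_SDEdit(sigmas, prediction_type="v")

# Forward diffusion of clean video latents (b, c, f, h, w) to timestep 900.
latents = paddle.randn([1, 4, 16, 32, 32])
noisy = diffusion.diffuse(latents, t=paddle.to_tensor([900]))

# Reverse sampling (commented out: `unet` and `text_embeddings` are placeholders).
# video = diffusion.sample(noise=noisy, model=unet, model_kwargs={"y": text_embeddings},
#                          solver="dpmpp_2m_sde", steps=30)
```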
PaddleMIX/ppdiffusers/ppdiffusers/models/modelscope_st_unet_video2video.py ADDED
@@ -0,0 +1,409 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+ import paddle.nn as nn
17
+ import paddle.nn.functional as F
18
+ from einops import rearrange
19
+
20
+ from ..configuration_utils import register_to_config
21
+ from .lvdm_util import avg_pool_nd
22
+ from .modelscope_st_unet import (
23
+ ResBlock,
24
+ SpatialTransformer,
25
+ STUNetModel,
26
+ STUNetOutput,
27
+ TemporalAttentionMultiBlock,
28
+ TemporalTransformer,
29
+ default,
30
+ prob_mask_like,
31
+ sinusoidal_embedding_paddle,
32
+ )
33
+
34
+ USE_TEMPORAL_TRANSFORMER = True
35
+
36
+
37
+ class Downsample(nn.Layer):
38
+ """
39
+ A downsampling layer with an optional convolution.
40
+ :param channels: channels in the inputs and outputs.
41
+ :param use_conv: a bool determining if a convolution is applied.
42
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
43
+ downsampling occurs in the inner-two dimensions.
44
+ """
45
+
46
+ def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=(2, 1)):
47
+ super().__init__()
48
+ self.channels = channels
49
+ self.out_channels = out_channels or channels
50
+ self.use_conv = use_conv
51
+ self.dims = dims
52
+ stride = 2 if dims != 3 else (1, 2, 2)
53
+ if use_conv:
54
+ self.op = nn.Conv2D(self.channels, self.out_channels, 3, stride=stride, padding=padding)
55
+ else:
56
+ assert self.channels == self.out_channels
57
+ self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
58
+
59
+ def forward(self, x):
60
+ assert x.shape[1] == self.channels
61
+ return self.op(x)
62
+
63
+
64
+ class Upsample(nn.Layer):
65
+ """
66
+ An upsampling layer with an optional convolution.
67
+ :param channels: channels in the inputs and outputs.
68
+ :param use_conv: a bool determining if a convolution is applied.
69
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
70
+ upsampling occurs in the inner-two dimensions.
71
+ """
72
+
73
+ def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
74
+ super().__init__()
75
+ self.channels = channels
76
+ self.out_channels = out_channels or channels
77
+ self.use_conv = use_conv
78
+ self.dims = dims
79
+ if use_conv:
80
+ self.conv = nn.Conv2D(self.channels, self.out_channels, 3, padding=padding)
81
+
82
+ def forward(self, x):
83
+ assert x.shape[1] == self.channels
84
+ if self.dims == 3:
85
+ x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest")
86
+ else:
87
+ x = F.interpolate(x, scale_factor=2, mode="nearest")
88
+ x = x[..., 1:-1, :]
89
+ if self.use_conv:
90
+ x = self.conv(x)
91
+ return x
92
+
93
+
94
+ class Vid2VidSTUNet(STUNetModel):
95
+ @register_to_config
96
+ def __init__(
97
+ self,
98
+ in_channels=4,
99
+ out_channels=4,
100
+ dim=320,
101
+ y_dim=1024,
102
+ context_channels=1024,
103
+ dim_mult=[1, 2, 4, 4],
104
+ num_heads=8,
105
+ head_dim=64,
106
+ num_res_blocks=2,
107
+ attn_scales=[1 / 1, 1 / 2, 1 / 4],
108
+ use_scale_shift_norm=True,
109
+ dropout=0.1,
110
+ temporal_attn_times=1,
111
+ temporal_attention=True,
112
+ use_checkpoint=True,
113
+ use_image_dataset=False,
114
+ use_fps_condition=False,
115
+ use_sim_mask=False,
116
+ training=False,
117
+ inpainting=True,
118
+ **kwargs
119
+ ):
120
+ super(Vid2VidSTUNet, self).__init__(
121
+ in_channels=in_channels,
122
+ out_channels=out_channels,
123
+ dim=dim,
124
+ y_dim=y_dim,
125
+ context_channels=context_channels,
126
+ dim_mult=dim_mult,
127
+ num_heads=num_heads,
128
+ head_dim=head_dim,
129
+ num_res_blocks=num_res_blocks,
130
+ attn_scales=attn_scales,
131
+ use_scale_shift_norm=use_scale_shift_norm,
132
+ dropout=dropout,
133
+ temporal_attn_times=temporal_attn_times,
134
+ temporal_attention=temporal_attention,
135
+ )
136
+ embed_dim = dim * 4
137
+ num_heads = num_heads if num_heads else dim // 32
138
+ self.in_dim = in_channels
139
+ self.dim = dim
140
+ self.y_dim = y_dim
141
+ self.context_dim = context_channels
142
+ self.embed_dim = embed_dim
143
+ self.out_dim = out_channels
144
+ self.dim_mult = dim_mult
145
+ # for temporal attention
146
+ self.num_heads = num_heads
147
+ # for spatial attention
148
+ self.head_dim = head_dim
149
+ self.num_res_blocks = num_res_blocks
150
+ self.attn_scales = attn_scales
151
+ self.use_scale_shift_norm = use_scale_shift_norm
152
+ self.temporal_attn_times = temporal_attn_times
153
+ self.temporal_attention = temporal_attention
154
+ self.inpainting = inpainting
155
+ self.use_fps_condition = use_fps_condition
156
+
157
+ use_linear_in_temporal = False
158
+ transformer_depth = 1
159
+ disabled_sa = False
160
+
161
+ enc_dims = [dim * u for u in [1] + dim_mult]
162
+ dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
163
+ shortcut_dims = []
164
+ scale = 1.0
165
+
166
+ if self.use_fps_condition:
167
+ self.fps_embedding = nn.Sequential(
168
+ nn.Linear(dim, embed_dim),
169
+ nn.Silu(),
170
+ nn.Linear(
171
+ embed_dim,
172
+ embed_dim,
173
+ weight_attr=nn.initializer.Constant(value=0.0),
174
+ bias_attr=nn.initializer.Constant(value=0.0),
175
+ ),
176
+ )
177
+
178
+ # encoder
179
+ self.input_blocks = nn.LayerList()
180
+ init_block = nn.LayerList([nn.Conv2D(self.in_dim, dim, 3, padding=1)])
181
+ if temporal_attention:
182
+ if USE_TEMPORAL_TRANSFORMER:
183
+ init_block.append(
184
+ TemporalTransformer(
185
+ dim,
186
+ num_heads,
187
+ head_dim,
188
+ depth=transformer_depth,
189
+ context_dim=context_channels,
190
+ disable_self_attn=disabled_sa,
191
+ use_linear=use_linear_in_temporal,
192
+ multiply_zero=use_image_dataset,
193
+ )
194
+ )
195
+ else:
196
+ init_block.append(
197
+ TemporalAttentionMultiBlock(
198
+ dim,
199
+ num_heads,
200
+ head_dim,
201
+ rotary_emb=self.rotary_emb,
202
+ temporal_attn_times=temporal_attn_times,
203
+ use_image_dataset=use_image_dataset,
204
+ )
205
+ )
206
+
207
+ self.input_blocks.append(init_block)
208
+ shortcut_dims.append(dim)
209
+ for i, (in_dim, out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])):
210
+ for j in range(num_res_blocks):
211
+ block = nn.LayerList(
212
+ [
213
+ ResBlock(
214
+ in_dim,
215
+ embed_dim,
216
+ dropout,
217
+ out_channels=out_dim,
218
+ use_scale_shift_norm=False,
219
+ use_image_dataset=use_image_dataset,
220
+ )
221
+ ]
222
+ )
223
+ if scale in attn_scales:
224
+ block.append(
225
+ SpatialTransformer(
226
+ out_dim,
227
+ out_dim // head_dim,
228
+ head_dim,
229
+ depth=1,
230
+ context_dim=self.context_dim,
231
+ disable_self_attn=False,
232
+ use_linear=True,
233
+ )
234
+ )
235
+ if self.temporal_attention:
236
+ if USE_TEMPORAL_TRANSFORMER:
237
+ block.append(
238
+ TemporalTransformer(
239
+ out_dim,
240
+ out_dim // head_dim,
241
+ head_dim,
242
+ depth=transformer_depth,
243
+ context_dim=context_channels,
244
+ disable_self_attn=disabled_sa,
245
+ use_linear=use_linear_in_temporal,
246
+ multiply_zero=use_image_dataset,
247
+ )
248
+ )
249
+ else:
250
+ block.append(
251
+ TemporalAttentionMultiBlock(
252
+ out_dim,
253
+ num_heads,
254
+ head_dim,
255
+ rotary_emb=self.rotary_emb,
256
+ use_image_dataset=use_image_dataset,
257
+ use_sim_mask=use_sim_mask,
258
+ temporal_attn_times=temporal_attn_times,
259
+ )
260
+ )
261
+ in_dim = out_dim
262
+ self.input_blocks.append(block)
263
+ shortcut_dims.append(out_dim)
264
+
265
+ # downsample
266
+ if i != len(dim_mult) - 1 and j == num_res_blocks - 1:
267
+ downsample = Downsample(out_dim, True, dims=2, out_channels=out_dim)
268
+ shortcut_dims.append(out_dim)
269
+ scale /= 2.0
270
+ self.input_blocks.append(downsample)
271
+
272
+ # decoder
273
+ self.output_blocks = nn.LayerList()
274
+ for i, (in_dim, out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])):
275
+ for j in range(num_res_blocks + 1):
276
+ block = nn.LayerList(
277
+ [
278
+ ResBlock(
279
+ in_dim + shortcut_dims.pop(),
280
+ embed_dim,
281
+ dropout,
282
+ out_dim,
283
+ use_scale_shift_norm=False,
284
+ use_image_dataset=use_image_dataset,
285
+ )
286
+ ]
287
+ )
288
+ if scale in attn_scales:
289
+ block.append(
290
+ SpatialTransformer(
291
+ out_dim,
292
+ out_dim // head_dim,
293
+ head_dim,
294
+ depth=1,
295
+ context_dim=1024,
296
+ disable_self_attn=False,
297
+ use_linear=True,
298
+ )
299
+ )
300
+
301
+ if self.temporal_attention:
302
+ if USE_TEMPORAL_TRANSFORMER:
303
+ block.append(
304
+ TemporalTransformer(
305
+ out_dim,
306
+ out_dim // head_dim,
307
+ head_dim,
308
+ depth=transformer_depth,
309
+ context_dim=context_channels,
310
+ disable_self_attn=disabled_sa,
311
+ use_linear=use_linear_in_temporal,
312
+ multiply_zero=use_image_dataset,
313
+ )
314
+ )
315
+ else:
316
+ block.append(
317
+ TemporalAttentionMultiBlock(
318
+ out_dim,
319
+ num_heads,
320
+ head_dim,
321
+ rotary_emb=self.rotary_emb,
322
+ use_image_dataset=use_image_dataset,
323
+ use_sim_mask=use_sim_mask,
324
+ temporal_attn_times=temporal_attn_times,
325
+ )
326
+ )
327
+
328
+ in_dim = out_dim
329
+
330
+ # upsample
331
+ if i != len(dim_mult) - 1 and j == num_res_blocks:
332
+ upsample = Upsample(out_dim, True, dims=2, out_channels=out_dim)
333
+ scale *= 2.0
334
+ block.append(upsample)
335
+ self.output_blocks.append(block)
336
+
337
+ def forward(
338
+ self,
339
+ x,
340
+ t,
341
+ y,
342
+ x_lr=None,
343
+ fps=None,
344
+ video_mask=None,
345
+ focus_present_mask=None,
346
+ prob_focus_present=0.0,
347
+ mask_last_frame_num=0,
348
+ return_dict: bool = True,
349
+ **kwargs
350
+ ):
351
+ batch, x_c, x_f, x_h, x_w = x.shape
352
+ device = x.place
353
+ self.batch = batch
354
+
355
+ # image and video joint training, if mask_last_frame_num is set, prob_focus_present will be ignored
356
+ if mask_last_frame_num > 0:
357
+ focus_present_mask = None
358
+ video_mask[-mask_last_frame_num:] = False
359
+ else:
360
+ focus_present_mask = default(
361
+ focus_present_mask, lambda: prob_mask_like((batch,), prob_focus_present, device=device)
362
+ )
363
+
364
+ time_rel_pos_bias = None
365
+
366
+ # embeddings
367
+ e = self.time_embed(sinusoidal_embedding_paddle(t, self.dim))
368
+ context = y
369
+
370
+ # repeat f times for spatial e and context
371
+ e = e.repeat_interleave(repeats=x_f, axis=0)
372
+ context = context.repeat_interleave(repeats=x_f, axis=0)
373
+
374
+ # always in shape (b f) c h w, except for temporal layer
375
+ x = rearrange(x, "b c f h w -> (b f) c h w")
376
+ # encoder
377
+ xs = []
378
+ for block in self.input_blocks:
379
+ x = self._forward_single(block, x, e, context, time_rel_pos_bias, focus_present_mask, video_mask)
380
+ xs.append(x)
381
+
382
+ # middle
383
+ for block in self.middle_block:
384
+ x = self._forward_single(block, x, e, context, time_rel_pos_bias, focus_present_mask, video_mask)
385
+
386
+ # decoder
387
+ for block in self.output_blocks:
388
+ x = paddle.concat([x, xs.pop()], axis=1)
389
+ x = self._forward_single(
390
+ block,
391
+ x,
392
+ e,
393
+ context,
394
+ time_rel_pos_bias,
395
+ focus_present_mask,
396
+ video_mask,
397
+ reference=xs[-1] if len(xs) > 0 else None,
398
+ )
399
+
400
+ # head
401
+ x = self.out(x)
402
+
403
+ # reshape back to (b c f h w)
404
+ sample = rearrange(x, "(b f) c h w -> b c f h w", b=batch)
405
+
406
+ if not return_dict:
407
+ return (sample,)
408
+
409
+ return STUNetOutput(sample=sample)
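
The forward pass above flattens frames into the batch dimension for every spatial block and only restores the `(b, c, f, h, w)` layout at the end. A minimal, weight-free sketch of that layout convention using the same `einops` patterns:

```python
import paddle
from einops import rearrange

b, c, f, h, w = 1, 4, 16, 32, 32
x = paddle.randn([b, c, f, h, w])

# Spatial layers operate on (b*f, c, h, w); temporal layers later regroup frames per sample.
x_flat = rearrange(x, "b c f h w -> (b f) c h w")
assert list(x_flat.shape) == [b * f, c, h, w]

# The output head reverses the flattening before wrapping the result in STUNetOutput.
sample = rearrange(x_flat, "(b f) c h w -> b c f h w", b=b)
assert list(sample.shape) == [b, c, f, h, w]
```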
PaddleMIX/ppdiffusers/ppdiffusers/models/prior_transformer.py ADDED
@@ -0,0 +1,398 @@
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import Dict, Optional, Union
17
+
18
+ import paddle
19
+ import paddle.nn.functional as F
20
+ from paddle import nn
21
+
22
+ from ..configuration_utils import ConfigMixin, register_to_config
23
+ from ..loaders import UNet2DConditionLoadersMixin
24
+ from ..utils import BaseOutput
25
+ from .attention import BasicTransformerBlock
26
+ from .attention_processor import (
27
+ ADDED_KV_ATTENTION_PROCESSORS,
28
+ CROSS_ATTENTION_PROCESSORS,
29
+ AttentionProcessor,
30
+ AttnAddedKVProcessor,
31
+ AttnProcessor,
32
+ )
33
+ from .embeddings import TimestepEmbedding, Timesteps
34
+ from .modeling_utils import ModelMixin
35
+
36
+
37
+ @dataclass
38
+ class PriorTransformerOutput(BaseOutput):
39
+ """
40
+ The output of [`PriorTransformer`].
41
+
42
+ Args:
43
+ predicted_image_embedding (`paddle.Tensor` of shape `(batch_size, embedding_dim)`):
44
+ The predicted CLIP image embedding conditioned on the CLIP text embedding input.
45
+ """
46
+
47
+ predicted_image_embedding: paddle.Tensor
48
+
49
+
50
+ class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
51
+ """
52
+ A Prior Transformer model.
53
+
54
+ Parameters:
55
+ num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention.
56
+ attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
57
+ num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use.
58
+ embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states`
59
+ num_embeddings (`int`, *optional*, defaults to 77):
60
+ The number of embeddings of the model input `hidden_states`
61
+ additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the
62
+ projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings +
63
+ additional_embeddings`.
64
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
65
+ time_embed_act_fn (`str`, *optional*, defaults to 'silu'):
66
+ The activation function to use to create timestep embeddings.
67
+ norm_in_type (`str`, *optional*, defaults to None): The normalization layer to apply on hidden states before
68
+ passing to Transformer blocks. Set it to `None` if normalization is not needed.
69
+ embedding_proj_norm_type (`str`, *optional*, defaults to None):
70
+ The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not
71
+ needed.
72
+ encoder_hid_proj_type (`str`, *optional*, defaults to `linear`):
73
+ The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if
74
+ `encoder_hidden_states` is `None`.
75
+ added_emb_type (`str`, *optional*, defaults to `prd`): Additional embeddings to condition the model.
76
+ Choose from `prd` or `None`. if choose `prd`, it will prepend a token indicating the (quantized) dot
77
+ product between the text embedding and image embedding as proposed in the unclip paper
78
+ https://arxiv.org/abs/2204.06125 If it is `None`, no additional embeddings will be prepended.
79
+ time_embed_dim (`int, *optional*, defaults to None): The dimension of timestep embeddings.
80
+ If None, will be set to `num_attention_heads * attention_head_dim`
81
+ embedding_proj_dim (`int`, *optional*, default to None):
82
+ The dimension of `proj_embedding`. If None, will be set to `embedding_dim`.
83
+ clip_embed_dim (`int`, *optional*, default to None):
84
+ The dimension of the output. If None, will be set to `embedding_dim`.
85
+ """
86
+
87
+ @register_to_config
88
+ def __init__(
89
+ self,
90
+ num_attention_heads: int = 32,
91
+ attention_head_dim: int = 64,
92
+ num_layers: int = 20,
93
+ embedding_dim: int = 768,
94
+ num_embeddings=77,
95
+ additional_embeddings=4,
96
+ dropout: float = 0.0,
97
+ time_embed_act_fn: str = "silu",
98
+ norm_in_type: Optional[str] = None, # layer
99
+ embedding_proj_norm_type: Optional[str] = None, # layer
100
+ encoder_hid_proj_type: Optional[str] = "linear", # linear
101
+ added_emb_type: Optional[str] = "prd", # prd
102
+ time_embed_dim: Optional[int] = None,
103
+ embedding_proj_dim: Optional[int] = None,
104
+ clip_embed_dim: Optional[int] = None,
105
+ ):
106
+ super().__init__()
107
+ self.num_attention_heads = num_attention_heads
108
+ self.attention_head_dim = attention_head_dim
109
+ inner_dim = num_attention_heads * attention_head_dim
110
+ self.additional_embeddings = additional_embeddings
111
+
112
+ time_embed_dim = time_embed_dim or inner_dim
113
+ embedding_proj_dim = embedding_proj_dim or embedding_dim
114
+ clip_embed_dim = clip_embed_dim or embedding_dim
115
+
116
+ self.time_proj = Timesteps(inner_dim, True, 0)
117
+ self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn)
118
+
119
+ self.proj_in = nn.Linear(embedding_dim, inner_dim)
120
+
121
+ if embedding_proj_norm_type is None:
122
+ self.embedding_proj_norm = None
123
+ elif embedding_proj_norm_type == "layer":
124
+ self.embedding_proj_norm = nn.LayerNorm(embedding_proj_dim)
125
+ else:
126
+ raise ValueError(f"unsupported embedding_proj_norm_type: {embedding_proj_norm_type}")
127
+
128
+ self.embedding_proj = nn.Linear(embedding_proj_dim, inner_dim)
129
+
130
+ if encoder_hid_proj_type is None:
131
+ self.encoder_hidden_states_proj = None
132
+ elif encoder_hid_proj_type == "linear":
133
+ self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim)
134
+ else:
135
+ raise ValueError(f"unsupported encoder_hid_proj_type: {encoder_hid_proj_type}")
136
+
137
+ self.positional_embedding = nn.Parameter(paddle.zeros([1, num_embeddings + additional_embeddings, inner_dim]))
138
+
139
+ if added_emb_type == "prd":
140
+ self.prd_embedding = nn.Parameter(paddle.zeros([1, 1, inner_dim]))
141
+ elif added_emb_type is None:
142
+ self.prd_embedding = None
143
+ else:
144
+ raise ValueError(
145
+ f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`."
146
+ )
147
+
148
+ self.transformer_blocks = nn.LayerList(
149
+ [
150
+ BasicTransformerBlock(
151
+ inner_dim,
152
+ num_attention_heads,
153
+ attention_head_dim,
154
+ dropout=dropout,
155
+ activation_fn="gelu",
156
+ attention_bias=True,
157
+ )
158
+ for d in range(num_layers)
159
+ ]
160
+ )
161
+
162
+ if norm_in_type == "layer":
163
+ self.norm_in = nn.LayerNorm(inner_dim)
164
+ elif norm_in_type is None:
165
+ self.norm_in = None
166
+ else:
167
+ raise ValueError(f"Unsupported norm_in_type: {norm_in_type}.")
168
+
169
+ self.norm_out = nn.LayerNorm(inner_dim)
170
+
171
+ self.proj_to_clip_embeddings = nn.Linear(inner_dim, clip_embed_dim)
172
+
173
+ causal_attention_mask = paddle.triu(
174
+ paddle.full([num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], -1e4), 1
175
+ )
176
+ causal_attention_mask = causal_attention_mask[None, ...]
177
+ self.register_buffer("causal_attention_mask", causal_attention_mask, persistable=False)
178
+
179
+ self.clip_mean = nn.Parameter(paddle.zeros([1, clip_embed_dim]))
180
+ self.clip_std = nn.Parameter(paddle.zeros([1, clip_embed_dim]))
181
+
182
+ @property
183
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
184
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
185
+ r"""
186
+ Returns:
187
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
188
+ indexed by its weight name.
189
+ """
190
+ # set recursively
191
+ processors = {}
192
+
193
+ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
194
+ if hasattr(module, "get_processor"):
195
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
196
+
197
+ for sub_name, child in module.named_children():
198
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
199
+
200
+ return processors
201
+
202
+ for name, module in self.named_children():
203
+ fn_recursive_add_processors(name, module, processors)
204
+
205
+ return processors
206
+
207
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
208
+ def set_attn_processor(
209
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
210
+ ):
211
+ r"""
212
+ Sets the attention processor to use to compute attention.
213
+
214
+ Parameters:
215
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
216
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
217
+ for **all** `Attention` layers.
218
+
219
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
220
+ processor. This is strongly recommended when setting trainable attention processors.
221
+
222
+ """
223
+ count = len(self.attn_processors.keys())
224
+
225
+ if isinstance(processor, dict) and len(processor) != count:
226
+ raise ValueError(
227
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
228
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
229
+ )
230
+
231
+ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
232
+ if hasattr(module, "set_processor"):
233
+ if not isinstance(processor, dict):
234
+ module.set_processor(processor, _remove_lora=_remove_lora)
235
+ else:
236
+ module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
237
+
238
+ for sub_name, child in module.named_children():
239
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
240
+
241
+ for name, module in self.named_children():
242
+ fn_recursive_attn_processor(name, module, processor)
243
+
244
+ # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
245
+ def set_default_attn_processor(self):
246
+ """
247
+ Disables custom attention processors and sets the default attention implementation.
248
+ """
249
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
250
+ processor = AttnAddedKVProcessor()
251
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
252
+ processor = AttnProcessor()
253
+ else:
254
+ raise ValueError(
255
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
256
+ )
257
+
258
+ self.set_attn_processor(processor, _remove_lora=True)
259
+
260
+ def forward(
261
+ self,
262
+ hidden_states,
263
+ timestep: Union[paddle.Tensor, float, int],
264
+ proj_embedding: paddle.Tensor,
265
+ encoder_hidden_states: Optional[paddle.Tensor] = None,
266
+ attention_mask: Optional[paddle.Tensor] = None,
267
+ return_dict: bool = True,
268
+ ):
269
+ """
270
+ The [`PriorTransformer`] forward method.
271
+
272
+ Args:
273
+ hidden_states (`paddle.Tensor` of shape `(batch_size, embedding_dim)`):
274
+ The currently predicted image embeddings.
275
+ timestep (`paddle.Tensor`):
276
+ Current denoising step.
277
+ proj_embedding (`paddle.Tensor` of shape `(batch_size, embedding_dim)`):
278
+ Projected embedding vector the denoising process is conditioned on.
279
+ encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, num_embeddings, embedding_dim)`):
280
+ Hidden states of the text embeddings the denoising process is conditioned on.
281
+ attention_mask (`paddle.Tensor` of shape `(batch_size, num_embeddings)`):
282
+ Text mask for the text embeddings.
283
+ return_dict (`bool`, *optional*, defaults to `True`):
284
+ Whether or not to return a [`~models.prior_transformer.PriorTransformerOutput`] instead of a plain
285
+ tuple.
286
+
287
+ Returns:
288
+ [`~models.prior_transformer.PriorTransformerOutput`] or `tuple`:
289
+ If return_dict is True, a [`~models.prior_transformer.PriorTransformerOutput`] is returned, otherwise a
290
+ tuple is returned where the first element is the sample tensor.
291
+ """
292
+ # TODO junnyu, add this to support pure fp16
293
+ hidden_states = hidden_states.cast(self.dtype)
294
+ batch_size = hidden_states.shape[0]
295
+
296
+ timesteps = timestep
297
+ if not paddle.is_tensor(timesteps):
298
+ timesteps = paddle.to_tensor([timesteps], dtype=paddle.int64)
299
+ elif paddle.is_tensor(timesteps) and len(timesteps.shape) == 0:
300
+ timesteps = timesteps[None]
301
+
302
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
303
+ timesteps = timesteps * paddle.ones((batch_size,), dtype=timesteps.dtype)
304
+
305
+ timesteps_projected = self.time_proj(timesteps)
306
+
307
+ # timesteps does not contain any weights and will always return f32 tensors
308
+ # but time_embedding might be fp16, so we need to cast here.
309
+ timesteps_projected = timesteps_projected.cast(hidden_states.dtype)
310
+ time_embeddings = self.time_embedding(timesteps_projected)
311
+
312
+ if self.embedding_proj_norm is not None:
313
+ proj_embedding = self.embedding_proj_norm(proj_embedding)
314
+
315
+ proj_embeddings = self.embedding_proj(proj_embedding)
316
+ if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None:
317
+ encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states)
318
+ elif self.encoder_hidden_states_proj is not None and encoder_hidden_states is None:
319
+ raise ValueError("`encoder_hidden_states_proj` requires `encoder_hidden_states` to be set")
320
+
321
+ hidden_states = self.proj_in(hidden_states)
322
+
323
+ positional_embeddings = self.positional_embedding.cast(hidden_states.dtype)
324
+
325
+ additional_embeds = []
326
+ additional_embeddings_len = 0
327
+
328
+ if encoder_hidden_states is not None:
329
+ additional_embeds.append(encoder_hidden_states)
330
+ additional_embeddings_len += encoder_hidden_states.shape[1]
331
+
332
+ if len(proj_embeddings.shape) == 2:
333
+ proj_embeddings = proj_embeddings[:, None, :]
334
+
335
+ if len(hidden_states.shape) == 2:
336
+ hidden_states = hidden_states[:, None, :]
337
+
338
+ additional_embeds = additional_embeds + [
339
+ proj_embeddings,
340
+ time_embeddings[:, None, :],
341
+ hidden_states,
342
+ ]
343
+
344
+ if self.prd_embedding is not None:
345
+ prd_embedding = self.prd_embedding.cast(hidden_states.dtype).expand([batch_size, -1, -1])
346
+ additional_embeds.append(prd_embedding)
347
+
348
+ hidden_states = paddle.concat(
349
+ additional_embeds,
350
+ axis=1,
351
+ )
352
+
353
+ # Allow positional_embedding to not include the `additional_embeddings` and instead pad it with zeros for these additional tokens
354
+ additional_embeddings_len = additional_embeddings_len + proj_embeddings.shape[1] + 1
355
+ if positional_embeddings.shape[1] < hidden_states.shape[1]:
356
+ positional_embeddings = F.pad(
357
+ positional_embeddings,
358
+ (
359
+ additional_embeddings_len,
360
+ self.prd_embedding.shape[1] if self.prd_embedding is not None else 0,
361
+ ),
362
+ value=0.0,
363
+ data_format="NLC",
364
+ )
365
+
366
+ hidden_states = hidden_states + positional_embeddings
367
+
368
+ if attention_mask is not None:
369
+ attention_mask = (1 - attention_mask.cast(hidden_states.dtype)) * -1e4
370
+ attention_mask = F.pad(
371
+ attention_mask.unsqueeze(0), (0, self.additional_embeddings), value=0.0, data_format="NCL"
372
+ ).squeeze(0)
373
+ attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).cast(hidden_states.dtype)
374
+ attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, axis=0)
375
+
376
+ if self.norm_in is not None:
377
+ hidden_states = self.norm_in(hidden_states)
378
+
379
+ for block in self.transformer_blocks:
380
+ hidden_states = block(hidden_states, attention_mask=attention_mask)
381
+
382
+ hidden_states = self.norm_out(hidden_states)
383
+
384
+ if self.prd_embedding is not None:
385
+ hidden_states = hidden_states[:, -1]
386
+ else:
387
+ hidden_states = hidden_states[:, additional_embeddings_len:]
388
+
389
+ predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states)
390
+
391
+ if not return_dict:
392
+ return (predicted_image_embedding,)
393
+
394
+ return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding)
395
+
396
+ def post_process_latents(self, prior_latents):
397
+ prior_latents = (prior_latents * self.clip_std) + self.clip_mean
398
+ return prior_latents
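A minimal usage sketch for the prior defined above (not part of the committed diff): the sizes, timestep, and tensors below are illustrative assumptions chosen so the shapes line up with this forward pass, and the constructor arguments mirror the config fields the code reads (num_attention_heads, attention_head_dim, num_layers, embedding_dim, num_embeddings).

import paddle
from ppdiffusers.models.prior_transformer import PriorTransformer

# Tiny illustrative config: inner_dim = 2 * 4 = 8, text length = num_embeddings = 4 (assumed values).
prior = PriorTransformer(
    num_attention_heads=2, attention_head_dim=4, num_layers=2, embedding_dim=8, num_embeddings=4
)

batch = 2
latents = paddle.randn([batch, 8])             # current image-embedding estimate (hidden_states)
proj = paddle.randn([batch, 8])                # pooled text embedding (proj_embedding)
text_states = paddle.randn([batch, 4, 8])      # per-token text hidden states
mask = paddle.ones([batch, 4], dtype="int64")  # 1 = keep token, 0 = drop token

out = prior(latents, timestep=10, proj_embedding=proj,
            encoder_hidden_states=text_states, attention_mask=mask)
pred = out.predicted_image_embedding           # [batch, clip_embed_dim]
image_embedding = prior.post_process_latents(pred)  # undoes the clip_mean / clip_std normalization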
PaddleMIX/ppdiffusers/ppdiffusers/models/simplified_sd3.py ADDED
@@ -0,0 +1,216 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+ import paddle.nn.functional as F
17
+ from paddle import nn
18
+ from paddle.distributed.fleet.meta_parallel import ColumnParallelLinear as CPLinear
19
+ from paddle.distributed.fleet.meta_parallel import RowParallelLinear as RPLinear
20
+ from paddle.nn import LayerList as LayerList
21
+
22
+
23
+ class SimplifiedSD3(nn.Layer):
24
+ def __init__(self, num_layers: int, dim: int, num_attention_heads: int, attention_head_dim: int, mp_degree: int):
25
+ super().__init__()
26
+ self.num_layers = num_layers
27
+ self.dim = dim
28
+ self.head_dim = 64
29
+
30
+ self.mp_degree = mp_degree
31
+
32
+ self.silu = nn.Silu()
33
+ self.linear1 = LayerList([nn.Linear(self.dim, 6 * self.dim) for i in range(num_layers)])
34
+ self.linear_context = LayerList(
35
+ [nn.Linear(self.dim, (6 if i < num_layers - 1 else 2) * self.dim) for i in range(num_layers)]
36
+ )
37
+ self.norm_last_context = nn.LayerNorm(self.dim, epsilon=1e-6, weight_attr=False, bias_attr=True)
38
+
39
+ if mp_degree > 1:
40
+ self.qkv_mp = LayerList(
41
+ [CPLinear(self.dim, 3 * self.dim, gather_output=False, has_bias=True) for i in range(num_layers)]
42
+ )
43
+ self.eqkv_mp = LayerList(
44
+ [CPLinear(self.dim, 3 * self.dim, gather_output=False, has_bias=True) for i in range(num_layers)]
45
+ )
46
+ self.to_out_linear_mp = LayerList(
47
+ [RPLinear(self.dim, self.dim, input_is_parallel=True, has_bias=True) for i in range(num_layers)]
48
+ )
49
+ # When using Model Parallel, for the symmetry of GEMM, we change num_layers-1 here to num_layers, which has no effect on the results.
50
+ self.to_add_out_linear_mp = LayerList(
51
+ [RPLinear(self.dim, self.dim, input_is_parallel=True, has_bias=True) for i in range(num_layers)]
52
+ )
53
+
54
+ self.ffn1_mp = LayerList(
55
+ [CPLinear(self.dim, 4 * self.dim, gather_output=False, has_bias=True) for i in range(num_layers)]
56
+ )
57
+ self.ffn2_mp = LayerList(
58
+ [RPLinear(self.dim * 4, self.dim, input_is_parallel=True, has_bias=True) for i in range(num_layers)]
59
+ )
60
+ self.ffn1_context_mp = LayerList(
61
+ [CPLinear(self.dim, 4 * self.dim, gather_output=False, has_bias=True) for i in range(num_layers - 1)]
62
+ )
63
+ self.ffn2_context_mp = LayerList(
64
+ [
65
+ RPLinear(self.dim * 4, self.dim, input_is_parallel=True, has_bias=True)
66
+ for i in range(num_layers - 1)
67
+ ]
68
+ )
69
+ else:
70
+ self.qkv = LayerList([nn.Linear(self.dim, self.dim * 3) for i in range(num_layers)])
71
+ self.eqkv = LayerList([nn.Linear(self.dim, self.dim * 3) for i in range(num_layers)])
72
+ self.to_out_linear = LayerList([nn.Linear(self.dim, self.dim) for i in range(num_layers)])
73
+ # When using Model Parallel, for the symmetry of GEMM, we change num_layers-1 here to num_layers, which has no effect on the results.
74
+ self.to_add_out_linear = LayerList([nn.Linear(self.dim, self.dim) for i in range(num_layers)])
75
+
76
+ self.ffn1 = LayerList([nn.Linear(self.dim, self.dim * 4) for i in range(num_layers)])
77
+ self.ffn2 = LayerList([nn.Linear(self.dim * 4, self.dim) for i in range(num_layers)])
78
+ self.ffn1_context = LayerList([nn.Linear(self.dim, self.dim * 4) for i in range(num_layers - 1)])
79
+ self.ffn2_context = LayerList([nn.Linear(self.dim * 4, self.dim) for i in range(num_layers - 1)])
80
+
81
+ def forward(self, hidden_states, encoder_hidden_states, temb):
82
+ print("--------------------this is simplified_sd3------------------------")
83
+ temb_silu = self.silu(temb)
84
+
85
+ last_ffn_output = None
86
+ last_hidden_states = None
87
+ last_gate_mlp = None
88
+
89
+ last_context_ffn_output = None
90
+ last_context_hidden_states = None
91
+ last_context_gate_mlp = None
92
+
93
+ seq1 = hidden_states.shape[1]
94
+ seq2 = encoder_hidden_states.shape[1]
95
+
96
+ for i in range(self.num_layers):
97
+ context_pre_only = i == self.num_layers - 1
98
+
99
+ emb = self.linear1[i](temb_silu)
100
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, axis=1)
101
+
102
+ import paddlemix
103
+
104
+ if last_ffn_output is None:
105
+ norm_hidden_states = paddlemix.triton_ops.adaptive_layer_norm(
106
+ hidden_states, scale_msa, shift_msa, epsilon=1e-06
107
+ )
108
+ else:
109
+ hidden_states, norm_hidden_states = paddlemix.triton_ops.fused_adaLN_scale_residual(
110
+ last_hidden_states, last_ffn_output, last_gate_mlp, scale_msa, shift_msa, epsilon=1e-06
111
+ )
112
+
113
+ emb = self.linear_context[i](temb_silu)
114
+ if not context_pre_only:
115
+ shift_msa, scale_msa, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = emb.chunk(6, axis=1)
116
+ if last_context_ffn_output is None:
117
+ norm_encoder_hidden_states = paddlemix.triton_ops.adaptive_layer_norm(
118
+ encoder_hidden_states, scale_msa, shift_msa, epsilon=1e-06
119
+ )
120
+ else:
121
+ (
122
+ encoder_hidden_states,
123
+ norm_encoder_hidden_states,
124
+ ) = paddlemix.triton_ops.fused_adaLN_scale_residual(
125
+ last_context_hidden_states,
126
+ last_context_ffn_output,
127
+ last_context_gate_mlp,
128
+ scale_msa,
129
+ shift_msa,
130
+ epsilon=1e-06,
131
+ )
132
+ else:
133
+ # the last layer.
134
+ scale, shift = paddle.chunk(emb, 2, axis=1)
135
+ (encoder_hidden_states, norm_encoder_hidden_states,) = paddlemix.triton_ops.fused_adaLN_scale_residual(
136
+ last_context_hidden_states,
137
+ last_context_ffn_output,
138
+ last_context_gate_mlp,
139
+ scale,
140
+ shift,
141
+ epsilon=1e-06,
142
+ )
143
+
144
+ if self.mp_degree > 1:
145
+ qkv = self.qkv_mp[i](norm_hidden_states)
146
+ eqkv = self.eqkv_mp[i](norm_encoder_hidden_states)
147
+
148
+ else:
149
+ qkv = self.qkv[i](norm_hidden_states)
150
+ eqkv = self.eqkv[i](norm_encoder_hidden_states)
151
+
152
+ q, k, v = paddlemix.triton_ops.split_concat(qkv, eqkv)
153
+
154
+ bs = hidden_states.shape[0]
155
+ head_nums = q.shape[2] // self.head_dim
156
+ q = q.reshape([bs, -1, head_nums, self.head_dim])
157
+ k = k.reshape([bs, -1, head_nums, self.head_dim])
158
+ v = v.reshape([bs, -1, head_nums, self.head_dim])
159
+
160
+ norm_hidden_states1 = F.scaled_dot_product_attention_(q, k, v, dropout_p=0.0, is_causal=False)
161
+ norm_hidden_states1 = norm_hidden_states1.reshape([bs, -1, head_nums * self.head_dim])
162
+ attn_output, context_attn_output = paddle.split(norm_hidden_states1, num_or_sections=[seq1, seq2], axis=1)
163
+
164
+ # attn_output, context_attn_output = paddlemix.triton_ops.triton_split(
165
+ # norm_hidden_states1, num_or_sections=[1024, 154], axis=1
166
+ # )
167
+
168
+ if self.mp_degree > 1:
169
+ attn_output = self.to_out_linear_mp[i](attn_output)
170
+ context_attn_output = self.to_add_out_linear_mp[i](context_attn_output)
171
+ else:
172
+ attn_output = self.to_out_linear[i](attn_output)
173
+ context_attn_output = self.to_add_out_linear[i](context_attn_output)
174
+
175
+ hidden_states, norm_hidden_states = paddlemix.triton_ops.fused_adaLN_scale_residual(
176
+ hidden_states, attn_output, gate_msa, scale_mlp, shift_mlp, epsilon=1e-06
177
+ )
178
+
179
+ # ffn1
180
+ if self.mp_degree > 1:
181
+ ffn_output = self.ffn1_mp[i](norm_hidden_states)
182
+ ffn_output = F.gelu(ffn_output, approximate=True)
183
+ ffn_output = self.ffn2_mp[i](ffn_output)
184
+ else:
185
+ ffn_output = self.ffn1[i](norm_hidden_states)
186
+ ffn_output = F.gelu(ffn_output, approximate=True)
187
+ ffn_output = self.ffn2[i](ffn_output)
188
+
189
+ if context_pre_only:
190
+ ffn_output = gate_mlp.unsqueeze(1) * ffn_output
191
+ hidden_states = hidden_states + ffn_output
192
+ else:
193
+ last_ffn_output = ffn_output
194
+ last_hidden_states = hidden_states
195
+ last_gate_mlp = gate_mlp
196
+
197
+ # ffn2
198
+ if not context_pre_only:
199
+ (encoder_hidden_states, norm_encoder_hidden_states,) = paddlemix.triton_ops.fused_adaLN_scale_residual(
200
+ encoder_hidden_states, context_attn_output, c_gate_msa, c_scale_mlp, c_shift_mlp, epsilon=1e-06
201
+ )
202
+
203
+ if self.mp_degree > 1:
204
+ context_ffn_output = self.ffn1_context_mp[i](norm_encoder_hidden_states)
205
+ context_ffn_output = F.gelu(context_ffn_output, approximate=True)
206
+ context_ffn_output = self.ffn2_context_mp[i](context_ffn_output)
207
+ else:
208
+ context_ffn_output = self.ffn1_context[i](norm_encoder_hidden_states)
209
+ context_ffn_output = F.gelu(context_ffn_output, approximate=True)
210
+ context_ffn_output = self.ffn2_context[i](context_ffn_output)
211
+
212
+ last_context_ffn_output = context_ffn_output
213
+ last_context_hidden_states = encoder_hidden_states
214
+ last_context_gate_mlp = c_gate_mlp
215
+
216
+ return hidden_states
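A hedged construction sketch for the fused block stack above (not part of the diff). Only construction and the expected shapes are shown, because forward() relies on the paddlemix.triton_ops custom GPU kernels; the layer count and width below are assumptions loosely modeled on SD3-sized configs.

from ppdiffusers.models.simplified_sd3 import SimplifiedSD3

dim = 24 * 64  # 24 heads of the hard-coded head_dim = 64; attention_head_dim is accepted but not stored
block = SimplifiedSD3(num_layers=2, dim=dim, num_attention_heads=24, attention_head_dim=64, mp_degree=1)

# Expected inputs, inferred from the forward() code above (shapes are assumptions):
#   hidden_states:         [batch, image_seq_len, dim]   latent patch tokens
#   encoder_hidden_states: [batch, text_seq_len, dim]    projected text tokens
#   temb:                  [batch, dim]                  pooled timestep/text embedding
# forward() returns only the updated image stream: [batch, image_seq_len, dim].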
PaddleMIX/ppdiffusers/ppdiffusers/models/transformer_2d.py ADDED
@@ -0,0 +1,538 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ from dataclasses import dataclass
16
+ from typing import Any, Dict, Optional
17
+
18
+ import paddle
19
+ import paddle.nn.functional as F
20
+ from paddle import nn
21
+ from paddle.distributed.fleet.utils import recompute
22
+
23
+ from ..configuration_utils import ConfigMixin, register_to_config
24
+ from ..models.embeddings import ImagePositionalEmbeddings
25
+ from ..utils import (
26
+ USE_PEFT_BACKEND,
27
+ BaseOutput,
28
+ deprecate,
29
+ recompute_use_reentrant,
30
+ use_old_recompute,
31
+ )
32
+ from .attention import BasicTransformerBlock
33
+ from .embeddings import CaptionProjection, PatchEmbed
34
+ from .lora import LoRACompatibleConv, LoRACompatibleLinear
35
+ from .modeling_utils import ModelMixin
36
+ from .normalization import AdaLayerNormSingle
37
+ from .simplified_facebook_dit import SimplifiedFacebookDIT
38
+
39
+
40
+ @dataclass
41
+ class Transformer2DModelOutput(BaseOutput):
42
+ """
43
+ The output of [`Transformer2DModel`].
44
+
45
+ Args:
46
+ sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
47
+ The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
48
+ distributions for the unnoised latent pixels.
49
+ """
50
+
51
+ sample: paddle.Tensor
52
+
53
+
54
+ class Transformer2DModel(ModelMixin, ConfigMixin):
55
+ """
56
+ A 2D Transformer model for image-like data.
57
+
58
+ Parameters:
59
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
60
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
61
+ in_channels (`int`, *optional*):
62
+ The number of channels in the input and output (specify if the input is **continuous**).
63
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
64
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
65
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
66
+ sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
67
+ This is fixed during training since it is used to learn a number of position embeddings.
68
+ num_vector_embeds (`int`, *optional*):
69
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
70
+ Includes the class for the masked latent pixel.
71
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
72
+ num_embeds_ada_norm ( `int`, *optional*):
73
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
74
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
75
+ added to the hidden states.
76
+
77
+ During inference, you can denoise for up to, but not more than, `num_embeds_ada_norm` steps.
78
+ attention_bias (`bool`, *optional*):
79
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
80
+ """
81
+
82
+ _supports_gradient_checkpointing = True
83
+
84
+ @register_to_config
85
+ def __init__(
86
+ self,
87
+ num_attention_heads: int = 16,
88
+ attention_head_dim: int = 88,
89
+ in_channels: Optional[int] = None,
90
+ out_channels: Optional[int] = None,
91
+ num_layers: int = 1,
92
+ dropout: float = 0.0,
93
+ norm_num_groups: int = 32,
94
+ cross_attention_dim: Optional[int] = None,
95
+ attention_bias: bool = False,
96
+ sample_size: Optional[int] = None,
97
+ num_vector_embeds: Optional[int] = None,
98
+ patch_size: Optional[int] = None,
99
+ activation_fn: str = "geglu",
100
+ num_embeds_ada_norm: Optional[int] = None,
101
+ use_linear_projection: bool = False,
102
+ only_cross_attention: bool = False,
103
+ double_self_attention: bool = False,
104
+ upcast_attention: bool = False,
105
+ norm_type: str = "layer_norm",
106
+ norm_elementwise_affine: bool = True,
107
+ norm_eps: float = 1e-5,
108
+ attention_type: str = "default",
109
+ caption_channels: int = None,
110
+ data_format: str = "NCHW",
111
+ ):
112
+ super().__init__()
113
+ self.use_linear_projection = use_linear_projection
114
+ self.num_attention_heads = num_attention_heads
115
+ self.attention_head_dim = attention_head_dim
116
+ self.inner_dim = inner_dim = num_attention_heads * attention_head_dim
117
+ self.data_format = data_format
118
+
119
+ self.inference_optimize = os.getenv("INFERENCE_OPTIMIZE") == "True"
120
+
121
+ conv_cls = nn.Conv2D if USE_PEFT_BACKEND else LoRACompatibleConv
122
+ linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear
123
+
124
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
125
+ # Define whether input is continuous or discrete depending on configuration
126
+ self.is_input_continuous = (in_channels is not None) and (patch_size is None)
127
+ self.is_input_vectorized = num_vector_embeds is not None
128
+ self.is_input_patches = in_channels is not None and patch_size is not None
129
+
130
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
131
+ deprecation_message = (
133
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
134
+ " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
135
+ " Please make sure to update the config accordingly, as leaving `norm_type` unset might lead to incorrect"
135
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
136
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
137
+ )
138
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
139
+ norm_type = "ada_norm"
140
+
141
+ if self.is_input_continuous and self.is_input_vectorized:
142
+ raise ValueError(
143
+ f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
144
+ " sure that either `in_channels` or `num_vector_embeds` is None."
145
+ )
146
+ elif self.is_input_vectorized and self.is_input_patches:
147
+ raise ValueError(
148
+ f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
149
+ " sure that either `num_vector_embeds` or `num_patches` is None."
150
+ )
151
+ elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
152
+ raise ValueError(
153
+ f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
154
+ f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
155
+ )
156
+
157
+ # 2. Define input layers
158
+ if self.is_input_continuous:
159
+ self.in_channels = in_channels
160
+
161
+ self.norm = nn.GroupNorm(
162
+ num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-6, data_format=data_format
163
+ )
164
+ if use_linear_projection:
165
+ self.proj_in = linear_cls(in_channels, inner_dim)
166
+ else:
167
+ self.proj_in = conv_cls(
168
+ in_channels, inner_dim, kernel_size=1, stride=1, padding=0, data_format=data_format
169
+ )
170
+ elif self.is_input_vectorized:
171
+ assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
172
+ assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
173
+
174
+ self.height = sample_size
175
+ self.width = sample_size
176
+ self.num_vector_embeds = num_vector_embeds
177
+ self.num_latent_pixels = self.height * self.width
178
+
179
+ self.latent_image_embedding = ImagePositionalEmbeddings(
180
+ num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
181
+ )
182
+ elif self.is_input_patches:
183
+ assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
184
+
185
+ self.height = sample_size
186
+ self.width = sample_size
187
+
188
+ self.patch_size = patch_size
189
+ interpolation_scale = self.config.sample_size // 64 # => 64 (= 512 pixart) has interpolation scale 1
190
+ interpolation_scale = max(interpolation_scale, 1)
191
+ self.pos_embed = PatchEmbed(
192
+ height=sample_size,
193
+ width=sample_size,
194
+ patch_size=patch_size,
195
+ in_channels=in_channels,
196
+ embed_dim=inner_dim,
197
+ interpolation_scale=interpolation_scale,
198
+ data_format=data_format,
199
+ )
200
+
201
+ # 3. Define transformers blocks
202
+ self.transformer_blocks = nn.LayerList(
203
+ [
204
+ BasicTransformerBlock(
205
+ inner_dim,
206
+ num_attention_heads,
207
+ attention_head_dim,
208
+ dropout=dropout,
209
+ cross_attention_dim=cross_attention_dim,
210
+ activation_fn=activation_fn,
211
+ num_embeds_ada_norm=num_embeds_ada_norm,
212
+ attention_bias=attention_bias,
213
+ only_cross_attention=only_cross_attention,
214
+ double_self_attention=double_self_attention,
215
+ upcast_attention=upcast_attention,
216
+ norm_type=norm_type,
217
+ norm_elementwise_affine=norm_elementwise_affine,
218
+ norm_eps=norm_eps,
219
+ attention_type=attention_type,
220
+ )
221
+ for d in range(num_layers)
222
+ ]
223
+ )
224
+ if self.inference_optimize:
225
+ self.simplified_facebookdit = SimplifiedFacebookDIT(
226
+ num_layers, inner_dim, num_attention_heads, attention_head_dim
227
+ )
228
+
229
+ # 4. Define output layers
230
+ self.out_channels = in_channels if out_channels is None else out_channels
231
+ if self.is_input_continuous:
232
+ # TODO: should use out_channels for continuous projections
233
+ if use_linear_projection:
234
+ self.proj_out = linear_cls(inner_dim, in_channels)
235
+ else:
236
+ self.proj_out = conv_cls(
237
+ inner_dim, in_channels, kernel_size=1, stride=1, padding=0, data_format=data_format
238
+ )
239
+ elif self.is_input_vectorized:
240
+ self.norm_out = nn.LayerNorm(inner_dim)
241
+ self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
242
+ elif self.is_input_patches and norm_type != "ada_norm_single":
243
+ norm_elementwise_affine_kwargs = dict(weight_attr=False, bias_attr=False)
244
+ self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_elementwise_affine_kwargs)
245
+ self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
246
+ self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
247
+ elif self.is_input_patches and norm_type == "ada_norm_single":
248
+ norm_elementwise_affine_kwargs = dict(weight_attr=False, bias_attr=False)
249
+ self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_elementwise_affine_kwargs)
250
+ self.scale_shift_table = nn.Parameter(paddle.randn([2, inner_dim]) / inner_dim**0.5)
251
+ self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
252
+
253
+ # 5. PixArt-Alpha blocks.
254
+ self.adaln_single = None
255
+ self.use_additional_conditions = False
256
+ if norm_type == "ada_norm_single":
257
+ self.use_additional_conditions = self.config.sample_size == 128
258
+ # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
259
+ # additional conditions until we find better name
260
+ self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)
261
+
262
+ self.caption_projection = None
263
+ if caption_channels is not None:
264
+ self.caption_projection = CaptionProjection(in_features=caption_channels, hidden_size=inner_dim)
265
+
266
+ self.gradient_checkpointing = False
267
+
268
+ def _set_gradient_checkpointing(self, module, value=False):
269
+ if hasattr(module, "gradient_checkpointing"):
270
+ module.gradient_checkpointing = value
271
+
272
+ def forward(
273
+ self,
274
+ hidden_states: paddle.Tensor,
275
+ encoder_hidden_states: Optional[paddle.Tensor] = None,
276
+ timestep: Optional[paddle.Tensor] = None,
277
+ added_cond_kwargs: Dict[str, paddle.Tensor] = None,
278
+ class_labels: Optional[paddle.Tensor] = None,
279
+ cross_attention_kwargs: Dict[str, Any] = None,
280
+ attention_mask: Optional[paddle.Tensor] = None,
281
+ encoder_attention_mask: Optional[paddle.Tensor] = None,
282
+ return_dict: bool = True,
283
+ ):
284
+ """
285
+ The [`Transformer2DModel`] forward method.
286
+
287
+ Args:
288
+ hidden_states (`paddle.Tensor` of shape `(batch size, num latent pixels)` if discrete, `paddle.Tensor` of shape `(batch size, channel, height, width)` if continuous):
289
+ Input `hidden_states`.
290
+ encoder_hidden_states ( `paddle.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
291
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
292
+ self-attention.
293
+ timestep ( `paddle.Tensor`, *optional*):
294
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
295
+ class_labels ( `paddle.Tensor` of shape `(batch size, num classes)`, *optional*):
296
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
297
+ `AdaLayerZeroNorm`.
298
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
299
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
300
+ `self.processor` in
301
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
302
+ attention_mask ( `paddle.Tensor`, *optional*):
303
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
304
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
305
+ negative values to the attention scores corresponding to "discard" tokens.
306
+ encoder_attention_mask ( `paddle.Tensor`, *optional*):
307
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
308
+
309
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
310
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
311
+
312
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
313
+ above. This bias will be added to the cross-attention scores.
314
+ return_dict (`bool`, *optional*, defaults to `True`):
315
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
316
+ tuple.
317
+
318
+ Returns:
319
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
320
+ `tuple` where the first element is the sample tensor.
321
+ """
322
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
323
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
324
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
325
+ # expects mask of shape:
326
+ # [batch, key_tokens]
327
+ # adds singleton query_tokens dimension:
328
+ # [batch, 1, key_tokens]
329
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
330
+ # [batch, query_tokens, heads, key_tokens] (e.g. paddle sdp or ppxformers attn)
331
+ # [batch, heads, query_tokens, key_tokens] (e.g. classic attn)
332
+ # pure fp16
333
+ hidden_states = hidden_states.cast(self.dtype)
334
+ if attention_mask is not None and attention_mask.ndim == 2:
335
+ # assume that mask is expressed as:
336
+ # (1 = keep, 0 = discard)
337
+ # convert mask into a bias that can be added to attention scores:
338
+ # (keep = +0, discard = -10000.0)
339
+ attention_mask = (1 - attention_mask.cast(hidden_states.dtype)) * -10000.0
340
+ attention_mask = attention_mask.unsqueeze(1)
341
+
342
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
343
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
344
+ encoder_attention_mask = (1 - encoder_attention_mask.cast(hidden_states.dtype)) * -10000.0
345
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
346
+
347
+ # Retrieve lora scale.
348
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
349
+
350
+ # 1. Input
351
+ if self.is_input_continuous:
352
+ if self.data_format == "NCHW":
353
+ # NOTE(zhoukangkang): shaped this way so Paddle Inference can hit the elementwiseadd_transpose_pass.
354
+ batch, _, height, width = hidden_states.shape
355
+ else:
356
+ batch, height, width, _ = hidden_states.shape
357
+ residual = hidden_states
358
+ shape = paddle.shape(hidden_states)
359
+ hidden_states = self.norm(hidden_states)
360
+ if not self.use_linear_projection:
361
+ hidden_states = (
362
+ self.proj_in(hidden_states, scale=lora_scale)
363
+ if not USE_PEFT_BACKEND
364
+ else self.proj_in(hidden_states)
365
+ )
366
+ if self.data_format == "NCHW":
367
+ hidden_states = hidden_states.transpose([0, 2, 3, 1]).flatten(1, 2)
368
+ else:
369
+ hidden_states = hidden_states.flatten(1, 2)
370
+ else:
371
+ if self.data_format == "NCHW":
372
+ hidden_states = hidden_states.transpose([0, 2, 3, 1]).flatten(1, 2)
373
+ else:
374
+ hidden_states = hidden_states.flatten(1, 2)
375
+ hidden_states = (
376
+ self.proj_in(hidden_states, scale=lora_scale)
377
+ if not USE_PEFT_BACKEND
378
+ else self.proj_in(hidden_states)
379
+ )
380
+
381
+ elif self.is_input_vectorized:
382
+ hidden_states = self.latent_image_embedding(hidden_states.cast("int64")) # NEW ADD
383
+ elif self.is_input_patches:
384
+ height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
385
+ hidden_states = self.pos_embed(hidden_states)
386
+
387
+ if self.adaln_single is not None:
388
+ if self.use_additional_conditions and added_cond_kwargs is None:
389
+ raise ValueError(
390
+ "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
391
+ )
392
+ batch_size = hidden_states.shape[0]
393
+ timestep, embedded_timestep = self.adaln_single(
394
+ timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
395
+ )
396
+
397
+ # 2. Blocks
398
+ if self.caption_projection is not None:
399
+ batch_size = hidden_states.shape[0]
400
+ encoder_hidden_states = self.caption_projection(encoder_hidden_states)
401
+ encoder_hidden_states = encoder_hidden_states.reshape([batch_size, -1, hidden_states.shape[-1]])
402
+
403
+ if self.inference_optimize:
404
+ hidden_states = self.simplified_facebookdit(hidden_states, timestep, class_labels)
405
+ else:
406
+ for block in self.transformer_blocks:
407
+ if self.gradient_checkpointing and not hidden_states.stop_gradient and not use_old_recompute():
408
+
409
+ def create_custom_forward(module, return_dict=None):
410
+ def custom_forward(*inputs):
411
+ if return_dict is not None:
412
+ return module(*inputs, return_dict=return_dict)
413
+ else:
414
+ return module(*inputs)
415
+
416
+ return custom_forward
417
+
418
+ ckpt_kwargs = {} if recompute_use_reentrant() else {"use_reentrant": False}
419
+ hidden_states = recompute(
420
+ create_custom_forward(block),
421
+ hidden_states,
422
+ attention_mask,
423
+ encoder_hidden_states,
424
+ encoder_attention_mask,
425
+ timestep,
426
+ cross_attention_kwargs,
427
+ class_labels,
428
+ **ckpt_kwargs,
429
+ )
430
+ else:
431
+ hidden_states = block(
432
+ hidden_states,
433
+ attention_mask=attention_mask,
434
+ encoder_hidden_states=encoder_hidden_states,
435
+ encoder_attention_mask=encoder_attention_mask,
436
+ timestep=timestep,
437
+ cross_attention_kwargs=cross_attention_kwargs,
438
+ class_labels=class_labels,
439
+ )
440
+
441
+ # 3. Output
442
+ if self.is_input_continuous:
443
+ if not self.use_linear_projection:
444
+ if self.data_format == "NCHW":
445
+ hidden_states = hidden_states.reshape([shape[0], shape[2], shape[3], self.inner_dim])
446
+ else:
447
+ hidden_states = hidden_states.reshape([shape[0], shape[1], shape[2], self.inner_dim])
448
+ if self.data_format == "NCHW":
449
+ hidden_states = hidden_states.transpose([0, 3, 1, 2])
450
+ hidden_states = (
451
+ self.proj_out(hidden_states, scale=lora_scale)
452
+ if not USE_PEFT_BACKEND
453
+ else self.proj_out(hidden_states)
454
+ )
455
+ else:
456
+ hidden_states = (
457
+ self.proj_out(hidden_states, scale=lora_scale)
458
+ if not USE_PEFT_BACKEND
459
+ else self.proj_out(hidden_states)
460
+ )
461
+ if self.data_format == "NCHW":
462
+ hidden_states = hidden_states.reshape([shape[0], shape[2], shape[3], self.inner_dim])
463
+ else:
464
+ hidden_states = hidden_states.reshape([shape[0], shape[1], shape[2], self.inner_dim])
465
+ if self.data_format == "NCHW":
466
+ hidden_states = hidden_states.transpose([0, 3, 1, 2])
467
+
468
+ output = hidden_states + residual
469
+ elif self.is_input_vectorized:
470
+ hidden_states = self.norm_out(hidden_states)
471
+ logits = self.out(hidden_states)
472
+ # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
473
+ logits = logits.transpose([0, 2, 1])
474
+
475
+ # log(p(x_0))
476
+ output = F.log_softmax(logits.cast("float64"), axis=1).cast("float32")
477
+
478
+ if self.is_input_patches:
479
+ if self.config.norm_type != "ada_norm_single":
480
+ conditioning = self.transformer_blocks[0].norm1.emb(
481
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
482
+ )
483
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, axis=1)
484
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
485
+ hidden_states = self.proj_out_2(hidden_states)
486
+ elif self.config.norm_type == "ada_norm_single":
487
+ shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, axis=1)
488
+ hidden_states = self.norm_out(hidden_states)
489
+ # Modulation
490
+ hidden_states = hidden_states * (1 + scale) + shift
491
+ hidden_states = self.proj_out(hidden_states)
492
+ hidden_states = hidden_states.squeeze(1)
493
+
494
+ # unpatchify
495
+ if self.adaln_single is None:
496
+ height = width = int(hidden_states.shape[1] ** 0.5)
497
+ hidden_states = hidden_states.reshape(
498
+ shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
499
+ )
500
+ # hidden_states = paddle.einsum("nhwpqc->nchpwq", hidden_states)
501
+ hidden_states = hidden_states.transpose([0, 5, 1, 3, 2, 4])
502
+ output = hidden_states.reshape(
503
+ shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
504
+ )
505
+
506
+ if not return_dict:
507
+ return (output,)
508
+
509
+ return Transformer2DModelOutput(sample=output)
510
+
511
+ @classmethod
512
+ def custom_modify_weight(cls, model_to_load, state_dict):
513
+ if not model_to_load.inference_optimize:
514
+ return
515
+ for i in range(28):
516
+ map_from_my_dit = [
517
+ (f"q.{i}.weight", f"{i}.attn1.to_q.weight"),
518
+ (f"k.{i}.weight", f"{i}.attn1.to_k.weight"),
519
+ (f"v.{i}.weight", f"{i}.attn1.to_v.weight"),
520
+ (f"q.{i}.bias", f"{i}.attn1.to_q.bias"),
521
+ (f"k.{i}.bias", f"{i}.attn1.to_k.bias"),
522
+ (f"v.{i}.bias", f"{i}.attn1.to_v.bias"),
523
+ (f"out_proj.{i}.weight", f"{i}.attn1.to_out.0.weight"),
524
+ (f"out_proj.{i}.bias", f"{i}.attn1.to_out.0.bias"),
525
+ (f"ffn1.{i}.weight", f"{i}.ff.net.0.proj.weight"),
526
+ (f"ffn1.{i}.bias", f"{i}.ff.net.0.proj.bias"),
527
+ (f"ffn2.{i}.weight", f"{i}.ff.net.2.weight"),
528
+ (f"ffn2.{i}.bias", f"{i}.ff.net.2.bias"),
529
+ (f"fcs0.{i}.weight", f"{i}.norm1.emb.timestep_embedder.linear_1.weight"),
530
+ (f"fcs0.{i}.bias", f"{i}.norm1.emb.timestep_embedder.linear_1.bias"),
531
+ (f"fcs1.{i}.weight", f"{i}.norm1.emb.timestep_embedder.linear_2.weight"),
532
+ (f"fcs1.{i}.bias", f"{i}.norm1.emb.timestep_embedder.linear_2.bias"),
533
+ (f"fcs2.{i}.weight", f"{i}.norm1.linear.weight"),
534
+ (f"fcs2.{i}.bias", f"{i}.norm1.linear.bias"),
535
+ (f"embs.{i}.weight", f"{i}.norm1.emb.class_embedder.embedding_table.weight"),
536
+ ]
537
+ for to_, from_ in map_from_my_dit:
538
+ state_dict["simplified_facebookdit." + to_] = paddle.assign(state_dict["transformer_blocks." + from_])
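A minimal usage sketch for the continuous-input path of the model above (not part of the diff); the channel counts, group count, and sequence lengths are illustrative assumptions.

import paddle
from ppdiffusers.models.transformer_2d import Transformer2DModel

# inner_dim = 2 * 8 = 16 matches in_channels, so proj_in / proj_out are 1x1 convs of width 16.
model = Transformer2DModel(
    num_attention_heads=2, attention_head_dim=8, in_channels=16,
    num_layers=1, cross_attention_dim=32, norm_num_groups=4,
)

latents = paddle.randn([1, 16, 8, 8])    # NCHW continuous input (default data_format)
text = paddle.randn([1, 7, 32])          # cross-attention context of width cross_attention_dim
out = model(latents, encoder_hidden_states=text).sample  # same shape as the input, with the residual added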
PaddleMIX/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py ADDED
@@ -0,0 +1,752 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import math
15
+ from typing import Optional, Tuple, Union
16
+
17
+ import paddle
18
+ import paddle.nn.functional as F
19
+ from paddle import nn
20
+
21
+ from ..utils import is_ppxformers_available
22
+ from .activations import get_activation
23
+ from .resnet import Downsample1D, ResidualTemporalBlock1D, Upsample1D, rearrange_dims
24
+
25
+
26
+ class DownResnetBlock1D(nn.Layer):
27
+ def __init__(
28
+ self,
29
+ in_channels: int,
30
+ out_channels: Optional[int] = None,
31
+ num_layers: int = 1,
32
+ conv_shortcut: bool = False,
33
+ temb_channels: int = 32,
34
+ groups: int = 32,
35
+ groups_out: Optional[int] = None,
36
+ non_linearity: Optional[str] = None,
37
+ time_embedding_norm: str = "default",
38
+ output_scale_factor: float = 1.0,
39
+ add_downsample: bool = True,
40
+ ):
41
+ super().__init__()
42
+ self.in_channels = in_channels
43
+ out_channels = in_channels if out_channels is None else out_channels
44
+ self.out_channels = out_channels
45
+ self.use_conv_shortcut = conv_shortcut
46
+ self.time_embedding_norm = time_embedding_norm
47
+ self.add_downsample = add_downsample
48
+ self.output_scale_factor = output_scale_factor
49
+
50
+ if groups_out is None:
51
+ groups_out = groups
52
+
53
+ # there will always be at least one resnet
54
+ resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=temb_channels)]
55
+
56
+ for _ in range(num_layers):
57
+ resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels))
58
+
59
+ self.resnets = nn.LayerList(resnets)
60
+
61
+ if non_linearity is None:
62
+ self.nonlinearity = None
63
+ else:
64
+ self.nonlinearity = get_activation(non_linearity)
65
+
66
+ self.downsample = None
67
+ if add_downsample:
68
+ self.downsample = Downsample1D(out_channels, use_conv=True, padding=1)
69
+
70
+ def forward(self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
71
+ output_states = ()
72
+
73
+ hidden_states = self.resnets[0](hidden_states, temb)
74
+ for resnet in self.resnets[1:]:
75
+ hidden_states = resnet(hidden_states, temb)
76
+
77
+ output_states += (hidden_states,)
78
+
79
+ if self.nonlinearity is not None:
80
+ hidden_states = self.nonlinearity(hidden_states)
81
+
82
+ if self.downsample is not None:
83
+ hidden_states = self.downsample(hidden_states)
84
+
85
+ return hidden_states, output_states
86
+
87
+
88
+ class UpResnetBlock1D(nn.Layer):
89
+ def __init__(
90
+ self,
91
+ in_channels: int,
92
+ out_channels: Optional[int] = None,
93
+ num_layers: int = 1,
94
+ temb_channels: int = 32,
95
+ groups: int = 32,
96
+ groups_out: Optional[int] = None,
97
+ non_linearity: Optional[str] = None,
98
+ time_embedding_norm: str = "default",
99
+ output_scale_factor: float = 1.0,
100
+ add_upsample: bool = True,
101
+ ):
102
+ super().__init__()
103
+ self.in_channels = in_channels
104
+ out_channels = in_channels if out_channels is None else out_channels
105
+ self.out_channels = out_channels
106
+ self.time_embedding_norm = time_embedding_norm
107
+ self.add_upsample = add_upsample
108
+ self.output_scale_factor = output_scale_factor
109
+
110
+ if groups_out is None:
111
+ groups_out = groups
112
+
113
+ # there will always be at least one resnet
114
+ resnets = [ResidualTemporalBlock1D(2 * in_channels, out_channels, embed_dim=temb_channels)]
115
+
116
+ for _ in range(num_layers):
117
+ resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels))
118
+
119
+ self.resnets = nn.LayerList(resnets)
120
+
121
+ if non_linearity is None:
122
+ self.nonlinearity = None
123
+ else:
124
+ self.nonlinearity = get_activation(non_linearity)
125
+
126
+ self.upsample = None
127
+ if add_upsample:
128
+ self.upsample = Upsample1D(out_channels, use_conv_transpose=True)
129
+
130
+ def forward(
131
+ self,
132
+ hidden_states: paddle.Tensor,
133
+ res_hidden_states_tuple: Optional[Tuple[paddle.Tensor, ...]] = None,
134
+ temb: Optional[paddle.Tensor] = None,
135
+ ) -> paddle.Tensor:
136
+ if res_hidden_states_tuple is not None:
137
+ res_hidden_states = res_hidden_states_tuple[-1]
138
+ hidden_states = paddle.concat((hidden_states, res_hidden_states), axis=1)
139
+
140
+ hidden_states = self.resnets[0](hidden_states, temb)
141
+ for resnet in self.resnets[1:]:
142
+ hidden_states = resnet(hidden_states, temb)
143
+
144
+ if self.nonlinearity is not None:
145
+ hidden_states = self.nonlinearity(hidden_states)
146
+
147
+ if self.upsample is not None:
148
+ hidden_states = self.upsample(hidden_states)
149
+
150
+ return hidden_states
151
+
152
+
153
+ class ValueFunctionMidBlock1D(nn.Layer):
154
+ def __init__(self, in_channels: int, out_channels: int, embed_dim: int):
155
+ super().__init__()
156
+ self.in_channels = in_channels
157
+ self.out_channels = out_channels
158
+ self.embed_dim = embed_dim
159
+
160
+ self.res1 = ResidualTemporalBlock1D(in_channels, in_channels // 2, embed_dim=embed_dim)
161
+ self.down1 = Downsample1D(out_channels // 2, use_conv=True)
162
+ self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim)
163
+ self.down2 = Downsample1D(out_channels // 4, use_conv=True)
164
+
165
+ def forward(self, x: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
166
+ x = self.res1(x, temb)
167
+ x = self.down1(x)
168
+ x = self.res2(x, temb)
169
+ x = self.down2(x)
170
+ return x
171
+
172
+
173
+ class MidResTemporalBlock1D(nn.Layer):
174
+ def __init__(
175
+ self,
176
+ in_channels: int,
177
+ out_channels: int,
178
+ embed_dim: int,
179
+ num_layers: int = 1,
180
+ add_downsample: bool = False,
181
+ add_upsample: bool = False,
182
+ non_linearity: Optional[str] = None,
183
+ ):
184
+ super().__init__()
185
+ self.in_channels = in_channels
186
+ self.out_channels = out_channels
187
+ self.add_downsample = add_downsample
188
+
189
+ # there will always be at least one resnet
190
+ resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=embed_dim)]
191
+
192
+ for _ in range(num_layers):
193
+ resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=embed_dim))
194
+
195
+ self.resnets = nn.LayerList(resnets)
196
+
197
+ if non_linearity is None:
198
+ self.nonlinearity = None
199
+ else:
200
+ self.nonlinearity = get_activation(non_linearity)
201
+
202
+ self.upsample = None
203
+ if add_upsample:
204
+ self.upsample = Downsample1D(out_channels, use_conv=True)
205
+
206
+ self.downsample = None
207
+ if add_downsample:
208
+ self.downsample = Downsample1D(out_channels, use_conv=True)
209
+
210
+ if self.upsample and self.downsample:
211
+ raise ValueError("Block cannot downsample and upsample")
212
+
213
+ def forward(self, hidden_states: paddle.Tensor, temb: paddle.Tensor) -> paddle.Tensor:
214
+ hidden_states = self.resnets[0](hidden_states, temb)
215
+ for resnet in self.resnets[1:]:
216
+ hidden_states = resnet(hidden_states, temb)
217
+
218
+ if self.upsample:
219
+ hidden_states = self.upsample(hidden_states)
220
+ if self.downsample:
221
+ hidden_states = self.downsample(hidden_states)
222
+
223
+ return hidden_states
224
+
225
+
226
+ class OutConv1DBlock(nn.Layer):
227
+ def __init__(self, num_groups_out: int, out_channels: int, embed_dim: int, act_fn: str):
228
+ super().__init__()
229
+ self.final_conv1d_1 = nn.Conv1D(embed_dim, embed_dim, 5, padding=2)
230
+ self.final_conv1d_gn = nn.GroupNorm(num_groups_out, embed_dim)
231
+ self.final_conv1d_act = get_activation(act_fn)
232
+ self.final_conv1d_2 = nn.Conv1D(embed_dim, out_channels, 1)
233
+
234
+ def forward(self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
235
+ hidden_states = self.final_conv1d_1(hidden_states)
236
+ hidden_states = rearrange_dims(hidden_states)
237
+ hidden_states = self.final_conv1d_gn(hidden_states)
238
+ hidden_states = rearrange_dims(hidden_states)
239
+ hidden_states = self.final_conv1d_act(hidden_states)
240
+ hidden_states = self.final_conv1d_2(hidden_states)
241
+ return hidden_states
242
+
243
+
244
+ class OutValueFunctionBlock(nn.Layer):
245
+ def __init__(self, fc_dim: int, embed_dim: int, act_fn: str = "mish"):
246
+ super().__init__()
247
+ self.final_block = nn.LayerList(
248
+ [
249
+ nn.Linear(fc_dim + embed_dim, fc_dim // 2),
250
+ get_activation(act_fn),
251
+ nn.Linear(fc_dim // 2, 1),
252
+ ]
253
+ )
254
+
255
+ def forward(self, hidden_states: paddle.Tensor, temb: paddle.Tensor) -> paddle.Tensor:
256
+ hidden_states = hidden_states.reshape([hidden_states.shape[0], -1])
257
+ hidden_states = paddle.concat((hidden_states, temb), axis=-1)
258
+ for layer in self.final_block:
259
+ hidden_states = layer(hidden_states)
260
+
261
+ return hidden_states
262
+
263
+
264
+_kernels = {
+    "linear": [1 / 8, 3 / 8, 3 / 8, 1 / 8],
+    "cubic": [-0.01171875, -0.03515625, 0.11328125, 0.43359375, 0.43359375, 0.11328125, -0.03515625, -0.01171875],
+    "lanczos3": [
+        0.003689131001010537,
+        0.015056144446134567,
+        -0.03399861603975296,
+        -0.066637322306633,
+        0.13550527393817902,
+        0.44638532400131226,
+        0.44638532400131226,
+        0.13550527393817902,
+        -0.066637322306633,
+        -0.03399861603975296,
+        0.015056144446134567,
+        0.003689131001010537,
+    ],
+}
+
+
+class Downsample1d(nn.Layer):
+    def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"):
+        super().__init__()
+        self.pad_mode = pad_mode
+        kernel_1d = paddle.to_tensor(_kernels[kernel])
+        self.pad = kernel_1d.shape[0] // 2 - 1
+        self.register_buffer("kernel", kernel_1d)
+
+    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
+        hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode, data_format="NCL")
+        weight = paddle.zeros(
+            [hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]], dtype=hidden_states.dtype
+        )
+        indices = paddle.arange(hidden_states.shape[1])
+        weight[indices, indices] = self.kernel.cast(weight.dtype).expand([hidden_states.shape[1], -1])
+        return F.conv1d(hidden_states, weight, stride=2)
+
+
+class Upsample1d(nn.Layer):
+    def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"):
+        super().__init__()
+        self.pad_mode = pad_mode
+        kernel_1d = paddle.to_tensor(_kernels[kernel])
+        self.pad = kernel_1d.shape[0] // 2 - 1
+        self.register_buffer("kernel", kernel_1d)
+
+    def forward(self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
+        hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode, data_format="NCL")
+        weight = paddle.zeros(
+            [hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]], dtype=hidden_states.dtype
+        )
+        indices = paddle.arange(hidden_states.shape[1])
+        weight[indices, indices] = self.kernel.cast(weight.dtype).expand([hidden_states.shape[1], -1])
+        return F.conv1d_transpose(hidden_states, weight, stride=2, padding=self.pad * 2 + 1)
+
+
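Both resamplers build their filter from the _kernels taps at run time: Downsample1d halves the length dimension with a stride-2 FIR convolution, and Upsample1d doubles it with the matching transposed convolution. A small shape sketch, assuming the module path ppdiffusers.models.unet_1d_blocks and illustrative sizes:

import paddle

from ppdiffusers.models.unet_1d_blocks import Downsample1d, Upsample1d

x = paddle.randn([2, 8, 64])         # [batch, channels, length]
down = Downsample1d(kernel="cubic")  # stride-2 anti-aliased downsampling
up = Upsample1d(kernel="cubic")      # stride-2 transposed-conv upsampling

y = down(x)
print(y.shape)                       # [2, 8, 32]: length halved, channels unchanged
z = up(y)
print(z.shape)                       # [2, 8, 64]: length restored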
+class SelfAttention1d(nn.Layer):
+    def __init__(self, in_channels: int, n_head: int = 1, dropout_rate: float = 0.0):
+        super().__init__()
+        self.channels = in_channels
+        self.group_norm = nn.GroupNorm(1, num_channels=in_channels)
+        self.num_heads = n_head
+        self.head_size = in_channels // n_head
+        self.scale = 1 / math.sqrt(self.head_size)
+
+        self.query = nn.Linear(self.channels, self.channels)
+        self.key = nn.Linear(self.channels, self.channels)
+        self.value = nn.Linear(self.channels, self.channels)
+
+        self.proj_attn = nn.Linear(self.channels, self.channels)
+
+        self.dropout = nn.Dropout(dropout_rate)
+
+        self._use_memory_efficient_attention_xformers = False
+        self._attention_op = None
+
+    def reshape_heads_to_batch_dim(self, tensor, transpose=True):
+        tensor = tensor.reshape([0, 0, self.num_heads, self.head_size])
+        if transpose:
+            tensor = tensor.transpose([0, 2, 1, 3])
+        return tensor
+
+    def reshape_batch_dim_to_heads(self, tensor, transpose=True):
+        if transpose:
+            tensor = tensor.transpose([0, 2, 1, 3])
+        tensor = tensor.reshape([0, 0, tensor.shape[2] * tensor.shape[3]])
+        return tensor
+
+    def set_use_memory_efficient_attention_xformers(
+        self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[str] = None
+    ):
+        if use_memory_efficient_attention_xformers:
+            if not is_ppxformers_available():
+                raise NotImplementedError(
+                    "requires scaled_dot_product_attention, but your PaddlePaddle does not have it. Check out the instructions on the installation page: https://www.paddlepaddle.org.cn/install/quick and follow the ones that match your environment."
+                )
+            else:
+                try:
+                    _ = F.scaled_dot_product_attention_(
+                        paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+                        paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+                        paddle.ones((1, 1, 2, 40), dtype=paddle.float16),
+                        attention_op=attention_op,
+                    )
+                except Exception as e:
+                    raise e
+
+        self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
+        self._attention_op = attention_op
+
+    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
+        residual = hidden_states
+
+        hidden_states = self.group_norm(hidden_states)
+        hidden_states = hidden_states.transpose([0, 2, 1])
+
+        query_proj = self.query(hidden_states)
+        key_proj = self.key(hidden_states)
+        value_proj = self.value(hidden_states)
+
+        query_proj = self.reshape_heads_to_batch_dim(
+            query_proj, transpose=not self._use_memory_efficient_attention_xformers
+        )
+        key_proj = self.reshape_heads_to_batch_dim(
+            key_proj, transpose=not self._use_memory_efficient_attention_xformers
+        )
+        value_proj = self.reshape_heads_to_batch_dim(
+            value_proj, transpose=not self._use_memory_efficient_attention_xformers
+        )
+
+        if self._use_memory_efficient_attention_xformers:
+            hidden_states = F.scaled_dot_product_attention_(
+                query_proj,
+                key_proj,
+                value_proj,
+                attn_mask=None,
+                scale=self.scale,
+                dropout_p=0.0,
+                training=self.training,
+                attention_op=self._attention_op,
+            )
+        else:
+            attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * self.scale
+            attention_probs = F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype)
+            hidden_states = paddle.matmul(attention_probs, value_proj)
+
+        # reshape hidden_states
+        hidden_states = self.reshape_batch_dim_to_heads(
+            hidden_states, transpose=not self._use_memory_efficient_attention_xformers
+        )
+
+        # compute next hidden_states
+        hidden_states = self.proj_attn(hidden_states)
+        hidden_states = hidden_states.transpose([0, 2, 1])
+        hidden_states = self.dropout(hidden_states)
+
+        output = hidden_states + residual
+
+        return output
+
+
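SelfAttention1d normalizes across all channels (a single GroupNorm group), attends over the length dimension, and adds the result back to its input, so it is shape-preserving; n_head must divide in_channels, since head_size = in_channels // n_head. A minimal sketch, assuming the module path ppdiffusers.models.unet_1d_blocks:

import paddle

from ppdiffusers.models.unet_1d_blocks import SelfAttention1d

attn = SelfAttention1d(in_channels=32, n_head=1)
x = paddle.randn([2, 32, 16])   # [batch, channels, length]
y = attn(x)
print(y.shape)                  # [2, 32, 16]: residual attention keeps the shape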
+class ResConvBlock(nn.Layer):
+    def __init__(self, in_channels: int, mid_channels: int, out_channels: int, is_last: bool = False):
+        super().__init__()
+        self.is_last = is_last
+        self.has_conv_skip = in_channels != out_channels
+
+        if self.has_conv_skip:
+            self.conv_skip = nn.Conv1D(in_channels, out_channels, 1, bias_attr=False)
+
+        self.conv_1 = nn.Conv1D(in_channels, mid_channels, 5, padding=2)
+        self.group_norm_1 = nn.GroupNorm(1, mid_channels)
+        self.gelu_1 = nn.GELU()
+        self.conv_2 = nn.Conv1D(mid_channels, out_channels, 5, padding=2)
+
+        if not self.is_last:
+            self.group_norm_2 = nn.GroupNorm(1, out_channels)
+            self.gelu_2 = nn.GELU()
+
+    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
+        residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states
+
+        hidden_states = self.conv_1(hidden_states)
+        hidden_states = self.group_norm_1(hidden_states)
+        hidden_states = self.gelu_1(hidden_states)
+        hidden_states = self.conv_2(hidden_states)
+
+        if not self.is_last:
+            hidden_states = self.group_norm_2(hidden_states)
+            hidden_states = self.gelu_2(hidden_states)
+
+        output = hidden_states + residual
+        return output
+
+
+class UNetMidBlock1D(nn.Layer):
+    def __init__(self, mid_channels: int, in_channels: int, out_channels: Optional[int] = None):
+        super().__init__()
+
+        out_channels = in_channels if out_channels is None else out_channels
+
+        # there is always at least one resnet
+        self.down = Downsample1d("cubic")
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+        attentions = [
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(out_channels, out_channels // 32),
+        ]
+        self.up = Upsample1d(kernel="cubic")
+
+        self.attentions = nn.LayerList(attentions)
+        self.resnets = nn.LayerList(resnets)
+
+    def forward(self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
+        hidden_states = self.down(hidden_states)
+        for attn, resnet in zip(self.attentions, self.resnets):
+            hidden_states = resnet(hidden_states)
+            hidden_states = attn(hidden_states)
+
+        hidden_states = self.up(hidden_states)
+
+        return hidden_states
+
+
+class AttnDownBlock1D(nn.Layer):
+    def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        self.down = Downsample1d("cubic")
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+        attentions = [
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(out_channels, out_channels // 32),
+        ]
+
+        self.attentions = nn.LayerList(attentions)
+        self.resnets = nn.LayerList(resnets)
+
+    def forward(self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
+        hidden_states = self.down(hidden_states)
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states)
+            hidden_states = attn(hidden_states)
+
+        return hidden_states, (hidden_states,)
+
+
+class DownBlock1D(nn.Layer):
+    def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        self.down = Downsample1d("cubic")
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+
+        self.resnets = nn.LayerList(resnets)
+
+    def forward(self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
+        hidden_states = self.down(hidden_states)
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states, (hidden_states,)
+
+
+class DownBlock1DNoSkip(nn.Layer):
+    def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+
+        self.resnets = nn.LayerList(resnets)
+
+    def forward(self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
+        hidden_states = paddle.concat([hidden_states, temb], axis=1)
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states, (hidden_states,)
+
+
+class AttnUpBlock1D(nn.Layer):
+    def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+        attentions = [
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(out_channels, out_channels // 32),
+        ]
+
+        self.attentions = nn.LayerList(attentions)
+        self.resnets = nn.LayerList(resnets)
+        self.up = Upsample1d(kernel="cubic")
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
+        temb: Optional[paddle.Tensor] = None,
+    ) -> paddle.Tensor:
+        res_hidden_states = res_hidden_states_tuple[-1]
+        hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states)
+            hidden_states = attn(hidden_states)
+
+        hidden_states = self.up(hidden_states)
+
+        return hidden_states
+
+
+class UpBlock1D(nn.Layer):
+    def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = in_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+
+        self.resnets = nn.LayerList(resnets)
+        self.up = Upsample1d(kernel="cubic")
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
+        temb: Optional[paddle.Tensor] = None,
+    ) -> paddle.Tensor:
+        res_hidden_states = res_hidden_states_tuple[-1]
+        hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        hidden_states = self.up(hidden_states)
+
+        return hidden_states
+
+
+class UpBlock1DNoSkip(nn.Layer):
+    def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None):
+        super().__init__()
+        mid_channels = in_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True),
+        ]
+
+        self.resnets = nn.LayerList(resnets)
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
+        temb: Optional[paddle.Tensor] = None,
+    ) -> paddle.Tensor:
+        res_hidden_states = res_hidden_states_tuple[-1]
+        hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states
+
+
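Every down block above returns its output together with a one-element skip tuple, and the matching up block concatenates that skip back onto its input (hence the 2 * in_channels of its first ResConvBlock) before upsampling. A pairing sketch, assuming the module path ppdiffusers.models.unet_1d_blocks and illustrative sizes:

import paddle

from ppdiffusers.models.unet_1d_blocks import DownBlock1D, UpBlock1D

down = DownBlock1D(out_channels=64, in_channels=32)
up = UpBlock1D(in_channels=64, out_channels=32)

x = paddle.randn([2, 32, 64])             # [batch, channels, length]
h, skips = down(x)                        # h: [2, 64, 32], skips: (h,)
y = up(h, res_hidden_states_tuple=skips)  # concat -> [2, 128, 32], resnets, then upsample
print(y.shape)                            # [2, 32, 64]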
+DownBlockType = Union[DownResnetBlock1D, DownBlock1D, AttnDownBlock1D, DownBlock1DNoSkip]
+MidBlockType = Union[MidResTemporalBlock1D, ValueFunctionMidBlock1D, UNetMidBlock1D]
+OutBlockType = Union[OutConv1DBlock, OutValueFunctionBlock]
+UpBlockType = Union[UpResnetBlock1D, UpBlock1D, AttnUpBlock1D, UpBlock1DNoSkip]
+
+
+def get_down_block(
+    down_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    temb_channels: int,
+    add_downsample: bool,
+) -> DownBlockType:
+    if down_block_type == "DownResnetBlock1D":
+        return DownResnetBlock1D(
+            in_channels=in_channels,
+            num_layers=num_layers,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+        )
+    elif down_block_type == "DownBlock1D":
+        return DownBlock1D(out_channels=out_channels, in_channels=in_channels)
+    elif down_block_type == "AttnDownBlock1D":
+        return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels)
+    elif down_block_type == "DownBlock1DNoSkip":
+        return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels)
+    raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(
+    up_block_type: str, num_layers: int, in_channels: int, out_channels: int, temb_channels: int, add_upsample: bool
+) -> UpBlockType:
+    if up_block_type == "UpResnetBlock1D":
+        return UpResnetBlock1D(
+            in_channels=in_channels,
+            num_layers=num_layers,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+        )
+    elif up_block_type == "UpBlock1D":
+        return UpBlock1D(in_channels=in_channels, out_channels=out_channels)
+    elif up_block_type == "AttnUpBlock1D":
+        return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels)
+    elif up_block_type == "UpBlock1DNoSkip":
+        return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels)
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+def get_mid_block(
+    mid_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    mid_channels: int,
+    out_channels: int,
+    embed_dim: int,
+    add_downsample: bool,
+) -> MidBlockType:
+    if mid_block_type == "MidResTemporalBlock1D":
+        return MidResTemporalBlock1D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            embed_dim=embed_dim,
+            add_downsample=add_downsample,
+        )
+    elif mid_block_type == "ValueFunctionMidBlock1D":
+        return ValueFunctionMidBlock1D(in_channels=in_channels, out_channels=out_channels, embed_dim=embed_dim)
+    elif mid_block_type == "UNetMidBlock1D":
+        return UNetMidBlock1D(in_channels=in_channels, mid_channels=mid_channels, out_channels=out_channels)
+    raise ValueError(f"{mid_block_type} does not exist.")
+
+
+def get_out_block(
+    *, out_block_type: str, num_groups_out: int, embed_dim: int, out_channels: int, act_fn: str, fc_dim: int
+) -> Optional[OutBlockType]:
+    if out_block_type == "OutConv1DBlock":
+        return OutConv1DBlock(num_groups_out, out_channels, embed_dim, act_fn)
+    elif out_block_type == "ValueFunction":
+        return OutValueFunctionBlock(fc_dim, embed_dim, act_fn)
+    return None
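The factory helpers map the block-type strings used in UNet1D configs onto the classes above; unknown names raise a ValueError, and get_out_block returns None when no output head is configured. A sketch of a config-driven build, assuming the module path ppdiffusers.models.unet_1d_blocks and illustrative argument values (the num_layers/temb_channels arguments are only consumed by the resnet-style blocks):

from ppdiffusers.models.unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block

down = get_down_block(
    "DownBlock1D", num_layers=1, in_channels=32, out_channels=64, temb_channels=128, add_downsample=True
)
mid = get_mid_block(
    "UNetMidBlock1D",
    num_layers=1,
    in_channels=64,
    mid_channels=64,
    out_channels=64,
    embed_dim=128,
    add_downsample=False,
)
up = get_up_block(
    "UpBlock1D", num_layers=1, in_channels=64, out_channels=32, temb_channels=128, add_upsample=True
)
head = get_out_block(
    out_block_type="OutConv1DBlock", num_groups_out=8, embed_dim=32, out_channels=14, act_fn="mish", fc_dim=128
)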