Upload 13 files
- .gitattributes +5 -0
- Disclaimer.md +6 -0
- LICENSE +28 -0
- README.md +7 -16
- asserts/img2img-init.png +3 -0
- asserts/img2img_output_axe.png +3 -0
- asserts/lcm_lora_sdv1-5_imgGrid_output.png +3 -0
- asserts/lcm_lora_sdv1_5_axmodel.png +3 -0
- asserts/txt2img_output_axe.png +3 -0
- config.json.txt +0 -0
- run_img2img_axe_infer.py +546 -0
- run_img2img_onnx_infer.py +546 -0
- run_txt2img_axe_infer.py +183 -0
- run_txt2img_onnx_infer.py +183 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+asserts/img2img_output_axe.png filter=lfs diff=lfs merge=lfs -text
+asserts/img2img-init.png filter=lfs diff=lfs merge=lfs -text
+asserts/lcm_lora_sdv1_5_axmodel.png filter=lfs diff=lfs merge=lfs -text
+asserts/lcm_lora_sdv1-5_imgGrid_output.png filter=lfs diff=lfs merge=lfs -text
+asserts/txt2img_output_axe.png filter=lfs diff=lfs merge=lfs -text
Disclaimer.md
ADDED
@@ -0,0 +1,6 @@
# Disclaimer

All models exported by SD1.5-LCM.Axera have inherent limitations and may produce incorrect, harmful, offensive, or otherwise undesirable outputs. Users should exercise caution and must not use these models in critical or high-risk scenarios where they could cause personal injury, property damage, or other significant losses. Examples of such scenarios include, but are not limited to, the medical domain, the control of software or hardware systems that could cause harm, and important financial or legal decision-making.

SD1.5-LCM.Axera is provided "as is" as an open-source project, without warranties of any kind, express or implied, including but not limited to the implied warranties of merchantability, fitness for a particular purpose, and non-infringement. In no event shall the authors, contributors, or copyright holders be liable for any claim, damages, or other liability, whether in an action of contract, tort, or otherwise, arising from the software or the use of or other dealings in the software.

By using SD1.5-LCM.Axera, you agree to these terms and conditions and acknowledge that you understand the potential risks of its use. You also agree to indemnify and hold harmless the authors, contributors, and copyright holders from any claims, damages, or liabilities arising from your use of SD1.5-LCM.Axera.
LICENSE
ADDED
@@ -0,0 +1,28 @@
BSD 3-Clause License

Copyright (c) 2024, BUG1989

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md
CHANGED
@@ -1,23 +1,14 @@
----
-license: bsd-3-clause
-base_model:
-- latent-consistency/lcm-lora-sdv1-5
-- Lykon/dreamshaper-7
-tags:
-- RaspberryPi5
-- StableDiffusion1.5
----
-
-# SD1.5-LCM
+# SD1.5-LCM.Axera
 
 Based on the StableDiffusion 1.5 LCM project, this repo shows how to deploy its **text-to-image** and **image-to-image** pipelines on AX650N-based products.
 
-
-
-
-
-
-
+Supported SoC:
+- AX650N
+
+Supported hardware:
+
+- 爱芯派Pro
+- M.2 compute card
 
 For the original model, please refer to
 - [Latent Consistency Model (LCM) LoRA: SDv1-5](https://huggingface.co/latent-consistency/lcm-lora-sdv1-5)
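The commit adds four inference scripts (text-to-image and image-to-image, each in an ONNX Runtime and an AXEngine variant). As a quick orientation before the full listings below, the following is a minimal sketch of the text-to-image flow they share: the CLIP tokenizer and text encoder run on the CPU via transformers, while the UNet and VAE decoder run on the NPU via axengine. The ./models/ layout, the UNet input tensor names, the precomputed time_input.npy table, and the 4-step LCM timestep schedule are assumptions carried over from run_img2img_axe_infer.py shown later in this commit; run_txt2img_axe_infer.py itself is not reproduced in this view, so treat this as a sketch rather than its exact contents.

```python
# Minimal LCM text-to-image sketch for AX650N, condensed from the scripts in this commit.
# Paths, tensor names, and the 4-step schedule are assumptions taken from run_img2img_axe_infer.py.
import numpy as np
import torch
import axengine
from PIL import Image
from transformers import CLIPTokenizer, CLIPTextModel

MODELS = "./models/"  # assumed layout: tokenizer/, text_encoder/, *.axmodel, time_input.npy

# 1. Encode the prompt on the CPU with the stock CLIP text encoder.
tokenizer = CLIPTokenizer.from_pretrained(MODELS + "tokenizer")
text_encoder = CLIPTextModel.from_pretrained(MODELS + "text_encoder", torch_dtype=torch.float32)
ids = tokenizer("Astronauts in a jungle, cold color palette, detailed, 8k",
                padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids
prompt_embeds = text_encoder(ids)[0].detach().numpy()

# 2. UNet and VAE decoder run on the NPU through axengine.
unet = axengine.InferenceSession(MODELS + "unet.axmodel")
vae_decoder = axengine.InferenceSession(MODELS + "vae_decoder.axmodel")
time_input = np.load(MODELS + "time_input.npy")  # precomputed time embeddings, one row per step

# 3. LCM denoising loop over the 4-step schedule used by the scripts.
betas = torch.linspace(0.00085 ** 0.5, 0.012 ** 0.5, 1000, dtype=torch.float32) ** 2
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0).numpy()
timesteps = np.array([999, 759, 499, 259], dtype=np.int64)
latent = torch.randn([1, 4, 64, 64], generator=torch.manual_seed(0)).numpy()
for i, t in enumerate(timesteps):
    noise_pred = unet.run(None, {
        "sample": latent,
        "/down_blocks.0/resnets.0/act_1/Mul_output_0": np.expand_dims(time_input[i], axis=0),
        "encoder_hidden_states": prompt_embeds})[0]
    # LCM step: predict x0, then mix it with the current sample using the boundary scalings.
    a_t, s = alphas_cumprod[t], t * 10
    pred_x0 = (latent - (1 - a_t) ** 0.5 * noise_pred) / a_t ** 0.5
    denoised = s / (s ** 2 + 0.25) ** 0.5 * pred_x0 + 0.25 / (s ** 2 + 0.25) * latent
    if i + 1 < len(timesteps):  # re-noise toward the next timestep, except after the last step
        a_prev = alphas_cumprod[timesteps[i + 1]]
        noise = torch.randn(latent.shape, generator=torch.manual_seed(i + 1)).numpy()
        latent = a_prev ** 0.5 * denoised + (1 - a_prev) ** 0.5 * noise
    else:
        latent = denoised

# 4. Decode the latents on the NPU and save the image.
image = vae_decoder.run(None, {"x": latent / 0.18215})[0]
image = np.clip(np.transpose(image, (0, 2, 3, 1))[0] / 2 + 0.5, 0, 1)
Image.fromarray((image * 255).round().astype("uint8")).save("txt2img_output_axe.png")
```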
asserts/img2img-init.png
ADDED
(binary image, stored with Git LFS)

asserts/img2img_output_axe.png
ADDED
(binary image, stored with Git LFS)

asserts/lcm_lora_sdv1-5_imgGrid_output.png
ADDED
(binary image, stored with Git LFS)

asserts/lcm_lora_sdv1_5_axmodel.png
ADDED
(binary image, stored with Git LFS)

asserts/txt2img_output_axe.png
ADDED
(binary image, stored with Git LFS)
config.json.txt
ADDED
File without changes
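Before the image-to-image scripts below, a note on two hard-coded values they share: the denoising loop only runs timesteps [499, 259] with step_index [2, 3], and add_noise() pushes the encoded init image to timestep 499 before the loop starts. This is the usual diffusers image-to-image schedule truncation applied to the 4-step LCM schedule; the strength value of 0.5 used in this sketch is an assumption that reproduces those hard-coded numbers, the scripts themselves never name it.

```python
# Why the img2img scripts start at timestep 499 and run only two UNet steps.
# The 4-step schedule is the one commented out in the scripts; strength = 0.5 is assumed.
num_inference_steps = 4
timesteps = [999, 759, 499, 259]   # full LCM schedule (text-to-image case)
strength = 0.5                     # fraction of the schedule actually denoised

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # -> 2
t_start = max(num_inference_steps - init_timestep, 0)                          # -> 2

img2img_timesteps = timesteps[t_start:]   # [499, 259], matching the hard-coded loop
noise_timestep = img2img_timesteps[0]     # 499, the timestep passed to add_noise()
print(img2img_timesteps, noise_timestep)
```

With strength = 1.0 the full schedule would run and the init image would contribute almost nothing; smaller values keep more of it.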
run_img2img_axe_infer.py
ADDED
@@ -0,0 +1,546 @@
from typing import List, Union
import numpy as np
# import onnxruntime
import axengine
import torch
from PIL import Image
from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection

import time
import argparse
import warnings  # used by preprocess() deprecation warnings
from diffusers.utils import load_image
import PIL.Image
from typing import List, Optional, Tuple, Union
from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
from diffusers.models.modeling_outputs import AutoencoderKLOutput
from diffusers.utils.torch_utils import randn_tensor
from diffusers.utils import make_image_grid, load_image


########## Img2Img
PipelineImageInput = Union[
    PIL.Image.Image,
    np.ndarray,
    torch.Tensor,
    List[PIL.Image.Image],
    List[np.ndarray],
    List[torch.Tensor],
]

PipelineDepthInput = PipelineImageInput

# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
def add_noise(
    original_samples: torch.Tensor,
    noise: torch.Tensor,
    timesteps: torch.IntTensor,
) -> torch.Tensor:
    # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
    # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
    # for the subsequent add_noise calls
    # self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
    # Convert betas to alphas_bar_sqrt
    beta_start = 0.00085
    beta_end = 0.012
    num_train_timesteps = 1000
    betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
    alphas = 1.0 - betas
    alphas_cumprod = torch.cumprod(alphas, dim=0)
    alphas_cumprod = alphas_cumprod.to(device=original_samples.device)
    alphas_cumprod = alphas_cumprod.to(dtype=original_samples.dtype)
    timesteps = timesteps.to(original_samples.device)

    sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
    sqrt_alpha_prod = sqrt_alpha_prod.flatten()
    while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
        sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)

    sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
    sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
    while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

    noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
    return noisy_samples

def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")

def numpy_to_pt(images: np.ndarray) -> torch.Tensor:
    r"""
    Convert a NumPy image to a PyTorch tensor.

    Args:
        images (`np.ndarray`):
            The NumPy image array to convert to PyTorch format.

    Returns:
        `torch.Tensor`:
            A PyTorch tensor representation of the images.
    """
    if images.ndim == 3:
        images = images[..., None]

    images = torch.from_numpy(images.transpose(0, 3, 1, 2))
    return images

def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
    r"""
    Convert a PIL image or a list of PIL images to NumPy arrays.

    Args:
        images (`PIL.Image.Image` or `List[PIL.Image.Image]`):
            The PIL image or list of images to convert to NumPy format.

    Returns:
        `np.ndarray`:
            A NumPy array representation of the images.
    """
    if not isinstance(images, list):
        images = [images]
    images = [np.array(image).astype(np.float32) / 255.0 for image in images]
    images = np.stack(images, axis=0)

    return images

def is_valid_image(image) -> bool:
    r"""
    Checks if the input is a valid image.

    A valid image can be:
    - A `PIL.Image.Image`.
    - A 2D or 3D `np.ndarray` or `torch.Tensor` (grayscale or color image).

    Args:
        image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
            The image to validate. It can be a PIL image, a NumPy array, or a torch tensor.

    Returns:
        `bool`:
            `True` if the input is a valid image, `False` otherwise.
    """
    return isinstance(image, PIL.Image.Image) or isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim in (2, 3)

def is_valid_image_imagelist(images):
    r"""
    Checks if the input is a valid image or list of images.

    The input can be one of the following formats:
    - A 4D tensor or numpy array (batch of images).
    - A valid single image: `PIL.Image.Image`, 2D `np.ndarray` or `torch.Tensor` (grayscale image), 3D `np.ndarray` or
      `torch.Tensor`.
    - A list of valid images.

    Args:
        images (`Union[np.ndarray, torch.Tensor, PIL.Image.Image, List]`):
            The image(s) to check. Can be a batch of images (4D tensor/array), a single image, or a list of valid
            images.

    Returns:
        `bool`:
            `True` if the input is valid, `False` otherwise.
    """
    if isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4:
        return True
    elif is_valid_image(images):
        return True
    elif isinstance(images, list):
        return all(is_valid_image(image) for image in images)
    return False


def normalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
    r"""
    Normalize an image array to [-1,1].

    Args:
        images (`np.ndarray` or `torch.Tensor`):
            The image array to normalize.

    Returns:
        `np.ndarray` or `torch.Tensor`:
            The normalized image array.
    """
    return 2.0 * images - 1.0

# Copied (simplified) from diffusers/image_processor.py (VaeImageProcessor.preprocess)
def preprocess(
    image: PipelineImageInput,
    height: Optional[int] = None,
    width: Optional[int] = None,
    resize_mode: str = "default",  # "default", "fill", "crop"
    crops_coords: Optional[Tuple[int, int, int, int]] = None,
) -> torch.Tensor:
    """
    Preprocess the image input.

    Args:
        image (`PipelineImageInput`):
            The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of
            supported formats.
        height (`int`, *optional*):
            The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
            height.
        width (`int`, *optional*):
            The width in preprocessed. If `None`, will use `get_default_height_width()` to get the default width.
        resize_mode (`str`, *optional*, defaults to `default`):
            The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
            the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will
            resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
            center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
            image to fit within the specified width and height, maintaining the aspect ratio, and then center the
            image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
            supported for PIL image input.
        crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
            The crop coordinates for each image in the batch. If `None`, will not crop the image.

    Returns:
        `torch.Tensor`:
            The preprocessed image.
    """
    supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)

    # # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image
    # if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3:
    #     if isinstance(image, torch.Tensor):
    #         # if image is a pytorch tensor could have 2 possible shapes:
    #         #   1. batch x height x width: we should insert the channel dimension at position 1
    #         #   2. channel x height x width: we should insert batch dimension at position 0,
    #         #      however, since both channel and batch dimension has same size 1, it is same to insert at position 1
    #         #      for simplicity, we insert a dimension of size 1 at position 1 for both cases
    #         image = image.unsqueeze(1)
    #     else:
    #         # if it is a numpy array, it could have 2 possible shapes:
    #         #   1. batch x height x width: insert channel dimension on last position
    #         #   2. height x width x channel: insert batch dimension on first position
    #         if image.shape[-1] == 1:
    #             image = np.expand_dims(image, axis=0)
    #         else:
    #             image = np.expand_dims(image, axis=-1)

    if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4:
        warnings.warn(
            "Passing `image` as a list of 4d np.ndarray is deprecated."
            "Please concatenate the list along the batch dimension and pass it as a single 4d np.ndarray",
            FutureWarning,
        )
        image = np.concatenate(image, axis=0)
    if isinstance(image, list) and isinstance(image[0], torch.Tensor) and image[0].ndim == 4:
        warnings.warn(
            "Passing `image` as a list of 4d torch.Tensor is deprecated."
            "Please concatenate the list along the batch dimension and pass it as a single 4d torch.Tensor",
            FutureWarning,
        )
        image = torch.cat(image, axis=0)

    if not is_valid_image_imagelist(image):
        raise ValueError(
            f"Input is in incorrect format. Currently, we only support {', '.join(str(x) for x in supported_formats)}"
        )
    if not isinstance(image, list):
        image = [image]

    if isinstance(image[0], PIL.Image.Image):
        if crops_coords is not None:
            image = [i.crop(crops_coords) for i in image]
        # if self.config.do_resize:
        #     height, width = self.get_default_height_width(image[0], height, width)
        #     image = [self.resize(i, height, width, resize_mode=resize_mode) for i in image]
        # if self.config.do_convert_rgb:
        #     image = [self.convert_to_rgb(i) for i in image]
        # elif self.config.do_convert_grayscale:
        #     image = [self.convert_to_grayscale(i) for i in image]
        image = pil_to_numpy(image)  # to np
        image = numpy_to_pt(image)  # to pt

    elif isinstance(image[0], np.ndarray):
        image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)

        # image = self.numpy_to_pt(image)

        # height, width = self.get_default_height_width(image, height, width)
        # if self.config.do_resize:
        #     image = self.resize(image, height, width)

    elif isinstance(image[0], torch.Tensor):
        image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)

        # if self.config.do_convert_grayscale and image.ndim == 3:
        #     image = image.unsqueeze(1)

        channel = image.shape[1]
        # don't need any preprocess if the image is latents
        # if channel == self.config.vae_latent_channels:
        #     return image

        # height, width = self.get_default_height_width(image, height, width)
        # if self.config.do_resize:
        #     image = self.resize(image, height, width)

    # expected range [0,1], normalize to [-1,1]
    do_normalize = True  # self.config.do_normalize
    if do_normalize and image.min() < 0:
        warnings.warn(
            "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
            f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
            FutureWarning,
        )
        do_normalize = False
    if do_normalize:
        image = normalize(image)

    # if self.config.do_binarize:
    #     image = self.binarize(image)

    return image
##########


def get_args():
    parser = argparse.ArgumentParser(
        prog="StableDiffusion",
        description="Generate picture with the input prompt"
    )
    parser.add_argument("--prompt", type=str, required=False, default="Astronauts in a jungle, cold color palette, muted colors, detailed, 8k", help="the input text prompt")
    parser.add_argument("--text_model_dir", type=str, required=False, default="./models/", help="Path to text encoder and tokenizer files")
    parser.add_argument("--unet_model", type=str, required=False, default="./models/unet.axmodel", help="Path to unet axmodel model")
    parser.add_argument("--vae_encoder_model", type=str, required=False, default="./models/vae_encoder.axmodel", help="Path to vae encoder axmodel model")
    parser.add_argument("--vae_decoder_model", type=str, required=False, default="./models/vae_decoder.axmodel", help="Path to vae decoder axmodel model")
    parser.add_argument("--time_input", type=str, required=False, default="./models/time_input_img2img.npy", help="Path to time input file")
    parser.add_argument("--init_image", type=str, required=False, default="./models/img2img-init.png", help="Path to initial image file")
    parser.add_argument("--save_dir", type=str, required=False, default="./img2img_output_axe.png", help="Path to the output image file")
    return parser.parse_args()

def maybe_convert_prompt(prompt: Union[str, List[str]], tokenizer: "PreTrainedTokenizer"):  # noqa: F821
    if not isinstance(prompt, List):
        prompts = [prompt]
    else:
        prompts = prompt

    prompts = [_maybe_convert_prompt(p, tokenizer) for p in prompts]

    if not isinstance(prompt, List):
        return prompts[0]

    return prompts


def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"):  # noqa: F821
    tokens = tokenizer.tokenize(prompt)
    unique_tokens = set(tokens)
    for token in unique_tokens:
        if token in tokenizer.added_tokens_encoder:
            replacement = token
            i = 1
            while f"{token}_{i}" in tokenizer.added_tokens_encoder:
                replacement += f" {token}_{i}"
                i += 1

            prompt = prompt.replace(token, replacement)

    return prompt


def get_embeds(prompt="Portrait of a pretty girl", tokenizer_dir="./models/tokenizer", text_encoder_dir="./models/text_encoder"):
    tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
    text_encoder = CLIPTextModel.from_pretrained(text_encoder_dir,
                                                 torch_dtype=torch.float32,
                                                 variant="fp16")
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)

    prompt_embeds_npy = prompt_embeds[0].detach().numpy()
    return prompt_embeds_npy


def get_alphas_cumprod():
    betas = torch.linspace(0.00085 ** 0.5, 0.012 ** 0.5, 1000, dtype=torch.float32) ** 2
    alphas = 1.0 - betas
    alphas_cumprod = torch.cumprod(alphas, dim=0).detach().numpy()
    final_alphas_cumprod = alphas_cumprod[0]
    self_timesteps = np.arange(0, 1000)[::-1].copy().astype(np.int64)
    return alphas_cumprod, final_alphas_cumprod, self_timesteps

def resize_and_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
    """
    Resize the image to 512x512 and convert it to RGB.
    """
    return image.resize((512, 512)).convert("RGB")


if __name__ == '__main__':

    """
    Usage:
    - python3 run_img2img_axe_infer.py --prompt "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k" --unet_model ./models/unet.axmodel --vae_encoder_model ./models/vae_encoder.axmodel --vae_decoder_model ./models/vae_decoder.axmodel --time_input ./models/time_input_img2img.npy --save_dir ./img2img_output_axe.png
    """
    args = get_args()
    prompt = args.prompt
    tokenizer_dir = args.text_model_dir + 'tokenizer'
    text_encoder_dir = args.text_model_dir + 'text_encoder'
    unet_model = args.unet_model
    vae_decoder_model = args.vae_decoder_model
    vae_encoder_model = args.vae_encoder_model
    init_image = args.init_image
    time_input = args.time_input
    save_dir = args.save_dir

    print(f"prompt: {prompt}")
    print(f"text_tokenizer: {tokenizer_dir}")
    print(f"text_encoder: {text_encoder_dir}")
    print(f"unet_model: {unet_model}")
    print(f"vae_encoder_model: {vae_encoder_model}")
    print(f"vae_decoder_model: {vae_decoder_model}")
    print(f"init image: {init_image}")
    print(f"time_input: {time_input}")
    print(f"save_dir: {save_dir}")

    # timesteps = np.array([999, 759, 499, 259]).astype(np.int64)

    # text encoder
    start = time.time()
    # prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
    # prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
    # prompt = "Caricature, a beautiful girl with black hair, 8k"
    prompt_embeds_npy = get_embeds(prompt, tokenizer_dir, text_encoder_dir)
    print(f"text encoder take {(1000 * (time.time() - start)):.1f}ms")

    prompt_name = prompt.replace(" ", "_")
    latents_shape = [1, 4, 64, 64]
    # latent = torch.randn(latents_shape, generator=None, device="cpu", dtype=torch.float32,
    #                      layout=torch.strided).detach().numpy()

    alphas_cumprod, final_alphas_cumprod, self_timesteps = get_alphas_cumprod()

    # load unet model and vae model
    start = time.time()
    vae_encoder = axengine.InferenceSession(vae_encoder_model)
    unet_session_main = axengine.InferenceSession(unet_model)
    vae_decoder = axengine.InferenceSession(vae_decoder_model)
    print(f"load models take {(1000 * (time.time() - start)):.1f}ms")

    # load time input file
    time_input = np.load(time_input)

    # load image
    # url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
    url = init_image
    init_image = load_image(url, convert_method=resize_and_rgb)  # U8, (512, 512, 3), RGB
    init_image_show = init_image

    # vae encoder inference
    vae_start = time.time()

    init_image = preprocess(init_image)  # torch.Size([1, 3, 512, 512])
    if isinstance(init_image, torch.Tensor):
        init_image = init_image.detach().numpy()

    vae_encoder_onnx_inp_name = vae_encoder.get_inputs()[0].name
    vae_encoder_onnx_out_name = vae_encoder.get_outputs()[0].name

    # vae_encoder_out.shape (1, 8, 64, 64)
    vae_encoder_out = vae_encoder.run(None, {vae_encoder_onnx_inp_name: init_image})[0]  # encoder out: torch.Size([1, 8, 64, 64])
    print(f"vae encoder inference take {(1000 * (time.time() - vae_start)):.1f}ms")

    # sample the init latents from the VAE posterior and noise them to the start timestep
    device = torch.device("cpu")
    vae_encoder_out = torch.from_numpy(vae_encoder_out).to(torch.float32)
    posterior = DiagonalGaussianDistribution(vae_encoder_out)  # values closely match the reference pipeline
    vae_encode_info = AutoencoderKLOutput(latent_dist=posterior)
    generator = torch.manual_seed(0)
    init_latents = retrieve_latents(vae_encode_info, generator=generator)  # values closely match the reference pipeline
    init_latents = init_latents * 0.18215  # values closely match the reference pipeline
    init_latents = torch.cat([init_latents], dim=0)
    shape = init_latents.shape
    dtype = torch.float16
    noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)  # a different dtype yields different random values
    # get latents
    timestep = torch.tensor([499]).to(device)
    init_latents = add_noise(init_latents.to(device), noise, timestep)
    latents = init_latents

    latents = latents.detach().cpu().numpy()
    latent = latents

    # unet inference loop
    unet_loop_start = time.time()
    timesteps = np.array([499, 259]).astype(np.int64)
    self_timesteps = np.array([999, 759, 499, 259]).astype(np.int64)
    step_index = [2, 3]
    for i, timestep in enumerate(timesteps):
        unet_start = time.time()
        noise_pred = unet_session_main.run(None, {"sample": latent, \
            "/down_blocks.0/resnets.0/act_1/Mul_output_0": np.expand_dims(time_input[i], axis=0), \
            "encoder_hidden_states": prompt_embeds_npy})[0]

        print(f"unet once take {(1000 * (time.time() - unet_start)):.1f}ms")

        sample = latent
        model_output = noise_pred

        # 1. get previous step value
        prev_step_index = step_index[i] + 1
        if prev_step_index < len(self_timesteps):
            prev_timestep = self_timesteps[prev_step_index]
        else:
            prev_timestep = timestep

        alpha_prod_t = alphas_cumprod[timestep]
        alpha_prod_t_prev = alphas_cumprod[prev_timestep] if prev_timestep >= 0 else final_alphas_cumprod
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        # 3. Get scalings for boundary conditions
        scaled_timestep = timestep * 10
        c_skip = 0.5 ** 2 / (scaled_timestep ** 2 + 0.5 ** 2)
        c_out = scaled_timestep / (scaled_timestep ** 2 + 0.5 ** 2) ** 0.5
        predicted_original_sample = (sample - (beta_prod_t ** 0.5) * model_output) / (alpha_prod_t ** 0.5)  # values closely match the reference pipeline

        denoised = c_out * predicted_original_sample + c_skip * sample
        if step_index[i] != 3:
            device = torch.device("cpu")
            noise = randn_tensor(model_output.shape, generator=generator, device=device, dtype=torch.float16).numpy()
            prev_sample = (alpha_prod_t_prev ** 0.5) * denoised + (beta_prod_t_prev ** 0.5) * noise
        else:
            prev_sample = denoised

        latent = prev_sample

    print(f"unet loop take {(1000 * (time.time() - unet_loop_start)):.1f}ms")

    # vae decoder inference
    vae_start = time.time()
    latent = latent / 0.18215
    image = vae_decoder.run(None, {"x": latent})[0]  # ['784']
    print(f"vae decoder inference take {(1000 * (time.time() - vae_start)):.1f}ms")

    # save result
    save_start = time.time()
    image = np.transpose(image, (0, 2, 3, 1)).squeeze(axis=0)
    image_denorm = np.clip(image / 2 + 0.5, 0, 1)
    image = (image_denorm * 255).round().astype("uint8")
    pil_image = Image.fromarray(image[:, :, :3])
    pil_image.save(save_dir)

    grid_img = make_image_grid([init_image_show, pil_image], rows=1, cols=2)
    grid_img.save("./lcm_lora_sdv1-5_imgGrid_output.png")

    print("grid image saved in ./lcm_lora_sdv1-5_imgGrid_output.png")
    print(f"save image take {(1000 * (time.time() - save_start)):.1f}ms")
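The denoising loop above (and the identical loop in the ONNX Runtime variant that follows) is a hand-inlined LCM scheduler step. Written out with the quantities the code computes — the DDPM cumulative products ᾱ_t from get_alphas_cumprod(), the timestep scaling of 10, and σ_data = 0.5 — one iteration is:

```latex
% Forward noising of the init latents (add_noise):
x_t = \sqrt{\bar\alpha_t}\,x_0 + \sqrt{1-\bar\alpha_t}\,\epsilon, \qquad \epsilon \sim \mathcal{N}(0, I)

% One LCM step at timestep t, with s = 10\,t and \sigma_\text{data} = 0.5:
c_\text{skip} = \frac{\sigma_\text{data}^2}{s^2 + \sigma_\text{data}^2}, \qquad
c_\text{out}  = \frac{s}{\sqrt{s^2 + \sigma_\text{data}^2}}

\hat{x}_0 = \frac{x_t - \sqrt{1-\bar\alpha_t}\,\epsilon_\theta(x_t, t, c)}{\sqrt{\bar\alpha_t}}, \qquad
D = c_\text{out}\,\hat{x}_0 + c_\text{skip}\,x_t

x_{t_\text{prev}} = \sqrt{\bar\alpha_{t_\text{prev}}}\,D + \sqrt{1-\bar\alpha_{t_\text{prev}}}\,z,
\qquad z \sim \mathcal{N}(0, I)
```

In the code, predicted_original_sample, denoised, and prev_sample correspond to x̂₀, D, and x_{t_prev}; on the final step the loop keeps D directly instead of re-noising.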
run_img2img_onnx_infer.py
ADDED
@@ -0,0 +1,546 @@
from typing import List, Union
import numpy as np
import onnxruntime
import torch
from PIL import Image
from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection

# import axengine as axe
import time
import argparse
import warnings  # used by preprocess() deprecation warnings
from diffusers.utils import load_image
import PIL.Image
from typing import List, Optional, Tuple, Union
from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
from diffusers.models.modeling_outputs import AutoencoderKLOutput
from diffusers.utils.torch_utils import randn_tensor
from diffusers.utils import make_image_grid, load_image


########## Img2Img
PipelineImageInput = Union[
    PIL.Image.Image,
    np.ndarray,
    torch.Tensor,
    List[PIL.Image.Image],
    List[np.ndarray],
    List[torch.Tensor],
]

PipelineDepthInput = PipelineImageInput

# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
def add_noise(
    original_samples: torch.Tensor,
    noise: torch.Tensor,
    timesteps: torch.IntTensor,
) -> torch.Tensor:
    # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
    # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
    # for the subsequent add_noise calls
    # self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device)
    # Convert betas to alphas_bar_sqrt
    beta_start = 0.00085
    beta_end = 0.012
    num_train_timesteps = 1000
    betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
    alphas = 1.0 - betas
    alphas_cumprod = torch.cumprod(alphas, dim=0)
    alphas_cumprod = alphas_cumprod.to(device=original_samples.device)
    alphas_cumprod = alphas_cumprod.to(dtype=original_samples.dtype)
    timesteps = timesteps.to(original_samples.device)

    sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
    sqrt_alpha_prod = sqrt_alpha_prod.flatten()
    while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
        sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)

    sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
    sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
    while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

    noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
    return noisy_samples

def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")

def numpy_to_pt(images: np.ndarray) -> torch.Tensor:
    r"""
    Convert a NumPy image to a PyTorch tensor.

    Args:
        images (`np.ndarray`):
            The NumPy image array to convert to PyTorch format.

    Returns:
        `torch.Tensor`:
            A PyTorch tensor representation of the images.
    """
    if images.ndim == 3:
        images = images[..., None]

    images = torch.from_numpy(images.transpose(0, 3, 1, 2))
    return images

def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
    r"""
    Convert a PIL image or a list of PIL images to NumPy arrays.

    Args:
        images (`PIL.Image.Image` or `List[PIL.Image.Image]`):
            The PIL image or list of images to convert to NumPy format.

    Returns:
        `np.ndarray`:
            A NumPy array representation of the images.
    """
    if not isinstance(images, list):
        images = [images]
    images = [np.array(image).astype(np.float32) / 255.0 for image in images]
    images = np.stack(images, axis=0)

    return images

def is_valid_image(image) -> bool:
    r"""
    Checks if the input is a valid image.

    A valid image can be:
    - A `PIL.Image.Image`.
    - A 2D or 3D `np.ndarray` or `torch.Tensor` (grayscale or color image).

    Args:
        image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
            The image to validate. It can be a PIL image, a NumPy array, or a torch tensor.

    Returns:
        `bool`:
            `True` if the input is a valid image, `False` otherwise.
    """
    return isinstance(image, PIL.Image.Image) or isinstance(image, (np.ndarray, torch.Tensor)) and image.ndim in (2, 3)

def is_valid_image_imagelist(images):
    r"""
    Checks if the input is a valid image or list of images.

    The input can be one of the following formats:
    - A 4D tensor or numpy array (batch of images).
    - A valid single image: `PIL.Image.Image`, 2D `np.ndarray` or `torch.Tensor` (grayscale image), 3D `np.ndarray` or
      `torch.Tensor`.
    - A list of valid images.

    Args:
        images (`Union[np.ndarray, torch.Tensor, PIL.Image.Image, List]`):
            The image(s) to check. Can be a batch of images (4D tensor/array), a single image, or a list of valid
            images.

    Returns:
        `bool`:
            `True` if the input is valid, `False` otherwise.
    """
    if isinstance(images, (np.ndarray, torch.Tensor)) and images.ndim == 4:
        return True
    elif is_valid_image(images):
        return True
    elif isinstance(images, list):
        return all(is_valid_image(image) for image in images)
    return False


def normalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
    r"""
    Normalize an image array to [-1,1].

    Args:
        images (`np.ndarray` or `torch.Tensor`):
            The image array to normalize.

    Returns:
        `np.ndarray` or `torch.Tensor`:
            The normalized image array.
    """
    return 2.0 * images - 1.0

# Copied (simplified) from diffusers/image_processor.py (VaeImageProcessor.preprocess)
def preprocess(
    image: PipelineImageInput,
    height: Optional[int] = None,
    width: Optional[int] = None,
    resize_mode: str = "default",  # "default", "fill", "crop"
    crops_coords: Optional[Tuple[int, int, int, int]] = None,
) -> torch.Tensor:
    """
    Preprocess the image input.

    Args:
        image (`PipelineImageInput`):
            The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of
            supported formats.
        height (`int`, *optional*):
            The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
            height.
        width (`int`, *optional*):
            The width in preprocessed. If `None`, will use `get_default_height_width()` to get the default width.
        resize_mode (`str`, *optional*, defaults to `default`):
            The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
            the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will
            resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
            center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
            image to fit within the specified width and height, maintaining the aspect ratio, and then center the
            image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
            supported for PIL image input.
        crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
            The crop coordinates for each image in the batch. If `None`, will not crop the image.

    Returns:
        `torch.Tensor`:
            The preprocessed image.
    """
    supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)

    # # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image
    # if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3:
    #     if isinstance(image, torch.Tensor):
    #         # if image is a pytorch tensor could have 2 possible shapes:
    #         #   1. batch x height x width: we should insert the channel dimension at position 1
    #         #   2. channel x height x width: we should insert batch dimension at position 0,
    #         #      however, since both channel and batch dimension has same size 1, it is same to insert at position 1
    #         #      for simplicity, we insert a dimension of size 1 at position 1 for both cases
    #         image = image.unsqueeze(1)
    #     else:
    #         # if it is a numpy array, it could have 2 possible shapes:
    #         #   1. batch x height x width: insert channel dimension on last position
    #         #   2. height x width x channel: insert batch dimension on first position
    #         if image.shape[-1] == 1:
    #             image = np.expand_dims(image, axis=0)
    #         else:
    #             image = np.expand_dims(image, axis=-1)

    if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4:
        warnings.warn(
            "Passing `image` as a list of 4d np.ndarray is deprecated."
            "Please concatenate the list along the batch dimension and pass it as a single 4d np.ndarray",
            FutureWarning,
        )
        image = np.concatenate(image, axis=0)
    if isinstance(image, list) and isinstance(image[0], torch.Tensor) and image[0].ndim == 4:
        warnings.warn(
            "Passing `image` as a list of 4d torch.Tensor is deprecated."
            "Please concatenate the list along the batch dimension and pass it as a single 4d torch.Tensor",
            FutureWarning,
        )
        image = torch.cat(image, axis=0)

    if not is_valid_image_imagelist(image):
        raise ValueError(
            f"Input is in incorrect format. Currently, we only support {', '.join(str(x) for x in supported_formats)}"
        )
    if not isinstance(image, list):
        image = [image]

    if isinstance(image[0], PIL.Image.Image):
        if crops_coords is not None:
            image = [i.crop(crops_coords) for i in image]
        # if self.config.do_resize:
        #     height, width = self.get_default_height_width(image[0], height, width)
        #     image = [self.resize(i, height, width, resize_mode=resize_mode) for i in image]
        # if self.config.do_convert_rgb:
        #     image = [self.convert_to_rgb(i) for i in image]
        # elif self.config.do_convert_grayscale:
        #     image = [self.convert_to_grayscale(i) for i in image]
        image = pil_to_numpy(image)  # to np
        image = numpy_to_pt(image)  # to pt

    elif isinstance(image[0], np.ndarray):
        image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)

        # image = self.numpy_to_pt(image)

        # height, width = self.get_default_height_width(image, height, width)
        # if self.config.do_resize:
        #     image = self.resize(image, height, width)

    elif isinstance(image[0], torch.Tensor):
        image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)

        # if self.config.do_convert_grayscale and image.ndim == 3:
        #     image = image.unsqueeze(1)

        channel = image.shape[1]
        # don't need any preprocess if the image is latents
        # if channel == self.config.vae_latent_channels:
        #     return image

        # height, width = self.get_default_height_width(image, height, width)
        # if self.config.do_resize:
        #     image = self.resize(image, height, width)

    # expected range [0,1], normalize to [-1,1]
    do_normalize = True  # self.config.do_normalize
    if do_normalize and image.min() < 0:
        warnings.warn(
            "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
            f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
            FutureWarning,
        )
        do_normalize = False
    if do_normalize:
        image = normalize(image)

    # if self.config.do_binarize:
    #     image = self.binarize(image)

    return image
##########


def get_args():
    parser = argparse.ArgumentParser(
        prog="StableDiffusion",
        description="Generate picture with the input prompt"
    )
    parser.add_argument("--prompt", type=str, required=False, default="Astronauts in a jungle, cold color palette, muted colors, detailed, 8k", help="the input text prompt")
    parser.add_argument("--text_model_dir", type=str, required=False, default="./models/", help="Path to text encoder and tokenizer files")
    parser.add_argument("--unet_model", type=str, required=False, default="./models/unet.onnx", help="Path to unet ONNX model")
    parser.add_argument("--vae_encoder_model", type=str, required=False, default="./models/vae_encoder.onnx", help="Path to vae encoder ONNX model")
    parser.add_argument("--vae_decoder_model", type=str, required=False, default="./models/vae_decoder.onnx", help="Path to vae decoder ONNX model")
    parser.add_argument("--time_input", type=str, required=False, default="./models/time_input_img2img.npy", help="Path to time input file")
    parser.add_argument("--init_image", type=str, required=False, default="./models/img2img-init.png", help="Path to initial image file")
    parser.add_argument("--save_dir", type=str, required=False, default="./img2img_output_onnx.png", help="Path to the output image file")
    return parser.parse_args()

def maybe_convert_prompt(prompt: Union[str, List[str]], tokenizer: "PreTrainedTokenizer"):  # noqa: F821
    if not isinstance(prompt, List):
        prompts = [prompt]
    else:
        prompts = prompt

    prompts = [_maybe_convert_prompt(p, tokenizer) for p in prompts]

    if not isinstance(prompt, List):
        return prompts[0]

    return prompts


def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"):  # noqa: F821
    tokens = tokenizer.tokenize(prompt)
    unique_tokens = set(tokens)
    for token in unique_tokens:
        if token in tokenizer.added_tokens_encoder:
            replacement = token
            i = 1
            while f"{token}_{i}" in tokenizer.added_tokens_encoder:
                replacement += f" {token}_{i}"
                i += 1

            prompt = prompt.replace(token, replacement)

    return prompt


def get_embeds(prompt="Portrait of a pretty girl", tokenizer_dir="./models/tokenizer", text_encoder_dir="./models/text_encoder"):
    tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
    text_encoder = CLIPTextModel.from_pretrained(text_encoder_dir,
                                                 torch_dtype=torch.float32,
                                                 variant="fp16")
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)

    prompt_embeds_npy = prompt_embeds[0].detach().numpy()
    return prompt_embeds_npy


def get_alphas_cumprod():
    betas = torch.linspace(0.00085 ** 0.5, 0.012 ** 0.5, 1000, dtype=torch.float32) ** 2
    alphas = 1.0 - betas
    alphas_cumprod = torch.cumprod(alphas, dim=0).detach().numpy()
    final_alphas_cumprod = alphas_cumprod[0]
    self_timesteps = np.arange(0, 1000)[::-1].copy().astype(np.int64)
    return alphas_cumprod, final_alphas_cumprod, self_timesteps

def resize_and_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
    """
    Resize the image to 512x512 and convert it to RGB.
    """
    return image.resize((512, 512)).convert("RGB")


if __name__ == '__main__':

    """
    Usage:
    - python3 run_img2img_onnx_infer.py --prompt "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k" --unet_model output_onnx/unet_sim.onnx --vae_encoder_model output_onnx/vae_encoder_sim.onnx --vae_decoder_model output_onnx/vae_decoder_sim.onnx --time_input ./output_onnx/time_input.npy --save_dir ./img2img_output.png
    """
    args = get_args()
    prompt = args.prompt
    tokenizer_dir = args.text_model_dir + 'tokenizer'
    text_encoder_dir = args.text_model_dir + 'text_encoder'
    unet_model = args.unet_model
    vae_decoder_model = args.vae_decoder_model
    vae_encoder_model = args.vae_encoder_model
    init_image = args.init_image
    time_input = args.time_input
    save_dir = args.save_dir

    print(f"prompt: {prompt}")
    print(f"text_tokenizer: {tokenizer_dir}")
    print(f"text_encoder: {text_encoder_dir}")
    print(f"unet_model: {unet_model}")
    print(f"vae_encoder_model: {vae_encoder_model}")
    print(f"vae_decoder_model: {vae_decoder_model}")
    print(f"init image: {init_image}")
    print(f"time_input: {time_input}")
    print(f"save_dir: {save_dir}")

    # timesteps = np.array([999, 759, 499, 259]).astype(np.int64)

    # text encoder
    start = time.time()
    # prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
    # prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k"
    # prompt = "Caricature, a beautiful girl with black hair, 8k"
    prompt_embeds_npy = get_embeds(prompt, tokenizer_dir, text_encoder_dir)
    print(f"text encoder take {(1000 * (time.time() - start)):.1f}ms")

    prompt_name = prompt.replace(" ", "_")
    latents_shape = [1, 4, 64, 64]
    # latent = torch.randn(latents_shape, generator=None, device="cpu", dtype=torch.float32,
    #                      layout=torch.strided).detach().numpy()

    alphas_cumprod, final_alphas_cumprod, self_timesteps = get_alphas_cumprod()

    # load unet model and vae model
    start = time.time()
    vae_encoder = onnxruntime.InferenceSession(vae_encoder_model)
    unet_session_main = onnxruntime.InferenceSession(unet_model)
    vae_decoder = onnxruntime.InferenceSession(vae_decoder_model)
    print(f"load models take {(1000 * (time.time() - start)):.1f}ms")

    # load time input file
    time_input = np.load(time_input)

    # load image
    # url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
    url = init_image
    init_image = load_image(url, convert_method=resize_and_rgb)  # U8, (512, 512, 3), RGB
    init_image_show = init_image

    # vae encoder inference
    vae_start = time.time()

    init_image = preprocess(init_image)  # torch.Size([1, 3, 512, 512])
    if isinstance(init_image, torch.Tensor):
        init_image = init_image.detach().numpy()

    vae_encoder_onnx_inp_name = vae_encoder.get_inputs()[0].name
    vae_encoder_onnx_out_name = vae_encoder.get_outputs()[0].name

    # vae_encoder_out.shape (1, 8, 64, 64)
    vae_encoder_out = vae_encoder.run(None, {vae_encoder_onnx_inp_name: init_image})[0]  # encoder out: torch.Size([1, 8, 64, 64])
    print(f"vae encoder inference take {(1000 * (time.time() - vae_start)):.1f}ms")

    # sample the init latents from the VAE posterior and noise them to the start timestep
    device = torch.device("cpu")
    vae_encoder_out = torch.from_numpy(vae_encoder_out).to(torch.float32)
    posterior = DiagonalGaussianDistribution(vae_encoder_out)  # values closely match the reference pipeline
    vae_encode_info = AutoencoderKLOutput(latent_dist=posterior)
    generator = torch.manual_seed(0)
    init_latents = retrieve_latents(vae_encode_info, generator=generator)  # values closely match the reference pipeline
    init_latents = init_latents * 0.18215  # values closely match the reference pipeline
    init_latents = torch.cat([init_latents], dim=0)
    shape = init_latents.shape
    dtype = torch.float16
    noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)  # a different dtype yields different random values
    # get latents
    timestep = torch.tensor([499]).to(device)
    init_latents = add_noise(init_latents.to(device), noise, timestep)
    latents = init_latents

    latents = latents.detach().cpu().numpy()
    latent = latents

    # unet inference loop
    unet_loop_start = time.time()
    timesteps = np.array([499, 259]).astype(np.int64)
    self_timesteps = np.array([999, 759, 499, 259]).astype(np.int64)
    step_index = [2, 3]
    for i, timestep in enumerate(timesteps):
        unet_start = time.time()
        noise_pred = unet_session_main.run(None, {"sample": latent, \
            "/down_blocks.0/resnets.0/act_1/Mul_output_0": np.expand_dims(time_input[i], axis=0), \
            "encoder_hidden_states": prompt_embeds_npy})[0]

        print(f"unet once take {(1000 * (time.time() - unet_start)):.1f}ms")

        sample = latent
        model_output = noise_pred

        # 1. get previous step value
        prev_step_index = step_index[i] + 1
        if prev_step_index < len(self_timesteps):
            prev_timestep = self_timesteps[prev_step_index]
        else:
            prev_timestep = timestep

        alpha_prod_t = alphas_cumprod[timestep]
        alpha_prod_t_prev = alphas_cumprod[prev_timestep] if prev_timestep >= 0 else final_alphas_cumprod
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        # 3. Get scalings for boundary conditions
        scaled_timestep = timestep * 10
        c_skip = 0.5 ** 2 / (scaled_timestep ** 2 + 0.5 ** 2)
        c_out = scaled_timestep / (scaled_timestep ** 2 + 0.5 ** 2) ** 0.5
        predicted_original_sample = (sample - (beta_prod_t ** 0.5) * model_output) / (alpha_prod_t ** 0.5)  # values closely match the reference pipeline

        denoised = c_out * predicted_original_sample + c_skip * sample
        if step_index[i] != 3:
            device = torch.device("cpu")
            noise = randn_tensor(model_output.shape, generator=generator, device=device, dtype=torch.float16).numpy()
            prev_sample = (alpha_prod_t_prev ** 0.5) * denoised + (beta_prod_t_prev ** 0.5) * noise
        else:
            prev_sample = denoised

        latent = prev_sample

    print(f"unet loop take {(1000 * (time.time() - unet_loop_start)):.1f}ms")

    # vae decoder inference
    vae_start = time.time()
    latent = latent / 0.18215
    image = vae_decoder.run(None, {"x": latent})[0]  # ['784']
    print(f"vae decoder inference take {(1000 * (time.time() - vae_start)):.1f}ms")

    # save result
    save_start = time.time()
    image = np.transpose(image, (0, 2, 3, 1)).squeeze(axis=0)
    image_denorm = np.clip(image / 2 + 0.5, 0, 1)
    image = (image_denorm * 255).round().astype("uint8")
|
539 |
+
pil_image = Image.fromarray(image[:, :, :3])
|
540 |
+
pil_image.save(save_dir)
|
541 |
+
|
542 |
+
grid_img = make_image_grid([init_image_show, pil_image], rows=1, cols=2)
|
543 |
+
grid_img.save(f"./lcm_lora_sdv1-5_imgGrid_output.png")
|
544 |
+
|
545 |
+
print(f"grid image saved in ./lcm_lora_sdv1-5_imgGrid_output.png")
|
546 |
+
print(f"save image take {(1000 * (time.time() - save_start)):.1f}ms")
|
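
The denoising loop above inlines the LCM scheduler update, with the same constants (sigma_data = 0.5, timestep scaling of 10) that the txt2img scripts below use. As a reading aid only, the per-step math can be collected into a single function; the helper name and signature here are illustrative and are not part of the committed files:

def lcm_scheduler_step(sample, model_output, timestep, alpha_prod_t, alpha_prod_t_prev,
                       noise=None, timestep_scaling=10.0, sigma_data=0.5):
    """One LCM update on an epsilon-prediction UNet output (illustrative helper)."""
    beta_prod_t = 1.0 - alpha_prod_t
    beta_prod_t_prev = 1.0 - alpha_prod_t_prev
    # boundary-condition scalings, matching c_skip / c_out in the loop above
    scaled_t = timestep * timestep_scaling
    c_skip = sigma_data ** 2 / (scaled_t ** 2 + sigma_data ** 2)
    c_out = scaled_t / (scaled_t ** 2 + sigma_data ** 2) ** 0.5
    # estimate the clean latent x0 from the predicted noise, then blend it with the sample
    pred_x0 = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
    denoised = c_out * pred_x0 + c_skip * sample
    if noise is None:
        # final step: the denoised latent goes straight to the VAE decoder
        return denoised
    # intermediate steps: re-noise the denoised latent to the previous timestep
    return alpha_prod_t_prev ** 0.5 * denoised + beta_prod_t_prev ** 0.5 * noise

On the final step the scripts keep `denoised` directly instead of re-noising, which is why they branch on the step index.
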
run_txt2img_axe_infer.py
ADDED
@@ -0,0 +1,183 @@
from typing import List, Union
import numpy as np
# import onnxruntime
import axengine
import torch
from PIL import Image
from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection

import time
import argparse


def get_args():
    parser = argparse.ArgumentParser(
        prog="StableDiffusion",
        description="Generate picture with the input prompt"
    )
    parser.add_argument("--prompt", type=str, required=False, default="Self-portrait oil painting, a beautiful cyborg with golden hair, 8k", help="the input text prompt")
    parser.add_argument("--text_model_dir", type=str, required=False, default="./models/", help="Path to text encoder and tokenizer files")
    parser.add_argument("--unet_model", type=str, required=False, default="./models/unet.axmodel", help="Path to unet axmodel model")
    parser.add_argument("--vae_decoder_model", type=str, required=False, default="./models/vae_decoder.axmodel", help="Path to vae decoder axmodel model")
    parser.add_argument("--time_input", type=str, required=False, default="./models/time_input_txt2img.npy", help="Path to time input file")
    parser.add_argument("--save_dir", type=str, required=False, default="./txt2img_output_axe.png", help="Path to the output image file")
    return parser.parse_args()

def maybe_convert_prompt(prompt: Union[str, List[str]], tokenizer: "PreTrainedTokenizer"):  # noqa: F821
    if not isinstance(prompt, List):
        prompts = [prompt]
    else:
        prompts = prompt

    prompts = [_maybe_convert_prompt(p, tokenizer) for p in prompts]

    if not isinstance(prompt, List):
        return prompts[0]

    return prompts


def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"):  # noqa: F821
    tokens = tokenizer.tokenize(prompt)
    unique_tokens = set(tokens)
    for token in unique_tokens:
        if token in tokenizer.added_tokens_encoder:
            replacement = token
            i = 1
            while f"{token}_{i}" in tokenizer.added_tokens_encoder:
                replacement += f" {token}_{i}"
                i += 1

            prompt = prompt.replace(token, replacement)

    return prompt


def get_embeds(prompt="Portrait of a pretty girl", tokenizer_dir="./models/tokenizer", text_encoder_dir="./models/text_encoder"):
    tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
    text_encoder = CLIPTextModel.from_pretrained(text_encoder_dir,
                                                 torch_dtype=torch.float32,
                                                 variant="fp16")
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)

    prompt_embeds_npy = prompt_embeds[0].detach().numpy()
    return prompt_embeds_npy


def get_alphas_cumprod():
    betas = torch.linspace(0.00085 ** 0.5, 0.012 ** 0.5, 1000, dtype=torch.float32) ** 2
    alphas = 1.0 - betas
    alphas_cumprod = torch.cumprod(alphas, dim=0).detach().numpy()
    final_alphas_cumprod = alphas_cumprod[0]
    self_timesteps = np.arange(0, 1000)[::-1].copy().astype(np.int64)
    return alphas_cumprod, final_alphas_cumprod, self_timesteps


if __name__ == '__main__':
    args = get_args()
    prompt = args.prompt
    tokenizer_dir = args.text_model_dir + 'tokenizer'
    text_encoder_dir = args.text_model_dir + 'text_encoder'
    unet_model = args.unet_model
    vae_decoder_model = args.vae_decoder_model
    time_input = args.time_input
    save_dir = args.save_dir

    print(f"prompt: {prompt}")
    print(f"text_tokenizer: {tokenizer_dir}")
    print(f"text_encoder: {text_encoder_dir}")
    print(f"unet_model: {unet_model}")
    print(f"vae_decoder_model: {vae_decoder_model}")
    print(f"time_input: {time_input}")
    print(f"save_dir: {save_dir}")

    timesteps = np.array([999, 759, 499, 259]).astype(np.int64)

    # text encoder
    start = time.time()
    # prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
    prompt_embeds_npy = get_embeds(prompt, tokenizer_dir, text_encoder_dir)
    print(f"text encoder take {(1000 * (time.time() - start)):.1f}ms")

    prompt_name = prompt.replace(" ", "_")
    latents_shape = [1, 4, 64, 64]
    latent = torch.randn(latents_shape, generator=None, device="cpu", dtype=torch.float32,
                         layout=torch.strided).detach().numpy()

    alphas_cumprod, final_alphas_cumprod, self_timesteps = get_alphas_cumprod()

    # load unet model and vae model
    start = time.time()
    unet_session_main = axengine.InferenceSession(unet_model)
    vae_decoder = axengine.InferenceSession(vae_decoder_model)
    print(f"load models take {(1000 * (time.time() - start)):.1f}ms")

    # load time input file
    time_input = np.load(time_input)

    # unet inference loop
    unet_loop_start = time.time()
    for i, timestep in enumerate(timesteps):
        # print(i, timestep)

        unet_start = time.time()
        noise_pred = unet_session_main.run(None, {"sample": latent, \
            "/down_blocks.0/resnets.0/act_1/Mul_output_0": np.expand_dims(time_input[i], axis=0), \
            "encoder_hidden_states": prompt_embeds_npy})[0]

        print(f"unet once take {(1000 * (time.time() - unet_start)):.1f}ms")

        sample = latent
        model_output = noise_pred
        if i < 3:
            prev_timestep = timesteps[i + 1]
        else:
            prev_timestep = timestep

        alpha_prod_t = alphas_cumprod[timestep]
        alpha_prod_t_prev = alphas_cumprod[prev_timestep] if prev_timestep >= 0 else final_alphas_cumprod

        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        # 3. Get scalings for boundary conditions
        scaled_timestep = timestep * 10
        c_skip = 0.5 ** 2 / (scaled_timestep ** 2 + 0.5 ** 2)
        c_out = scaled_timestep / (scaled_timestep ** 2 + 0.5 ** 2) ** 0.5
        predicted_original_sample = (sample - (beta_prod_t ** 0.5) * model_output) / (alpha_prod_t ** 0.5)

        denoised = c_out * predicted_original_sample + c_skip * sample

        if i != 3:
            noise = torch.randn(model_output.shape, generator=None, device="cpu", dtype=torch.float32,
                                layout=torch.strided).to("cpu").detach().numpy()
            prev_sample = (alpha_prod_t_prev ** 0.5) * denoised + (beta_prod_t_prev ** 0.5) * noise
        else:
            prev_sample = denoised

        latent = prev_sample

    print(f"unet loop take {(1000 * (time.time() - unet_loop_start)):.1f}ms")

    # vae inference
    vae_start = time.time()
    latent = latent / 0.18215
    image = vae_decoder.run(None, {"x": latent})[0]
    print(f"vae inference take {(1000 * (time.time() - vae_start)):.1f}ms")

    # save result
    save_start = time.time()
    image = np.transpose(image, (0, 2, 3, 1)).squeeze(axis=0)
    image_denorm = np.clip(image / 2 + 0.5, 0, 1)
    image = (image_denorm * 255).round().astype("uint8")
    pil_image = Image.fromarray(image[:, :, :3])
    pil_image.save(save_dir)
    print(f"save image take {(1000 * (time.time() - save_start)):.1f}ms")
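
run_txt2img_axe_infer.py mirrors run_txt2img_onnx_infer.py below, swapping onnxruntime.InferenceSession for axengine.InferenceSession. A rough way to compare the two backends is to diff the PNGs they write; note that both scripts draw their initial latent with torch.randn(generator=None), so a pixel-level comparison is only meaningful if the seed is pinned (e.g. with torch.manual_seed) in both scripts first. The paths below are the scripts' default --save_dir values and are assumptions here:

import numpy as np
from PIL import Image

# Default --save_dir outputs of the two txt2img scripts (adjust if other paths were passed).
axe_img = np.asarray(Image.open("./txt2img_output_axe.png"), dtype=np.float32)
onnx_img = np.asarray(Image.open("./txt2img_output_onnx.png"), dtype=np.float32)

diff = np.abs(axe_img - onnx_img)
print(f"mean abs diff: {diff.mean():.2f}, max abs diff: {diff.max():.0f} (0-255 scale)")
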
run_txt2img_onnx_infer.py
ADDED
@@ -0,0 +1,183 @@
from typing import List, Union
import numpy as np
import onnxruntime
# import axengine
import torch
from PIL import Image
from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection

import time
import argparse


def get_args():
    parser = argparse.ArgumentParser(
        prog="StableDiffusion",
        description="Generate picture with the input prompt"
    )
    parser.add_argument("--prompt", type=str, required=False, default="Self-portrait oil painting, a beautiful cyborg with golden hair, 8k", help="the input text prompt")
    parser.add_argument("--text_model_dir", type=str, required=False, default="./models/", help="Path to text encoder and tokenizer files")
    parser.add_argument("--unet_model", type=str, required=False, default="./models/unet.onnx", help="Path to unet ONNX model")
    parser.add_argument("--vae_decoder_model", type=str, required=False, default="./models/vae_decoder.onnx", help="Path to vae decoder ONNX model")
    parser.add_argument("--time_input", type=str, required=False, default="./models/time_input_txt2img.npy", help="Path to time input file")
    parser.add_argument("--save_dir", type=str, required=False, default="./txt2img_output_onnx.png", help="Path to the output image file")
    return parser.parse_args()

def maybe_convert_prompt(prompt: Union[str, List[str]], tokenizer: "PreTrainedTokenizer"):  # noqa: F821
    if not isinstance(prompt, List):
        prompts = [prompt]
    else:
        prompts = prompt

    prompts = [_maybe_convert_prompt(p, tokenizer) for p in prompts]

    if not isinstance(prompt, List):
        return prompts[0]

    return prompts


def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"):  # noqa: F821
    tokens = tokenizer.tokenize(prompt)
    unique_tokens = set(tokens)
    for token in unique_tokens:
        if token in tokenizer.added_tokens_encoder:
            replacement = token
            i = 1
            while f"{token}_{i}" in tokenizer.added_tokens_encoder:
                replacement += f" {token}_{i}"
                i += 1

            prompt = prompt.replace(token, replacement)

    return prompt


def get_embeds(prompt="Portrait of a pretty girl", tokenizer_dir="./models/tokenizer", text_encoder_dir="./models/text_encoder"):
    tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
    text_encoder = CLIPTextModel.from_pretrained(text_encoder_dir,
                                                 torch_dtype=torch.float32,
                                                 variant="fp16")
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)

    prompt_embeds_npy = prompt_embeds[0].detach().numpy()
    return prompt_embeds_npy


def get_alphas_cumprod():
    betas = torch.linspace(0.00085 ** 0.5, 0.012 ** 0.5, 1000, dtype=torch.float32) ** 2
    alphas = 1.0 - betas
    alphas_cumprod = torch.cumprod(alphas, dim=0).detach().numpy()
    final_alphas_cumprod = alphas_cumprod[0]
    self_timesteps = np.arange(0, 1000)[::-1].copy().astype(np.int64)
    return alphas_cumprod, final_alphas_cumprod, self_timesteps


if __name__ == '__main__':
    args = get_args()
    prompt = args.prompt
    tokenizer_dir = args.text_model_dir + 'tokenizer'
    text_encoder_dir = args.text_model_dir + 'text_encoder'
    unet_model = args.unet_model
    vae_decoder_model = args.vae_decoder_model
    time_input = args.time_input
    save_dir = args.save_dir

    print(f"prompt: {prompt}")
    print(f"text_tokenizer: {tokenizer_dir}")
    print(f"text_encoder: {text_encoder_dir}")
    print(f"unet_model: {unet_model}")
    print(f"vae_decoder_model: {vae_decoder_model}")
    print(f"time_input: {time_input}")
    print(f"save_dir: {save_dir}")

    timesteps = np.array([999, 759, 499, 259]).astype(np.int64)

    # text encoder
    start = time.time()
    # prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
    prompt_embeds_npy = get_embeds(prompt, tokenizer_dir, text_encoder_dir)
    print(f"text encoder take {(1000 * (time.time() - start)):.1f}ms")

    prompt_name = prompt.replace(" ", "_")
    latents_shape = [1, 4, 64, 64]
    latent = torch.randn(latents_shape, generator=None, device="cpu", dtype=torch.float32,
                         layout=torch.strided).detach().numpy()

    alphas_cumprod, final_alphas_cumprod, self_timesteps = get_alphas_cumprod()

    # load unet model and vae model
    start = time.time()
    unet_session_main = onnxruntime.InferenceSession(unet_model)
    vae_decoder = onnxruntime.InferenceSession(vae_decoder_model)
    print(f"load models take {(1000 * (time.time() - start)):.1f}ms")

    # load time input file
    time_input = np.load(time_input)

    # unet inference loop
    unet_loop_start = time.time()
    for i, timestep in enumerate(timesteps):
        # print(i, timestep)

        unet_start = time.time()
        noise_pred = unet_session_main.run(None, {"sample": latent, \
            "/down_blocks.0/resnets.0/act_1/Mul_output_0": np.expand_dims(time_input[i], axis=0), \
            "encoder_hidden_states": prompt_embeds_npy})[0]

        print(f"unet once take {(1000 * (time.time() - unet_start)):.1f}ms")

        sample = latent
        model_output = noise_pred
        if i < 3:
            prev_timestep = timesteps[i + 1]
        else:
            prev_timestep = timestep

        alpha_prod_t = alphas_cumprod[timestep]
        alpha_prod_t_prev = alphas_cumprod[prev_timestep] if prev_timestep >= 0 else final_alphas_cumprod

        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        # 3. Get scalings for boundary conditions
        scaled_timestep = timestep * 10
        c_skip = 0.5 ** 2 / (scaled_timestep ** 2 + 0.5 ** 2)
        c_out = scaled_timestep / (scaled_timestep ** 2 + 0.5 ** 2) ** 0.5
        predicted_original_sample = (sample - (beta_prod_t ** 0.5) * model_output) / (alpha_prod_t ** 0.5)

        denoised = c_out * predicted_original_sample + c_skip * sample

        if i != 3:
            noise = torch.randn(model_output.shape, generator=None, device="cpu", dtype=torch.float32,
                                layout=torch.strided).to("cpu").detach().numpy()
            prev_sample = (alpha_prod_t_prev ** 0.5) * denoised + (beta_prod_t_prev ** 0.5) * noise
        else:
            prev_sample = denoised

        latent = prev_sample

    print(f"unet loop take {(1000 * (time.time() - unet_loop_start)):.1f}ms")

    # vae inference
    vae_start = time.time()
    latent = latent / 0.18215
    image = vae_decoder.run(None, {"x": latent})[0]
    print(f"vae inference take {(1000 * (time.time() - vae_start)):.1f}ms")

    # save result
    save_start = time.time()
    image = np.transpose(image, (0, 2, 3, 1)).squeeze(axis=0)
    image_denorm = np.clip(image / 2 + 0.5, 0, 1)
    image = (image_denorm * 255).round().astype("uint8")
    pil_image = Image.fromarray(image[:, :, :3])
    pil_image.save(save_dir)
    print(f"save image take {(1000 * (time.time() - save_start)):.1f}ms")
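
Both txt2img scripts bypass the UNet's usual scalar timestep input and instead feed a precomputed embedding at the internal tensor name "/down_blocks.0/resnets.0/act_1/Mul_output_0", indexed per step from time_input_txt2img.npy. A quick, optional sanity check of the exported model's expected inputs against that array; the paths are the script defaults and are assumptions here:

import numpy as np
import onnxruntime

# Inspect the inputs the exported unet.onnx actually expects.
session = onnxruntime.InferenceSession("./models/unet.onnx")
for inp in session.get_inputs():
    print(inp.name, inp.shape)  # expect "sample", the internal Mul tensor name, and "encoder_hidden_states"

# The loop indexes time_input[i] once per timestep in [999, 759, 499, 259],
# so the array should provide one embedding per step.
time_input = np.load("./models/time_input_txt2img.npy")
print("time_input:", time_input.shape, time_input.dtype)
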