Commit · c82ab6a
Parent(s): init
Files changed:
- README.md +13 -0
- app.py +160 -0
- assets/ref_man1.jpg +0 -0
- assets/ref_man2.jpg +0 -0
- assets/ref_man3.jpg +0 -0
- assets/ref_woman1.jpg +0 -0
- assets/ref_woman2.jpg +0 -0
- assets/ref_woman3.jpg +0 -0
- config.json +15 -0
- projection.py +161 -0
- requirements.txt +10 -0
- transformer_flux_custom.py +942 -0
README.md
ADDED
---
title: Flux Consistancy V2
emoji: 👁
colorFrom: gray
colorTo: green
sdk: gradio
sdk_version: 5.23.1
app_file: app.py
pinned: false
short_description: Generate images with texts and reference images
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
import os

import torch
import spaces
import safetensors.torch  # needed for safetensors.torch.load_file below
import gradio as gr
from PIL import Image
from loguru import logger
from torchvision import transforms
from huggingface_hub import hf_hub_download, login
from diffusers import FluxPipeline, FluxTransformer2DModel

from projection import ImageEncoder
from transformer_flux_custom import FluxTransformer2DModel as FluxTransformer2DModelWithIP


model_config = './config.json'
pretrained_model_name = 'black-forest-labs/FLUX.1-dev'
adapter_path = 'model.safetensors'
adapter_repo_id = "ashen0209/Flux-Consistancy-v2"

conditioner_base_model = 'eva02_large_patch14_448.mim_in22k_ft_in1k'
conditioner_layer_num = 12
device = "cuda" if torch.cuda.is_available() else "cpu"
output_dim = 4096
logger.info(f"pretrained_model_name: {pretrained_model_name}, adapter_repo_id: {adapter_repo_id}, adapter_path: {adapter_path}, conditioner_layer: {conditioner_layer_num}, output_dim: {output_dim}, device: {device}")

logger.info("init model")
model = FluxTransformer2DModelWithIP.from_config(model_config, torch_dtype=torch.bfloat16)  # type: ignore
logger.info("load model")
copy = FluxTransformer2DModel.from_pretrained(pretrained_model_name, subfolder='transformer', torch_dtype=torch.bfloat16)
model.load_state_dict(copy.state_dict(), strict=False)
del copy

logger.info("load proj")
extra_embedder = ImageEncoder(output_dim, layer_num=conditioner_layer_num, seq_len=2, device=device, base_model=conditioner_base_model, use_pyramid=True).to(device=device, dtype=torch.bfloat16)

logger.info("load pipe")
pipe = FluxPipeline.from_pretrained(pretrained_model_name, transformer=model, torch_dtype=torch.bfloat16)
pipe.to(dtype=torch.bfloat16, device=device)

logger.info("download adapter")
login(token=os.environ['HF_TOKEN'])
file_path = hf_hub_download(repo_id=adapter_repo_id, filename=adapter_path)

logger.info("load adapter")
state_dict = safetensors.torch.load_file(file_path)
state_dict = {'.'.join(k.split('.')[1:]): state_dict[k] for k in state_dict.keys()}
diff = model.load_state_dict(state_dict, strict=False)
diff = extra_embedder.load_state_dict(state_dict, strict=False)


IMAGE_PROCESS_TRANSFORM = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4815, 0.4578, 0.4082], std=[0.2686, 0.2613, 0.276])
])


@spaces.GPU
def generate_image(ref_image, ref_image2=None, prompt="", height=512, width=512, num_steps=25, guidance_scale=3.5, seed=0, ip_scale=1.0):
    print(f"ref_image: {ref_image.size if ref_image is not None else None}, "
          f"ref_image2: {ref_image2.size if ref_image2 is not None else None}, "
          f"prompt: {prompt}, height: {height}, width: {width}, num_steps: {num_steps}, guidance_scale: {guidance_scale}, ip_scale: {ip_scale}")
    with torch.no_grad():
        image_refs = map(torch.stack, [
            [IMAGE_PROCESS_TRANSFORM(i) for i in [ref_image, ref_image2] if i is not None]
        ])
        image_refs = [i.to(dtype=torch.bfloat16, device='cuda') for i in image_refs]
        prompt_embeds, pooled_prompt_embeds, txt_ids = pipe.encode_prompt(prompt, prompt)
        visual_prompt_embeds = extra_embedder(image_refs)
        prompt_embeds_with_ref = torch.cat([prompt_embeds, visual_prompt_embeds], dim=1)
        pipe.transformer.ip_scale = ip_scale
        image = pipe(
            prompt_embeds=prompt_embeds_with_ref,
            pooled_prompt_embeds=pooled_prompt_embeds,
            # negative_prompt_embeds=negative_prompt_embeds,
            # negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            height=height,
            width=width,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
        ).images[0]
    return image


examples = [
    ["assets/ref_woman1.jpg", None, "A photo of the woman dancing in the desert, blue sky, cinematic studio photography of high-fidelity subject, natural lightning, insanely detailed and intricate.", 512, 768],
    ["assets/ref_man1.jpg", "assets/ref_woman1.jpg", "The man and woman are standing in a sunlit meadow. The man is taking photos of the woman as she poses with the bouquet", 512, 768],
    ["assets/ref_man2.jpg", "assets/ref_woman2.jpg", "The man and woman are standing next to a motorcycle on a deserted road. The woman is pointing at the map, while the man looks confused but intrigued.", 512, 768],
    ["assets/ref_man3.jpg", "assets/ref_woman3.jpg", "The man and woman are at a glamorous ballroom dance. The man is offering the woman a glass of champagne, while she fans herself gracefully.", 512, 768],
]

with gr.Blocks() as demo:
    # Top-level inputs that are always visible
    with gr.Row():
        gr.Markdown("""
        ## Character Consistency Image Generation based on Flux
        - The model can be downloaded at https://huggingface.co/ashen0209/Flux-Consistancy-v2
        - The model is good at generating consistent images of human characters and is capable of multi-subject generation, especially in realistic scenes
        """)

    with gr.Row():
        with gr.Column():
            with gr.Row():
                ref_image = gr.Image(type="pil", label="Upload Reference Subject Image", width=300)
                ref_image2 = gr.Image(type="pil", label="[Optional] complement image or additional image from a different category", width=200)
            description = gr.Textbox(lines=2, placeholder="Describe the desired contents", label="Description Text")
            generate_btn = gr.Button("Generate Image")

            # Advanced options hidden inside an accordion (click to expand)
            with gr.Accordion("Advanced Options", open=False):
                height_slider = gr.Slider(minimum=256, maximum=1024, value=512, step=64, label="Height")
                width_slider = gr.Slider(minimum=256, maximum=1024, value=512, step=64, label="Width")
                steps_slider = gr.Slider(minimum=20, maximum=50, value=25, step=1, label="Number of Steps")
                guidance_slider = gr.Slider(minimum=1.0, maximum=8.0, value=3.5, step=0.1, label="Guidance Scale")
                ref_scale_slider = gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.1, label="Reference Image Scale")

        with gr.Column():
            output = gr.Image(type="pil", label="Generated Image", )
            # with gr.Row():
            #     with gr.Group():
            #         with gr.Row(equal_height=True):
            #             with gr.Column(scale=1, min_width=50, ):
            #                 randomize_checkbox = gr.Checkbox(label="Randomize Seed", value=True)
            #             with gr.Column(scale=3, min_width=100):
            #                 seed_io = gr.Number(label="Seed (if not randomizing)", value=0, interactive=True, )

    with gr.Row():
        gr.Examples(
            label='Click on the following examples to load and try them',
            examples=examples,
            inputs=[ref_image, ref_image2, description, height_slider, width_slider],
            fn=generate_image,
            outputs=output,
            # example_labels=['Reference Subject', 'Additional Reference', 'Prompt', 'Height', 'Width'],
            cache_examples=True,
            cache_mode='lazy'
        )

    with gr.Row():
        gr.Markdown("""
        ### Tips:
        - Images with human subjects tend to perform better than other categories.
        - Images where the subject occupies most of the frame with a clean, uncluttered background yield improved results.
        - Including multiple subjects of the same category may cause blending issues.
        """)

    # When the button is clicked, pass all inputs to generate_image
    generate_btn.click(
        fn=generate_image,
        inputs=[ref_image, ref_image2, description, height_slider, width_slider, steps_slider, guidance_slider, ref_scale_slider],
        outputs=output,
    )


if __name__ == "__main__":
    demo.launch()
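As a quick aside (not part of the commit itself): the generate_image function defined in app.py above can also be exercised directly from a Python session once the same environment is in place. A minimal sketch, assuming the Space's dependencies are installed, a CUDA device and HF_TOKEN are available, and the bundled example assets are present; the path and prompt simply mirror the first entry of the examples list:

# Illustrative sketch only: calls generate_image() from app.py with one bundled reference image.
from PIL import Image

ref = Image.open("assets/ref_woman1.jpg").convert("RGB")

result = generate_image(
    ref_image=ref,
    ref_image2=None,        # optional second reference subject
    prompt="A photo of the woman dancing in the desert, blue sky",
    height=512,
    width=768,
    num_steps=25,
    guidance_scale=3.5,
    ip_scale=1.0,           # strength of the reference-image conditioning
)
result.save("output.png")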
assets/ref_man1.jpg
ADDED
assets/ref_man2.jpg
ADDED
assets/ref_man3.jpg
ADDED
assets/ref_woman1.jpg
ADDED
assets/ref_woman2.jpg
ADDED
assets/ref_woman3.jpg
ADDED
config.json
ADDED
{
  "_class_name": "FluxTransformer2DModel",
  "_diffusers_version": "0.30.0.dev0",
  "_name_or_path": "../checkpoints/flux-dev/transformer",
  "attention_head_dim": 128,
  "guidance_embeds": true,
  "in_channels": 64,
  "joint_attention_dim": 4096,
  "num_attention_heads": 24,
  "num_layers": 19,
  "num_single_layers": 38,
  "patch_size": 1,
  "pooled_projection_dim": 768
}
projection.py
ADDED
import torch
from torch import nn
from torch.nn import functional as F
from loguru import logger
# from prodigyopt import Prodigy
from torch.utils.checkpoint import checkpoint
from transformers import pipeline
# from sbp.nn.model_paths import MODEL_PATHS
# # from sbp.nn.torch.models.qformer import ModifiedQFormer


class ImageEncoder(nn.Module):

    def __init__(self, output_dim, base_model='eva02_base_patch14_224.mim_in22k', layer_num=6, seq_len=3, device='cpu', use_pe=False, use_pyramid=False, use_global_feature=False, use_qformer_dim=False):
        super().__init__()
        self.output_dim = output_dim
        import timm
        # paths = {
        #     'eva02_large_patch14_448.mim_in22k_ft_in1k': MODEL_PATHS.EVA02_LARGE_448_MIM_IN22K,
        #     'eva02_base_patch14_224.mim_in22k': MODEL_PATHS.EVA02_BASE_224_MIM_IN22K,
        # }
        if base_model == 'eva02_base_patch14_224.mim_in22k':
            self.img_seq = 257
        elif base_model == 'eva02_large_patch14_448.mim_in22k_ft_in1k':
            self.img_seq = 1025
        elif base_model == 'siglip2':
            self.img_seq = 1024
        else:
            raise ValueError(f"unknown base_model {base_model}, supported: ['eva02_base_patch14_224.mim_in22k', 'eva02_large_patch14_448.mim_in22k_ft_in1k', 'siglip2']")
        # self.base_model = timm.create_model(base_model, pretrained=True, pretrained_cfg_overlay={'file': paths[base_model], 'custom_load': False})
        self.base_model = timm.create_model(base_model, pretrained=False)
        del self.base_model.norm, self.base_model.fc_norm, self.base_model.head, self.base_model.head_drop
        del self.base_model.blocks[layer_num:]
        dim_mult = 3 if use_pyramid else 1
        image_output_dim = self.base_model.num_features * dim_mult
        self.seq_len = seq_len
        self.device = device
        self.use_pe = use_pe
        self.use_pyramid = use_pyramid
        self.use_global_feature = use_global_feature
        self.use_qformer = use_qformer_dim > 0
        if self.use_pe:
            self.pe = torch.zeros([1, self.seq_len * self.img_seq, self.output_dim], device=self.device, dtype=torch.bfloat16)
            for i in range(self.seq_len):
                self.pe[:, i * self.img_seq: (i + 1) * self.img_seq, i::self.seq_len] = 0.05
        if self.use_qformer:
            logger.info("image projection use qformer ...")
            self.qformer = ModifiedQFormer(
                input_dim=image_output_dim,
                hidden_dim=use_qformer_dim,
                num_heads=12,
                num_layers=6,
                output_dim=output_dim,
                num_queries=512,
                use_self_attention=False
            ).cuda()
        else:
            self.project = nn.Linear(image_output_dim, output_dim)
        self.final_norm = nn.LayerNorm(output_dim)

    def apply_feature_pyramid(self, original_tokens, original_grid_size=32, downsample=[1, 4, 32]):
        B, seq_len, D = original_tokens[0].shape
        H = W = original_grid_size

        token_lst = []
        for i, tokens in enumerate(original_tokens):
            downsample_size = downsample[i]
            if downsample_size == 0:
                pass
            elif downsample_size == 1:
                token_lst.append(tokens)
            else:
                head, tokens = torch.split(tokens, [1, 1024], dim=1)
                tokens_2d = tokens.view(B, H, W, D).permute(0, 3, 1, 2)  # Reshape tokens to 2D grid (B, D, H, W)
                pooled = F.avg_pool2d(tokens_2d, kernel_size=downsample_size, stride=downsample_size)  # (B, D, 32//ds, 32//ds)
                up = F.interpolate(pooled, size=(H, W), mode='nearest')  # (B, D, 32, 32)
                up = up.permute(0, 2, 3, 1).reshape(B, seq_len - 1, D)
                up = torch.cat([head, up], dim=1)
                token_lst.append(up / downsample_size ** 0.5)
        combined_tokens = torch.cat(token_lst, dim=2)
        return combined_tokens

    def apply_global_feature(self, original_tokens, original_grid_size=32, pool_size=4):
        B, seq_len, D = original_tokens.shape
        H = W = original_grid_size

        tokens_2d = original_tokens.view(B, H, W, D).permute(0, 3, 1, 2)
        pooled = F.avg_pool2d(tokens_2d, kernel_size=pool_size, stride=pool_size)  # (B, D, 8, 8)
        pooled = pooled.permute((0, 2, 3, 1)).reshape((B, seq_len // pool_size // pool_size, D))
        return pooled

    def forward(self, image_list):
        splits = [len(lst) for lst in image_list]
        if sum(splits) == 0:
            return torch.zeros([len(splits), self.seq_len * self.img_seq, self.output_dim], device=self.device, dtype=torch.bfloat16)
        x = torch.concat(image_list, dim=0).to(device=self.device, dtype=torch.bfloat16)
        x = self.base_model.patch_embed(x)
        x, rot_pos_embed = self.base_model._pos_embed(x)
        intermediates = []
        for i, blk in enumerate(self.base_model.blocks):
            x = blk(x, rope=rot_pos_embed)
            if i in [11]:
                intermediates.append(x)
        intermediates.append(x)
        if self.use_pyramid:
            x = self.apply_feature_pyramid(intermediates + [x])
        elif self.use_global_feature:
            x = self.apply_global_feature(x)

        if self.use_qformer:
            x = self.qformer(x)
        else:
            x = self.project(x)
        x = self.final_norm(x)

        b, seq_len, c = x.shape
        split_patches = torch.split(x, splits, dim=0)
        split_patches = [nn.functional.pad(sample, (0, 0, 0, 0, 0, self.seq_len - len(sample))) for sample in split_patches]
        x = torch.stack(split_patches, dim=0)
        x = x.reshape((len(splits), self.seq_len * seq_len, c))
        if self.use_pe:
            x = x + self.pe
        return x


class ImageEncoderWithSiglip(nn.Module):

    def __init__(self, output_dim, base_model="siglip2-so400m-patch16-512", layer_num=6, seq_len=3, device='cpu', use_pe=False):
        super().__init__()
        self.output_dim = output_dim
        ckpt = {
            'siglip-so400m-patch14-384': MODEL_PATHS.SIGLIP_SO400M_384,
            'siglip2-so400m-patch16-512': MODEL_PATHS.SIGLIP2_SO400M_512
        }[base_model]
        image_classifier = pipeline(model=ckpt, task="zero-shot-image-classification", device='cpu')
        logger.info(f"using {layer_num} / {len(image_classifier.model.vision_model.encoder.layers)} layers of {base_model} ... ")
        del image_classifier.model.vision_model.encoder.layers[layer_num:]
        num_features = image_classifier.model.vision_model.post_layernorm.normalized_shape[0]
        self.base_model = image_classifier.model.vision_model
        self.project = nn.Linear(num_features, output_dim)
        self.final_norm = nn.LayerNorm(output_dim)
        self.seq_len = seq_len
        self.device = device
        self.use_pe = use_pe

    def forward(self, image_list):
        splits = [len(lst) for lst in image_list]
        if sum(splits) == 0:
            return torch.zeros([len(splits), self.seq_len * self.img_seq, self.output_dim], device=self.device, dtype=torch.bfloat16)
        x = torch.concat(image_list, dim=0).to(device=self.device, dtype=torch.bfloat16)
        x = self.base_model(x).last_hidden_state
        x = self.project(x)
        x = self.final_norm(x)
        b, seq_len, c = x.shape
        split_patches = torch.split(x, splits, dim=0)
        split_patches = [nn.functional.pad(sample, (0, 0, 0, 0, 0, self.seq_len - len(sample))) for sample in split_patches]
        x = torch.stack(split_patches, dim=0)
        x = x.reshape((len(splits), self.seq_len * seq_len, c))
        if self.use_pe:
            x = x + self.pe
        return x
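An aside on the design (not part of the commit): apply_feature_pyramid above augments each patch token by concatenating, along the feature dimension, coarser copies that were average-pooled over the token grid and then broadcast back to the original resolution, so every token carries both local detail and wider context. A minimal standalone sketch of that pool-and-broadcast step, assuming a 32x32 patch grid with no class token and random placeholder features:

import torch
import torch.nn.functional as F

B, H, W, D = 1, 32, 32, 1024                                      # batch, grid size, feature dim
tokens = torch.randn(B, H * W, D)                                 # flattened patch tokens (B, 1024, D)

tokens_2d = tokens.view(B, H, W, D).permute(0, 3, 1, 2)           # (B, D, 32, 32) grid layout
pooled = F.avg_pool2d(tokens_2d, kernel_size=4, stride=4)         # (B, D, 8, 8) coarse summary
up = F.interpolate(pooled, size=(H, W), mode='nearest')           # broadcast back to (B, D, 32, 32)
coarse = up.permute(0, 2, 3, 1).reshape(B, H * W, D) / 4 ** 0.5   # rescale as apply_feature_pyramid does

pyramid = torch.cat([tokens, coarse], dim=2)                      # (B, 1024, 2 * D) combined tokens
print(pyramid.shape)                                              # torch.Size([1, 1024, 2048])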
requirements.txt
ADDED
accelerate
diffusers
invisible_watermark
torch
transformers
torchvision
timm
loguru
einops
sentencepiece
transformer_flux_custom.py
ADDED
| 1 |
+
# Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
from typing import Any, Dict, Optional, Tuple, Union
|
| 17 |
+
|
| 18 |
+
import numpy as np
|
| 19 |
+
import torch
|
| 20 |
+
import torch.nn as nn
|
| 21 |
+
import torch.nn.functional as F
|
| 22 |
+
from einops import rearrange
|
| 23 |
+
|
| 24 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
| 25 |
+
from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
|
| 26 |
+
from diffusers.models.attention import FeedForward
|
| 27 |
+
from diffusers.models.attention_processor import (
|
| 28 |
+
Attention,
|
| 29 |
+
AttentionProcessor,
|
| 30 |
+
FluxAttnProcessor2_0,
|
| 31 |
+
FluxAttnProcessor2_0_NPU,
|
| 32 |
+
FusedFluxAttnProcessor2_0,
|
| 33 |
+
)
|
| 34 |
+
from diffusers.models.modeling_utils import ModelMixin
|
| 35 |
+
from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
|
| 36 |
+
from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
|
| 37 |
+
from diffusers.utils.import_utils import is_torch_npu_available
|
| 38 |
+
from diffusers.utils.torch_utils import maybe_allow_in_graph
|
| 39 |
+
from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
|
| 40 |
+
from diffusers.models.modeling_outputs import Transformer2DModelOutput
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class FluxIPAttnProcessor2_0:
|
| 47 |
+
"""Attention processor used typically in processing the SD3-like self-attention projections."""
|
| 48 |
+
|
| 49 |
+
def __init__(self):
|
| 50 |
+
if not hasattr(F, "scaled_dot_product_attention"):
|
| 51 |
+
raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
|
| 52 |
+
|
| 53 |
+
def __call__(
|
| 54 |
+
self,
|
| 55 |
+
attn: Attention,
|
| 56 |
+
hidden_states: torch.FloatTensor,
|
| 57 |
+
encoder_hidden_states: torch.FloatTensor = None,
|
| 58 |
+
attention_mask: Optional[torch.FloatTensor] = None,
|
| 59 |
+
image_rotary_emb: Optional[torch.Tensor] = None,
|
| 60 |
+
) -> torch.FloatTensor:
|
| 61 |
+
batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
| 62 |
+
|
| 63 |
+
# `sample` projections.
|
| 64 |
+
query = attn.to_q(hidden_states)
|
| 65 |
+
key = attn.to_k(hidden_states)
|
| 66 |
+
value = attn.to_v(hidden_states)
|
| 67 |
+
|
| 68 |
+
inner_dim = key.shape[-1]
|
| 69 |
+
head_dim = inner_dim // attn.heads
|
| 70 |
+
|
| 71 |
+
query = img_q = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 72 |
+
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 73 |
+
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 74 |
+
|
| 75 |
+
if attn.norm_q is not None:
|
| 76 |
+
query = attn.norm_q(query)
|
| 77 |
+
if attn.norm_k is not None:
|
| 78 |
+
key = attn.norm_k(key)
|
| 79 |
+
|
| 80 |
+
# the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
|
| 81 |
+
if encoder_hidden_states is not None:
|
| 82 |
+
# `context` projections.
|
| 83 |
+
encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
|
| 84 |
+
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
|
| 85 |
+
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
|
| 86 |
+
|
| 87 |
+
encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
|
| 88 |
+
batch_size, -1, attn.heads, head_dim
|
| 89 |
+
).transpose(1, 2)
|
| 90 |
+
encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
|
| 91 |
+
batch_size, -1, attn.heads, head_dim
|
| 92 |
+
).transpose(1, 2)
|
| 93 |
+
encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
|
| 94 |
+
batch_size, -1, attn.heads, head_dim
|
| 95 |
+
).transpose(1, 2)
|
| 96 |
+
|
| 97 |
+
if attn.norm_added_q is not None:
|
| 98 |
+
encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
|
| 99 |
+
if attn.norm_added_k is not None:
|
| 100 |
+
encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
|
| 101 |
+
|
| 102 |
+
# attention
|
| 103 |
+
query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
|
| 104 |
+
key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
|
| 105 |
+
value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
|
| 106 |
+
|
| 107 |
+
if image_rotary_emb is not None:
|
| 108 |
+
from diffusers.models.embeddings import apply_rotary_emb
|
| 109 |
+
query = apply_rotary_emb(query, image_rotary_emb)
|
| 110 |
+
key = apply_rotary_emb(key, image_rotary_emb)
|
| 111 |
+
|
| 112 |
+
hidden_states = F.scaled_dot_product_attention(
|
| 113 |
+
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
|
| 114 |
+
)
|
| 115 |
+
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
| 116 |
+
hidden_states = hidden_states.to(query.dtype)
|
| 117 |
+
|
| 118 |
+
if encoder_hidden_states is not None:
|
| 119 |
+
encoder_hidden_states, hidden_states = (
|
| 120 |
+
hidden_states[:, : encoder_hidden_states.shape[1]],
|
| 121 |
+
hidden_states[:, encoder_hidden_states.shape[1] :],
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
# linear proj
|
| 125 |
+
hidden_states = attn.to_out[0](hidden_states)
|
| 126 |
+
# dropout
|
| 127 |
+
hidden_states = attn.to_out[1](hidden_states)
|
| 128 |
+
|
| 129 |
+
encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
|
| 130 |
+
|
| 131 |
+
return hidden_states, encoder_hidden_states, img_q
|
| 132 |
+
else:
|
| 133 |
+
return hidden_states, img_q
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
@maybe_allow_in_graph
|
| 137 |
+
class FluxSingleTransformerBlock(nn.Module):
|
| 138 |
+
r"""
|
| 139 |
+
A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
|
| 140 |
+
|
| 141 |
+
Reference: https://arxiv.org/abs/2403.03206
|
| 142 |
+
|
| 143 |
+
Parameters:
|
| 144 |
+
dim (`int`): The number of channels in the input and output.
|
| 145 |
+
num_attention_heads (`int`): The number of heads to use for multi-head attention.
|
| 146 |
+
attention_head_dim (`int`): The number of channels in each head.
|
| 147 |
+
context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
|
| 148 |
+
processing of `context` conditions.
|
| 149 |
+
"""
|
| 150 |
+
|
| 151 |
+
def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
|
| 152 |
+
super().__init__()
|
| 153 |
+
self.mlp_hidden_dim = int(dim * mlp_ratio)
|
| 154 |
+
|
| 155 |
+
self.norm = AdaLayerNormZeroSingle(dim)
|
| 156 |
+
self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
|
| 157 |
+
self.act_mlp = nn.GELU(approximate="tanh")
|
| 158 |
+
self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
|
| 159 |
+
|
| 160 |
+
if is_torch_npu_available():
|
| 161 |
+
processor = FluxAttnProcessor2_0_NPU()
|
| 162 |
+
else:
|
| 163 |
+
processor = FluxAttnProcessor2_0()
|
| 164 |
+
self.attn = Attention(
|
| 165 |
+
query_dim=dim,
|
| 166 |
+
cross_attention_dim=None,
|
| 167 |
+
dim_head=attention_head_dim,
|
| 168 |
+
heads=num_attention_heads,
|
| 169 |
+
out_dim=dim,
|
| 170 |
+
bias=True,
|
| 171 |
+
processor=processor,
|
| 172 |
+
qk_norm="rms_norm",
|
| 173 |
+
eps=1e-6,
|
| 174 |
+
pre_only=True,
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
def forward(
|
| 178 |
+
self,
|
| 179 |
+
hidden_states: torch.FloatTensor,
|
| 180 |
+
temb: torch.FloatTensor,
|
| 181 |
+
image_rotary_emb=None,
|
| 182 |
+
joint_attention_kwargs=None,
|
| 183 |
+
):
|
| 184 |
+
residual = hidden_states
|
| 185 |
+
norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
|
| 186 |
+
mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
|
| 187 |
+
joint_attention_kwargs = joint_attention_kwargs or {}
|
| 188 |
+
attn_output = self.attn(
|
| 189 |
+
hidden_states=norm_hidden_states,
|
| 190 |
+
image_rotary_emb=image_rotary_emb,
|
| 191 |
+
**joint_attention_kwargs,
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
|
| 195 |
+
gate = gate.unsqueeze(1)
|
| 196 |
+
hidden_states = gate * self.proj_out(hidden_states)
|
| 197 |
+
hidden_states = residual + hidden_states
|
| 198 |
+
if hidden_states.dtype == torch.float16:
|
| 199 |
+
hidden_states = hidden_states.clip(-65504, 65504)
|
| 200 |
+
|
| 201 |
+
return hidden_states
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
@maybe_allow_in_graph
|
| 205 |
+
class FluxTransformerBlock(nn.Module):
|
| 206 |
+
r"""
|
| 207 |
+
A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
|
| 208 |
+
|
| 209 |
+
Reference: https://arxiv.org/abs/2403.03206
|
| 210 |
+
|
| 211 |
+
Parameters:
|
| 212 |
+
dim (`int`): The number of channels in the input and output.
|
| 213 |
+
num_attention_heads (`int`): The number of heads to use for multi-head attention.
|
| 214 |
+
attention_head_dim (`int`): The number of channels in each head.
|
| 215 |
+
context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
|
| 216 |
+
processing of `context` conditions.
|
| 217 |
+
"""
|
| 218 |
+
|
| 219 |
+
def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6):
|
| 220 |
+
super().__init__()
|
| 221 |
+
|
| 222 |
+
self.norm1 = AdaLayerNormZero(dim)
|
| 223 |
+
|
| 224 |
+
self.norm1_context = AdaLayerNormZero(dim)
|
| 225 |
+
|
| 226 |
+
if hasattr(F, "scaled_dot_product_attention"):
|
| 227 |
+
processor = FluxAttnProcessor2_0()
|
| 228 |
+
else:
|
| 229 |
+
raise ValueError(
|
| 230 |
+
"The current PyTorch version does not support the `scaled_dot_product_attention` function."
|
| 231 |
+
)
|
| 232 |
+
self.attn = Attention(
|
| 233 |
+
query_dim=dim,
|
| 234 |
+
cross_attention_dim=None,
|
| 235 |
+
added_kv_proj_dim=dim,
|
| 236 |
+
dim_head=attention_head_dim,
|
| 237 |
+
heads=num_attention_heads,
|
| 238 |
+
out_dim=dim,
|
| 239 |
+
context_pre_only=False,
|
| 240 |
+
bias=True,
|
| 241 |
+
processor=processor,
|
| 242 |
+
qk_norm=qk_norm,
|
| 243 |
+
eps=eps,
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
| 247 |
+
self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
|
| 248 |
+
|
| 249 |
+
self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
| 250 |
+
self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
|
| 251 |
+
|
| 252 |
+
# let chunk size default to None
|
| 253 |
+
self._chunk_size = None
|
| 254 |
+
self._chunk_dim = 0
|
| 255 |
+
|
| 256 |
+
def forward(
|
| 257 |
+
self,
|
| 258 |
+
hidden_states: torch.FloatTensor,
|
| 259 |
+
encoder_hidden_states: torch.FloatTensor,
|
| 260 |
+
temb: torch.FloatTensor,
|
| 261 |
+
image_rotary_emb=None,
|
| 262 |
+
joint_attention_kwargs=None,
|
| 263 |
+
):
|
| 264 |
+
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
|
| 265 |
+
|
| 266 |
+
norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
|
| 267 |
+
encoder_hidden_states, emb=temb
|
| 268 |
+
)
|
| 269 |
+
joint_attention_kwargs = joint_attention_kwargs or {}
|
| 270 |
+
# Attention.
|
| 271 |
+
attn_output, context_attn_output = self.attn(
|
| 272 |
+
hidden_states=norm_hidden_states,
|
| 273 |
+
encoder_hidden_states=norm_encoder_hidden_states,
|
| 274 |
+
image_rotary_emb=image_rotary_emb,
|
| 275 |
+
**joint_attention_kwargs,
|
| 276 |
+
)
|
| 277 |
+
|
| 278 |
+
# Process attention outputs for the `hidden_states`.
|
| 279 |
+
attn_output = gate_msa.unsqueeze(1) * attn_output
|
| 280 |
+
hidden_states = hidden_states + attn_output
|
| 281 |
+
|
| 282 |
+
norm_hidden_states = self.norm2(hidden_states)
|
| 283 |
+
norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
|
| 284 |
+
|
| 285 |
+
ff_output = self.ff(norm_hidden_states)
|
| 286 |
+
ff_output = gate_mlp.unsqueeze(1) * ff_output
|
| 287 |
+
|
| 288 |
+
hidden_states = hidden_states + ff_output
|
| 289 |
+
|
| 290 |
+
# Process attention outputs for the `encoder_hidden_states`.
|
| 291 |
+
|
| 292 |
+
context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
|
| 293 |
+
encoder_hidden_states = encoder_hidden_states + context_attn_output
|
| 294 |
+
|
| 295 |
+
norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
|
| 296 |
+
norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
|
| 297 |
+
|
| 298 |
+
context_ff_output = self.ff_context(norm_encoder_hidden_states)
|
| 299 |
+
encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
|
| 300 |
+
if encoder_hidden_states.dtype == torch.float16:
|
| 301 |
+
encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
|
| 302 |
+
|
| 303 |
+
return encoder_hidden_states, hidden_states
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
@maybe_allow_in_graph
|
| 307 |
+
class FluxTransformerIPBlock(nn.Module):
|
| 308 |
+
def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6, ip_dim=3072):
|
| 309 |
+
super().__init__()
|
| 310 |
+
|
| 311 |
+
self.norm1 = AdaLayerNormZero(dim)
|
| 312 |
+
|
| 313 |
+
self.norm1_context = AdaLayerNormZero(dim)
|
| 314 |
+
|
| 315 |
+
if hasattr(F, "scaled_dot_product_attention"):
|
| 316 |
+
processor = FluxIPAttnProcessor2_0()
|
| 317 |
+
else:
|
| 318 |
+
raise ValueError(
|
| 319 |
+
"The current PyTorch version does not support the `scaled_dot_product_attention` function."
|
| 320 |
+
)
|
| 321 |
+
self.attn = Attention(
|
| 322 |
+
query_dim=dim,
|
| 323 |
+
cross_attention_dim=None,
|
| 324 |
+
added_kv_proj_dim=dim,
|
| 325 |
+
dim_head=attention_head_dim,
|
| 326 |
+
heads=num_attention_heads,
|
| 327 |
+
out_dim=dim,
|
| 328 |
+
context_pre_only=False,
|
| 329 |
+
bias=True,
|
| 330 |
+
processor=processor,
|
| 331 |
+
qk_norm=qk_norm,
|
| 332 |
+
eps=eps,
|
| 333 |
+
)
|
| 334 |
+
self.ip_k_proj = nn.Linear(ip_dim, num_attention_heads * attention_head_dim, bias=True)
|
| 335 |
+
self.ip_v_proj = nn.Linear(ip_dim, num_attention_heads * attention_head_dim, bias=True)
|
| 336 |
+
self.ip_dim = ip_dim
|
| 337 |
+
self.num_heads = num_attention_heads
|
| 338 |
+
self.head_dim = attention_head_dim
|
| 339 |
+
nn.init.zeros_(self.ip_v_proj.weight)
|
| 340 |
+
nn.init.zeros_(self.ip_v_proj.bias)
|
| 341 |
+
|
| 342 |
+
self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
| 343 |
+
self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
|
| 344 |
+
|
| 345 |
+
self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
| 346 |
+
self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
|
| 347 |
+
|
| 348 |
+
# let chunk size default to None
|
| 349 |
+
self._chunk_size = None
|
| 350 |
+
self._chunk_dim = 0
|
| 351 |
+
|
| 352 |
+
def forward(
|
| 353 |
+
self,
|
| 354 |
+
hidden_states: torch.FloatTensor,
|
| 355 |
+
encoder_hidden_states: torch.FloatTensor,
|
| 356 |
+
temb: torch.FloatTensor,
|
| 357 |
+
image_rotary_emb=None,
|
| 358 |
+
joint_attention_kwargs=None,
|
| 359 |
+
image_proj=None,
|
| 360 |
+
ip_scale = 1.0,
|
| 361 |
+
return_mask = False
|
| 362 |
+
):
|
| 363 |
+
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
|
| 364 |
+
|
| 365 |
+
norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
|
| 366 |
+
encoder_hidden_states, emb=temb
|
| 367 |
+
)
|
| 368 |
+
joint_attention_kwargs = joint_attention_kwargs or {}
|
| 369 |
+
# Attention.
|
| 370 |
+
attn_output, context_attn_output, img_q = self.attn(
|
| 371 |
+
hidden_states=norm_hidden_states,
|
| 372 |
+
encoder_hidden_states=norm_encoder_hidden_states,
|
| 373 |
+
image_rotary_emb=image_rotary_emb,
|
| 374 |
+
**joint_attention_kwargs,
|
| 375 |
+
)
|
| 376 |
+
|
| 377 |
+
# Process attention outputs for the `hidden_states`.
|
| 378 |
+
attn_output = gate_msa.unsqueeze(1) * attn_output
|
| 379 |
+
hidden_states = hidden_states + attn_output
|
| 380 |
+
|
| 381 |
+
norm_hidden_states = self.norm2(hidden_states)
|
| 382 |
+
norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
|
| 383 |
+
|
| 384 |
+
ff_output = self.ff(norm_hidden_states)
|
| 385 |
+
ff_output = gate_mlp.unsqueeze(1) * ff_output
|
| 386 |
+
|
| 387 |
+
hidden_states = hidden_states + ff_output
|
| 388 |
+
|
| 389 |
+
# Process attention outputs for the `encoder_hidden_states`.
|
| 390 |
+
|
| 391 |
+
context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
|
| 392 |
+
encoder_hidden_states = encoder_hidden_states + context_attn_output
|
| 393 |
+
|
| 394 |
+
norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
|
| 395 |
+
norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
|
| 396 |
+
|
| 397 |
+
context_ff_output = self.ff_context(norm_encoder_hidden_states)
|
| 398 |
+
encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
|
| 399 |
+
if encoder_hidden_states.dtype == torch.float16:
|
| 400 |
+
encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
|
| 401 |
+
|
| 402 |
+
if image_proj is not None and ip_scale != 0:
|
| 403 |
+
ip_q = img_q
|
| 404 |
+
# image_proj = encoder_hidden_states[:, -512:, :]
|
| 405 |
+
# print("image_proj:", image_proj.shape, "encoder_hidden_states:", encoder_hidden_states.shape)
|
| 406 |
+
ip_k = self.ip_k_proj(image_proj)
|
| 407 |
+
ip_v = self.ip_v_proj(image_proj)
|
| 408 |
+
ip_k = rearrange(ip_k, 'B L (H D) -> B H L D', H=self.num_heads, D=self.head_dim)
|
| 409 |
+
ip_v = rearrange(ip_v, 'B L (H D) -> B H L D', H=self.num_heads, D=self.head_dim)
|
| 410 |
+
# print("qkv shape:", ip_q.shape, ip_k.shape, ip_v.shape)
|
| 411 |
+
ip_attention = F.scaled_dot_product_attention(ip_q, ip_k, ip_v, attn_mask=None, dropout_p=0.0, is_causal=False)
|
| 412 |
+
ip_attention = rearrange(ip_attention, 'B H L D -> B L (H D)', H=self.num_heads, D=self.head_dim)
|
| 413 |
+
hidden_states = hidden_states + ip_scale * ip_attention
|
| 414 |
+
|
| 415 |
+
if return_mask and image_proj is not None:
|
| 416 |
+
query_mask = compute_attention_mask_on_query(ip_q, ip_k)
|
| 417 |
+
return encoder_hidden_states, hidden_states, query_mask
|
| 418 |
+
else:
|
| 419 |
+
return encoder_hidden_states, hidden_states
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
def compute_attention_mask_on_query(q, k, split=2):
|
| 423 |
+
L, S = q.size(-2), k.size(-2)
|
| 424 |
+
splits = [S//split for _ in range(split)]
|
| 425 |
+
scale_factor = 1 / math.sqrt(q.size(-1))
|
| 426 |
+
attn_weight = q @ k.transpose(-2, -1) * scale_factor
|
| 427 |
+
|
| 428 |
+
attn_weight = torch.softmax(attn_weight, dim=(1, 3)).mean(dim-1) # bhqk -> bq
|
| 429 |
+
return attn_weight
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
@maybe_allow_in_graph
|
| 435 |
+
class FluxSingleTransformerIPBlock(nn.Module):
|
| 436 |
+
r"""
|
| 437 |
+
A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
|
| 438 |
+
|
| 439 |
+
Reference: https://arxiv.org/abs/2403.03206
|
| 440 |
+
|
| 441 |
+
Parameters:
|
| 442 |
+
dim (`int`): The number of channels in the input and output.
|
| 443 |
+
num_attention_heads (`int`): The number of heads to use for multi-head attention.
|
| 444 |
+
attention_head_dim (`int`): The number of channels in each head.
|
| 445 |
+
context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
|
| 446 |
+
processing of `context` conditions.
|
| 447 |
+
"""
|
| 448 |
+
|
| 449 |
+
def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0, ip_dim=4096):
|
| 450 |
+
super().__init__()
|
| 451 |
+
self.mlp_hidden_dim = int(dim * mlp_ratio)
|
| 452 |
+
|
| 453 |
+
self.norm = AdaLayerNormZeroSingle(dim)
|
| 454 |
+
self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
|
| 455 |
+
self.act_mlp = nn.GELU(approximate="tanh")
|
| 456 |
+
self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
|
| 457 |
+
|
| 458 |
+
if is_torch_npu_available():
|
| 459 |
+
processor = FluxAttnProcessor2_0_NPU()
|
| 460 |
+
else:
|
| 461 |
+
processor = FluxIPAttnProcessor2_0()
|
| 462 |
+
self.attn = Attention(
|
| 463 |
+
query_dim=dim,
|
| 464 |
+
cross_attention_dim=None,
|
| 465 |
+
dim_head=attention_head_dim,
|
| 466 |
+
heads=num_attention_heads,
|
| 467 |
+
out_dim=dim,
|
| 468 |
+
bias=True,
|
| 469 |
+
processor=processor,
|
| 470 |
+
qk_norm="rms_norm",
|
| 471 |
+
eps=1e-6,
|
| 472 |
+
pre_only=True,
|
| 473 |
+
)
|
| 474 |
+
self.ip_k_proj = nn.Linear(ip_dim, num_attention_heads * attention_head_dim, bias=True)
|
| 475 |
+
self.ip_v_proj = nn.Linear(ip_dim, num_attention_heads * attention_head_dim, bias=True)
|
| 476 |
+
nn.init.zeros_(self.ip_v_proj.weight)
|
| 477 |
+
nn.init.zeros_(self.ip_v_proj.bias)
|
| 478 |
+
self.ip_dim = ip_dim
|
| 479 |
+
self.num_heads = num_attention_heads
|
| 480 |
+
self.head_dim = attention_head_dim
|
| 481 |
+
|
| 482 |
+
def forward(
|
| 483 |
+
self,
|
| 484 |
+
hidden_states: torch.FloatTensor,
|
| 485 |
+
temb: torch.FloatTensor,
|
| 486 |
+
image_rotary_emb=None,
|
| 487 |
+
joint_attention_kwargs=None,
|
| 488 |
+
image_proj=None,
|
| 489 |
+
ip_scale=1.0,
|
| 490 |
+
return_mask=False
|
| 491 |
+
):
|
| 492 |
+
residual = hidden_states
|
| 493 |
+
norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
|
| 494 |
+
mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
|
| 495 |
+
joint_attention_kwargs = joint_attention_kwargs or {}
|
| 496 |
+
attn_output, img_q = self.attn(
|
| 497 |
+
hidden_states=norm_hidden_states,
|
| 498 |
+
image_rotary_emb=image_rotary_emb,
|
| 499 |
+
**joint_attention_kwargs,
|
| 500 |
+
)
|
| 501 |
+
|
| 502 |
+
hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
|
| 503 |
+
gate = gate.unsqueeze(1)
|
| 504 |
+
hidden_states = gate * self.proj_out(hidden_states)
|
| 505 |
+
hidden_states = residual + hidden_states
|
| 506 |
+
if hidden_states.dtype == torch.float16:
|
| 507 |
+
hidden_states = hidden_states.clip(-65504, 65504)
|
| 508 |
+
|
| 509 |
+
ip_q = img_q
|
| 510 |
+
if image_proj is not None and ip_scale != 0:
|
| 511 |
+
# image_proj = encoder_hidden_states[:, -512:, :]
|
| 512 |
+
ip_k = self.ip_k_proj(image_proj)
|
| 513 |
+
ip_v = self.ip_v_proj(image_proj)
|
| 514 |
+
ip_k = rearrange(ip_k, 'B L (H D) -> B H L D', H=self.num_heads, D=self.head_dim)
|
| 515 |
+
ip_v = rearrange(ip_v, 'B L (H D) -> B H L D', H=self.num_heads, D=self.head_dim)
|
| 516 |
+
# print("qkv shape:", ip_q.shape, ip_k.shape, ip_v.shape)
|
| 517 |
+
ip_attention = F.scaled_dot_product_attention(ip_q, ip_k, ip_v, attn_mask=None, dropout_p=0.0, is_causal=False)
|
| 518 |
+
ip_attention = rearrange(ip_attention, 'B H L D -> B L (H D)', H=self.num_heads, D=self.head_dim)
|
| 519 |
+
hidden_states = hidden_states + ip_scale * ip_attention
|
| 520 |
+
if return_mask and image_proj is not None:
|
| 521 |
+
query_mask = compute_attention_mask_on_query(ip_q, ip_k)
|
| 522 |
+
return hidden_states, query_mask
|
| 523 |
+
else:
|
| 524 |
+
return hidden_states
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
class FluxTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
|
| 528 |
+
"""
|
| 529 |
+
The Transformer model introduced in Flux.
|
| 530 |
+
|
| 531 |
+
Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
|
| 532 |
+
|
| 533 |
+
Parameters:
|
| 534 |
+
patch_size (`int`): Patch size to turn the input data into small patches.
|
| 535 |
+
in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
|
| 536 |
+
num_layers (`int`, *optional*, defaults to 18): The number of layers of MMDiT blocks to use.
|
| 537 |
+
num_single_layers (`int`, *optional*, defaults to 18): The number of layers of single DiT blocks to use.
|
| 538 |
+
attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
|
| 539 |
+
num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
|
| 540 |
+
joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
|
| 541 |
+
pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
|
| 542 |
+
guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
|
| 543 |
+
"""
|
| 544 |
+
|
| 545 |
+
_supports_gradient_checkpointing = True
|
| 546 |
+
_no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
|
| 547 |
+
|
| 548 |
+
@register_to_config
|
| 549 |
+
def __init__(
|
| 550 |
+
self,
|
| 551 |
+
patch_size: int = 1,
|
| 552 |
+
in_channels: int = 64,
|
| 553 |
+
out_channels: Optional[int] = None,
|
| 554 |
+
num_layers: int = 19,
|
| 555 |
+
num_single_layers: int = 38,
|
| 556 |
+
attention_head_dim: int = 128,
|
| 557 |
+
num_attention_heads: int = 24,
|
| 558 |
+
joint_attention_dim: int = 4096,
|
| 559 |
+
pooled_projection_dim: int = 768,
|
| 560 |
+
guidance_embeds: bool = False,
|
| 561 |
+
axes_dims_rope: Tuple[int] = (16, 56, 56),
|
| 562 |
+
ip_dim: int = 4096
|
| 563 |
+
):
|
| 564 |
+
super().__init__()
|
| 565 |
+
self.out_channels = out_channels or in_channels
|
| 566 |
+
self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
|
| 567 |
+
self.ip_scale = 1.0
|
| 568 |
+
self.addition_seq_len = 512
|
| 569 |
+
|
| 570 |
+
self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
|
| 571 |
+
|
| 572 |
+
text_time_guidance_cls = (
|
| 573 |
+
CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
|
| 574 |
+
)
|
| 575 |
+
self.time_text_embed = text_time_guidance_cls(
|
| 576 |
+
embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
|
| 577 |
+
)
|
| 578 |
+
|
| 579 |
+
self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
|
| 580 |
+
self.x_embedder = nn.Linear(self.config.in_channels, self.inner_dim)
|
| 581 |
+
|
| 582 |
+
self.transformer_blocks = nn.ModuleList(
|
| 583 |
+
[
|
| 584 |
+
FluxTransformerIPBlock(
|
| 585 |
+
dim=self.inner_dim,
|
| 586 |
+
num_attention_heads=self.config.num_attention_heads,
|
| 587 |
+
attention_head_dim=self.config.attention_head_dim,
|
| 588 |
+
ip_dim=ip_dim
|
| 589 |
+
)
|
| 590 |
+
for i in range(self.config.num_layers)
|
| 591 |
+
]
|
| 592 |
+
)
|
| 593 |
+
|
| 594 |
+
self.single_transformer_blocks = nn.ModuleList(
|
| 595 |
+
[
|
| 596 |
+
FluxSingleTransformerIPBlock(
|
| 597 |
+
dim=self.inner_dim,
|
| 598 |
+
num_attention_heads=self.config.num_attention_heads,
|
| 599 |
+
attention_head_dim=self.config.attention_head_dim,
|
| 600 |
+
ip_dim=ip_dim
|
| 601 |
+
)
|
| 602 |
+
for i in range(self.config.num_single_layers)
|
| 603 |
+
]
|
| 604 |
+
)
|
| 605 |
+
|
| 606 |
+
self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
|
| 607 |
+
self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
|
| 608 |
+
|
| 609 |
+
self.gradient_checkpointing = False
|
| 610 |
+
|
    @property
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by its weight name.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

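    # Minimal usage sketch for the two members above (illustrative only; `SomeProcessor` is a
    # placeholder for any attention-processor class, not something defined in this file):
    #     procs = model.attn_processors   # keys roughly of the form "transformer_blocks.0.attn.processor"
    #     model.set_attn_processor({name: SomeProcessor() for name in procs})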
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
    def fuse_qkv_projections(self):
        """
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        """
        self.original_attn_processors = None

        for _, attn_processor in self.attn_processors.items():
            if "Added" in str(attn_processor.__class__.__name__):
                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")

        self.original_attn_processors = self.attn_processors

        for module in self.modules():
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

        self.set_attn_processor(FusedFluxAttnProcessor2_0())

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        """
        if self.original_attn_processors is not None:
            self.set_attn_processor(self.original_attn_processors)

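    # The two methods above are meant to be used as a pair, e.g. (sketch):
    #     model.fuse_qkv_projections()    # switches every Attention module to FusedFluxAttnProcessor2_0
    #     ...                             # run inference with fused projections
    #     model.unfuse_qkv_projections()  # restores the processors captured in original_attn_processors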
    def _set_gradient_checkpointing(self, module, value=False):
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

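    # `_set_gradient_checkpointing` is the hook diffusers' ModelMixin uses when gradient checkpointing
    # is toggled from the outside, e.g. (sketch):
    #     model.enable_gradient_checkpointing()   # sets gradient_checkpointing = True on every block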
    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        image_proj_hidden_states: torch.Tensor = None,
        pooled_projections: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        img_ids: torch.Tensor = None,
        txt_ids: torch.Tensor = None,
        guidance: torch.Tensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_block_samples=None,
        controlnet_single_block_samples=None,
        return_dict: bool = True,
        controlnet_blocks_repeat: bool = False,
        ip_scale: float = 1.0,
        return_mask: bool = False
    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        The [`FluxTransformer2DModel`] forward method.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep (`torch.LongTensor`):
                Used to indicate denoising step.
            block_controlnet_hidden_states (`list` of `torch.Tensor`):
                A list of tensors that if specified are added to the residuals of transformer blocks.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
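        # Arguments beyond the stock signature, not covered by the docstring above (reading of this file,
        # not an official description): `image_proj_hidden_states` carries the projected reference-image
        # tokens when they are not already packed into `encoder_hidden_states`; `ip_scale` is passed
        # through to the IP blocks, where it presumably weights the image-prompt attention; `return_mask`
        # asks each block to also return its query mask, which is collected into `masks` below.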
        if joint_attention_kwargs is not None:
            joint_attention_kwargs = joint_attention_kwargs.copy()
            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )

        hidden_states = self.x_embedder(hidden_states)

        timestep = timestep.to(hidden_states.dtype) * 1000
        if guidance is not None:
            guidance = guidance.to(hidden_states.dtype) * 1000
        else:
            guidance = None

        temb = (
            self.time_text_embed(timestep, pooled_projections)
            if guidance is None
            else self.time_text_embed(timestep, guidance, pooled_projections)
        )
        _, _s, _ = encoder_hidden_states.shape
        if _s > 2048:
            _im_len = -2050
        elif _s > 1024:
            _im_len = -1025
        elif _s > 512:
            _im_len = -514
        else:
            _im_len = 0

        if _im_len != 0:
            image_proj = encoder_hidden_states[:, _im_len:, :]
            encoder_hidden_states = self.context_embedder(encoder_hidden_states[:, :_im_len, :])
            txt_ids = txt_ids[:_im_len, :]
        else:
            image_proj = image_proj_hidden_states
            encoder_hidden_states = self.context_embedder(encoder_hidden_states)

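        # The split above assumes the caller appended the projected image tokens to the end of
        # `encoder_hidden_states`; the fixed offsets (-514 / -1025 / -2050) appear to be the expected
        # image-token counts for the supported packing sizes. A sequence of 512 tokens or fewer is
        # treated as text-only, and the separately supplied `image_proj_hidden_states` is used instead.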
        if txt_ids.ndim == 3:
            logger.warning(
                "Passing `txt_ids` 3d torch.Tensor is deprecated."
                "Please remove the batch dimension and pass it as a 2d torch Tensor"
            )
            txt_ids = txt_ids[0]
        if img_ids.ndim == 3:
            logger.warning(
                "Passing `img_ids` 3d torch.Tensor is deprecated."
                "Please remove the batch dimension and pass it as a 2d torch Tensor"
            )
            img_ids = img_ids[0]

        ids = torch.cat((txt_ids, img_ids), dim=0)
        # print("id shape:", txt_ids.shape, img_ids.shape, ids.shape)
        image_rotary_emb = self.pos_embed(ids)

        masks = []
        ip_scale = ip_scale or self.ip_scale
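        # Note: because of the truthiness check, passing ip_scale=0.0 (or None) falls back to the
        # module-level default `self.ip_scale` rather than disabling the image prompt.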
        for index_block, block in enumerate(self.transformer_blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                results = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    encoder_hidden_states,
                    temb,
                    image_rotary_emb,
                    joint_attention_kwargs,
                    image_proj,
                    ip_scale,
                    return_mask,
                    **ckpt_kwargs
                )
            else:
                results = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
                    image_proj=image_proj,
                    ip_scale=ip_scale,
                    return_mask=return_mask
                )
            if return_mask:
                encoder_hidden_states, hidden_states, query_mask = results
                masks.append(query_mask)
            else:
                encoder_hidden_states, hidden_states = results

            # controlnet residual
            if controlnet_block_samples is not None:
                interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
                interval_control = int(np.ceil(interval_control))
                # For Xlabs ControlNet.
                if controlnet_blocks_repeat:
                    hidden_states = (
                        hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
                    )
                else:
                    hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]

        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

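        # From here on the text and image streams are processed jointly: the text tokens occupy the
        # first `encoder_hidden_states.shape[1]` positions of the concatenated sequence, which is why
        # they are sliced off again after the single-stream blocks below.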
        for index_block, block in enumerate(self.single_transformer_blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                results = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    temb,
                    image_rotary_emb,
                    joint_attention_kwargs,
                    image_proj,
                    ip_scale,
                    return_mask,
                    **ckpt_kwargs,
                )

            else:
                results = block(
                    hidden_states=hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
                    image_proj=image_proj,
                    ip_scale=ip_scale,
                    return_mask=return_mask
                )
            if return_mask:
                hidden_states, query_mask = results
                masks.append(query_mask)
            else:
                hidden_states = results

            # controlnet residual
            if controlnet_single_block_samples is not None:
                interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
                interval_control = int(np.ceil(interval_control))
                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
                    hidden_states[:, encoder_hidden_states.shape[1] :, ...]
                    + controlnet_single_block_samples[index_block // interval_control]
                )

        hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]

        hidden_states = self.norm_out(hidden_states, temb)
        output = self.proj_out(hidden_states)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
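
# A minimal call sketch for the customized forward (illustrative only; the shapes and the names
# `model`, `prompt_embeds`, `pooled_prompt_embeds`, `image_tokens`, and `t` are assumptions, not
# part of this file):
#     latents = torch.randn(1, 4096, 64, dtype=torch.bfloat16, device="cuda")   # packed 1024x1024 latents
#     img_ids = torch.zeros(4096, 3, device="cuda")                             # per-latent-token position ids
#     txt_ids = torch.zeros(512, 3, device="cuda")                              # per-text-token position ids
#     out = model(
#         hidden_states=latents,
#         encoder_hidden_states=prompt_embeds,          # (1, 512, 4096) T5 embeddings
#         image_proj_hidden_states=image_tokens,        # projected reference-image tokens
#         pooled_projections=pooled_prompt_embeds,      # (1, 768) pooled CLIP embedding
#         timestep=t.expand(1) / 1000,
#         guidance=torch.full((1,), 3.5, device="cuda"),
#         img_ids=img_ids,
#         txt_ids=txt_ids,
#         ip_scale=0.8,
#         return_dict=False,
#     )[0]
# Note that the `masks` collected when return_mask=True are accumulated per block but are not part of
# either return path in this excerpt.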