tuandunghcmut committed on
Commit
7ee2e81
·
verified ·
1 Parent(s): 2789936

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/attention.py +435 -0
  2. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/motion_module.py +330 -0
  3. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/mutual_self_attention.py +321 -0
  4. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_2d.py +363 -0
  5. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_2d_blocks.py +1031 -0
  6. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_2d_condition.py +1189 -0
  7. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d_blocks.py +739 -0
  8. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_temporal.py +173 -0
  9. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/noise_conditions.py +123 -0
  10. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/samplers.py +79 -0
  11. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/schedulers.py +229 -0
  12. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +925 -0
  13. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/animatediff/pipeline_animatediff.py +657 -0
  14. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py +60 -0
  15. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py +553 -0
  16. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +337 -0
  17. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/__init__.py +113 -0
  18. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/multicontrolnet.py +191 -0
  19. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet.py +1159 -0
  20. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +400 -0
  21. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1116 -0
  22. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +1428 -0
  23. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +1579 -0
  24. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1398 -0
  25. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_fastdeploy_stable_diffusion_controlnet.py +33 -0
  26. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_paddleinfer_stable_diffusion_controlnet.py +33 -0
  27. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/ddim/__init__.py +31 -0
  28. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py +153 -0
  29. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py +255 -0
  30. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/hotshot_xl/__init__.py +29 -0
  31. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/hotshot_xl/hotshot_xl_controlnet_pipeline.py +1067 -0
  32. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py +73 -0
  33. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +787 -0
  34. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_largedit.py +362 -0
  35. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +201 -0
  36. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_uvit.py +477 -0
  37. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py +66 -0
  38. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py +704 -0
  39. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_output.py +37 -0
  40. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/musicldm/__init__.py +62 -0
  41. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/musicldm/pipeline_musicldm.py +590 -0
  42. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/pndm/__init__.py +31 -0
  43. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/repaint/__init__.py +32 -0
  44. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py +227 -0
  45. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_3/__init__.py +51 -0
  46. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +899 -0
  47. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/__init__.py +150 -0
  48. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_fastdeploy_stable_diffusion_xl_img2img.py +552 -0
  49. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_output.py +35 -0
  50. VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_paddleinfer_stable_diffusion_xl_img2img.py +548 -0
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/attention.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py
16
+
17
+ from typing import Any, Dict, Optional
18
+
19
+ import paddle
20
+ from einops import rearrange
21
+
22
+ from ppdiffusers.models.attention import AdaLayerNorm, Attention, FeedForward
23
+ from ppdiffusers.models.embeddings import SinusoidalPositionalEmbedding
24
+
25
+
26
class BasicTransformerBlock(paddle.nn.Layer):
    r"""
    A basic Transformer block.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm (`int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"`, `"ada_norm_zero"` or
            `"ada_norm_single"`.
        norm_eps (`float`, *optional*, defaults to 1e-5):
            Epsilon passed to the `paddle.nn.LayerNorm` layers created by this block.
        final_dropout (`bool` *optional*, defaults to False):
            Whether to apply a final dropout after the last feed-forward layer.
        attention_type (`str`, *optional*, defaults to `"default"`):
            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
            NOTE(review): this argument is accepted but not read anywhere in this class.
        positional_embeddings (`str`, *optional*, defaults to `None`):
            The type of positional embeddings to apply to. Only `"sinusoidal"` creates an embedding layer.
        num_positional_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to apply.

    Raises:
        ValueError: If `norm_type` is `"ada_norm"`/`"ada_norm_zero"` without `num_embeds_ada_norm`, or if
            `positional_embeddings` is set without `num_positional_embeddings`.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
        norm_eps: float = 1e-5,
        final_dropout: bool = False,
        attention_type: str = "default",
        positional_embeddings: Optional[str] = None,
        num_positional_embeddings: Optional[int] = None,
    ):
        super().__init__()
        self.only_cross_attention = only_cross_attention

        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
        self.use_layer_norm = norm_type == "layer_norm"

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        if positional_embeddings and (num_positional_embeddings is None):
            # The message names the actual constructor arguments so the caller can find them.
            raise ValueError(
                "If `positional_embeddings` type is defined, `num_positional_embeddings` must also be defined."
            )

        if positional_embeddings == "sinusoidal":
            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
        else:
            self.pos_embed = None

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        if self.use_ada_layer_norm:
            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
        else:
            self.norm1 = paddle.nn.LayerNorm(
                normalized_shape=dim,
                weight_attr=norm_elementwise_affine,
                bias_attr=norm_elementwise_affine,
                epsilon=norm_eps,
            )

        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
            # the second cross attention block.
            self.norm2 = (
                AdaLayerNorm(dim, num_embeds_ada_norm)
                if self.use_ada_layer_norm
                else paddle.nn.LayerNorm(
                    normalized_shape=dim,
                    weight_attr=norm_elementwise_affine,
                    bias_attr=norm_elementwise_affine,
                    epsilon=norm_eps,
                )
            )
            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
            )  # is self-attn if encoder_hidden_states is none
        else:
            self.norm2 = None
            self.attn2 = None

        # 3. Feed-forward
        if not self.use_ada_layer_norm_single:
            self.norm3 = paddle.nn.LayerNorm(
                normalized_shape=dim,
                weight_attr=norm_elementwise_affine,
                bias_attr=norm_elementwise_affine,
                epsilon=norm_eps,
            )

        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
        )

        # 4. Scale-shift for PixArt-Alpha.
        if self.use_ada_layer_norm_single:
            # Draw the random initial value once and reuse it for shape, dtype and the initializer.
            init_value = paddle.randn(shape=[6, dim]) / dim**0.5
            scale_shift_table = paddle.create_parameter(
                shape=init_value.shape,
                dtype=init_value.numpy().dtype,
                default_initializer=paddle.nn.initializer.Assign(init_value),
            )
            scale_shift_table.stop_gradient = False  # keep the table trainable
            self.scale_shift_table = scale_shift_table

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
        """Store the feed-forward chunking settings (`_chunk_size`, `_chunk_dim`) on the block."""
        # NOTE(review): `forward` below does not read these values.
        self._chunk_size = chunk_size
        self._chunk_dim = dim

    def forward(
        self,
        hidden_states: paddle.Tensor,
        attention_mask: Optional[paddle.Tensor] = None,
        encoder_hidden_states: Optional[paddle.Tensor] = None,
        encoder_attention_mask: Optional[paddle.Tensor] = None,
        timestep: Optional[paddle.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        class_labels: Optional[paddle.Tensor] = None,
    ) -> paddle.Tensor:
        """
        Run self-attention, optional cross-attention and the feed-forward layer, each with a residual add.

        Args:
            hidden_states: Input tensor; its first dimension is used as the batch size.
            attention_mask: Mask forwarded to the first attention layer.
            encoder_hidden_states: Conditioning tensor forwarded to the cross-attention layer (and to the
                first attention layer when `only_cross_attention` is set).
            encoder_attention_mask: Mask forwarded to the cross-attention layer.
            timestep: Timestep input consumed by the adaptive normalization variants.
            cross_attention_kwargs: Extra keyword arguments forwarded to both attention layers. The optional
                `"scale"` entry is also passed to the feed-forward layer; `"gligen"` is popped before the
                attention calls.
            class_labels: Forwarded to `norm1` on the `ada_norm_zero` path.

        Returns:
            The transformed hidden states.

        Raises:
            ValueError: If none of the supported normalization modes is active.
        """
        # Notice that normalization is always applied before the real computation in the following blocks.
        # 0. Self-Attention
        batch_size = hidden_states.shape[0]

        if self.use_ada_layer_norm:
            norm_hidden_states = self.norm1(hidden_states, timestep)
        elif self.use_ada_layer_norm_zero:
            # NOTE(review): `__init__` builds `norm1` as a plain LayerNorm for this norm type, which is
            # called here with adaptive-norm arguments — confirm whether an AdaLayerNormZero was intended.
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
            )
        elif self.use_layer_norm:
            norm_hidden_states = self.norm1(hidden_states)
        elif self.use_ada_layer_norm_single:
            # Paddle's `reshape` takes the target shape as one list and `chunk` splits along `axis`.
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
                self.scale_shift_table[None] + timestep.reshape([batch_size, 6, -1])
            ).chunk(6, axis=1)
            norm_hidden_states = self.norm1(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
            norm_hidden_states = norm_hidden_states.squeeze(1)
        else:
            raise ValueError("Incorrect norm used")

        if self.pos_embed is not None:
            norm_hidden_states = self.pos_embed(norm_hidden_states)

        # 1. Retrieve lora scale.
        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0

        # 2. Prepare GLIGEN inputs
        # Work on a copy so the caller's dict is not mutated by the `pop` below.
        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)

        attn_output = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )
        if self.use_ada_layer_norm_zero:
            attn_output = gate_msa.unsqueeze(1) * attn_output
        elif self.use_ada_layer_norm_single:
            attn_output = gate_msa * attn_output

        hidden_states = attn_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        # 2.5 GLIGEN Control
        if gligen_kwargs is not None:
            # NOTE(review): `self.fuser` is never created in `__init__` of this class, so this branch
            # fails with AttributeError when a "gligen" entry is supplied — confirm intended support.
            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])

        # 3. Cross-Attention
        if self.attn2 is not None:
            if self.use_ada_layer_norm:
                norm_hidden_states = self.norm2(hidden_states, timestep)
            elif self.use_ada_layer_norm_zero or self.use_layer_norm:
                norm_hidden_states = self.norm2(hidden_states)
            elif self.use_ada_layer_norm_single:
                # For PixArt norm2 isn't applied here:
                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
                norm_hidden_states = hidden_states
            else:
                raise ValueError("Incorrect norm")

            if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
                norm_hidden_states = self.pos_embed(norm_hidden_states)

            attn_output = self.attn2(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                **cross_attention_kwargs,
            )
            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        if not self.use_ada_layer_norm_single:
            norm_hidden_states = self.norm3(hidden_states)

        if self.use_ada_layer_norm_zero:
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

        if self.use_ada_layer_norm_single:
            # The single-norm variant has no `norm3`; it reuses `norm2` before the feed-forward layer.
            norm_hidden_states = self.norm2(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

        ff_output = self.ff(norm_hidden_states, scale=lora_scale)

        if self.use_ada_layer_norm_zero:
            ff_output = gate_mlp.unsqueeze(1) * ff_output
        elif self.use_ada_layer_norm_single:
            ff_output = gate_mlp * ff_output

        hidden_states = ff_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        return hidden_states
299
+
300
+
301
class TemporalBasicTransformerBlock(paddle.nn.Layer):
    """
    Transformer block with self-attention, optional cross-attention, a feed-forward layer and an
    optional temporal attention stage that attends across frames.

    `unet_use_temporal_attention` must be given explicitly (it is asserted to be non-None).
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        upcast_attention: bool = False,
        unet_use_cross_frame_attention=None,
        unet_use_temporal_attention=None,
    ):
        super().__init__()
        adaptive_norm = num_embeds_ada_norm is not None

        self.only_cross_attention = only_cross_attention
        self.use_ada_layer_norm = adaptive_norm
        self.unet_use_cross_frame_attention = unet_use_cross_frame_attention
        self.unet_use_temporal_attention = unet_use_temporal_attention

        def build_norm():
            # Adaptive norm when a step-embedding count is configured, plain LayerNorm otherwise.
            if adaptive_norm:
                return AdaLayerNorm(dim, num_embeds_ada_norm)
            return paddle.nn.LayerNorm(normalized_shape=dim)

        def build_attention(**extra):
            # Every attention layer in this block shares these settings.
            return Attention(
                query_dim=dim,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
                **extra,
            )

        # SC-Attn
        self.attn1 = build_attention()
        self.norm1 = build_norm()

        # Cross-Attn (only present when a cross-attention width is configured)
        with_cross_attention = cross_attention_dim is not None
        self.attn2 = build_attention(cross_attention_dim=cross_attention_dim) if with_cross_attention else None
        self.norm2 = build_norm() if with_cross_attention else None

        # Feed-forward
        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
        self.norm3 = paddle.nn.LayerNorm(normalized_shape=dim)
        self.use_ada_layer_norm_zero = False

        # Temp-Attn
        assert unet_use_temporal_attention is not None
        if unet_use_temporal_attention:
            self.attn_temp = build_attention()
            self.norm_temp = build_norm()

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        timestep=None,
        attention_mask=None,
        video_length=None,
    ):
        """
        Apply the attention / feed-forward stages, each with a residual connection.

        `video_length` is forwarded to the first attention layer when cross-frame attention is enabled
        and is used as the frame count when regrouping tokens for temporal attention.
        """

        def normalize(layer, tensor):
            # Adaptive norms additionally consume the timestep.
            if self.use_ada_layer_norm:
                return layer(tensor, timestep)
            return layer(tensor)

        # Self-attention
        attn1_kwargs = {"attention_mask": attention_mask}
        if self.unet_use_cross_frame_attention:
            attn1_kwargs["video_length"] = video_length
        normed = normalize(self.norm1, hidden_states)
        hidden_states = self.attn1(normed, **attn1_kwargs) + hidden_states

        # Cross-Attention
        if self.attn2 is not None:
            normed = normalize(self.norm2, hidden_states)
            cross_out = self.attn2(
                normed,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
            )
            hidden_states = cross_out + hidden_states

        # Feed-forward
        hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states

        # Temporal-Attention
        if self.unet_use_temporal_attention:
            tokens_per_frame = hidden_states.shape[1]
            # Regroup so that attention runs over the frame axis for every token position.
            hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
            normed = normalize(self.norm_temp, hidden_states)
            hidden_states = self.attn_temp(normed) + hidden_states
            hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=tokens_per_frame)

        return hidden_states
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/motion_module.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/motion_module.py
16
+ import math
17
+ from dataclasses import dataclass
18
+
19
+ import paddle
20
+ from einops import rearrange, repeat
21
+
22
+ from ppdiffusers.models.attention import FeedForward
23
+ from ppdiffusers.models.attention_processor import Attention
24
+ from ppdiffusers.utils import BaseOutput
25
+
26
+
27
def zero_module(module):
    """Zero every parameter of `module` in place and return the same module object."""
    for param in module.parameters():
        detached = param.detach()
        detached.zero_()
    return module
32
+
33
+
34
@dataclass
class TemporalTransformer3DModelOutput(BaseOutput):
    """Output container (a `BaseOutput` dataclass) with a single `sample` tensor field."""

    sample: paddle.Tensor
37
+
38
+
39
def get_motion_module(in_channels, motion_module_type: str, motion_module_kwargs: dict):
    """
    Build a motion module of the requested type.

    Args:
        in_channels: Channel count forwarded to the motion module constructor.
        motion_module_type: Name of the module variant; only `"Vanilla"` is implemented.
        motion_module_kwargs: Extra keyword arguments forwarded to the constructor.

    Returns:
        A `VanillaTemporalModule` instance.

    Raises:
        ValueError: If `motion_module_type` is not a supported variant.
    """
    if motion_module_type == "Vanilla":
        return VanillaTemporalModule(
            in_channels=in_channels,
            **motion_module_kwargs,
        )
    # Name the offending value so a misconfigured model is easy to diagnose.
    raise ValueError(
        f"Unsupported motion_module_type {motion_module_type!r}; only 'Vanilla' is implemented."
    )
47
+
48
+
49
class VanillaTemporalModule(paddle.nn.Layer):
    """Thin wrapper that owns a `TemporalTransformer3DModel` and forwards inputs to it."""

    def __init__(
        self,
        in_channels,
        num_attention_heads=8,
        num_transformer_block=2,
        attention_block_types=("Temporal_Self", "Temporal_Self"),
        cross_frame_attention_mode=None,
        temporal_position_encoding=False,
        temporal_position_encoding_max_len=24,
        temporal_attention_dim_div=1,
        zero_initialize=True,
    ):
        super().__init__()

        # Per-head width: channels split across heads, then divided by the extra divisor.
        head_dim = in_channels // num_attention_heads // temporal_attention_dim_div

        transformer = TemporalTransformer3DModel(
            in_channels=in_channels,
            num_attention_heads=num_attention_heads,
            attention_head_dim=head_dim,
            num_layers=num_transformer_block,
            attention_block_types=attention_block_types,
            cross_frame_attention_mode=cross_frame_attention_mode,
            temporal_position_encoding=temporal_position_encoding,
            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
        )
        self.temporal_transformer = transformer

        if zero_initialize:
            # Zero the output projection's parameters.
            self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)

    def forward(
        self,
        input_tensor,
        temb,
        encoder_hidden_states,
        attention_mask=None,
        anchor_frame_idx=None,
    ):
        """Run the temporal transformer; `temb` and `anchor_frame_idx` are accepted but not used here."""
        return self.temporal_transformer(input_tensor, encoder_hidden_states, attention_mask)
91
+
92
+
93
class TemporalTransformer3DModel(paddle.nn.Layer):
    """
    GroupNorm -> linear projection -> stack of `TemporalTransformerBlock`s -> linear projection,
    with a residual connection around the whole stack. Expects a 5-D input.
    """

    def __init__(
        self,
        in_channels,
        num_attention_heads,
        attention_head_dim,
        num_layers,
        attention_block_types=(
            "Temporal_Self",
            "Temporal_Self",
        ),
        dropout=0.0,
        norm_num_groups=32,
        cross_attention_dim=768,
        activation_fn="geglu",
        attention_bias=False,
        upcast_attention=False,
        cross_frame_attention_mode=None,
        temporal_position_encoding=False,
        temporal_position_encoding_max_len=24,
    ):
        super().__init__()

        inner_dim = num_attention_heads * attention_head_dim

        self.norm = paddle.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06)
        self.proj_in = paddle.nn.Linear(in_features=in_channels, out_features=inner_dim)

        # All blocks in the stack are configured identically.
        block_config = dict(
            dim=inner_dim,
            num_attention_heads=num_attention_heads,
            attention_head_dim=attention_head_dim,
            attention_block_types=attention_block_types,
            dropout=dropout,
            norm_num_groups=norm_num_groups,
            cross_attention_dim=cross_attention_dim,
            activation_fn=activation_fn,
            attention_bias=attention_bias,
            upcast_attention=upcast_attention,
            cross_frame_attention_mode=cross_frame_attention_mode,
            temporal_position_encoding=temporal_position_encoding,
            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
        )
        self.transformer_blocks = paddle.nn.LayerList(
            sublayers=[TemporalTransformerBlock(**block_config) for _ in range(num_layers)]
        )
        self.proj_out = paddle.nn.Linear(in_features=inner_dim, out_features=in_channels)

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
        """
        Process a `b c f h w` tensor and return a tensor with the same layout.

        `attention_mask` is accepted but not forwarded to the transformer blocks.
        """
        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
        video_length = hidden_states.shape[2]
        # Fold the frame axis into the batch axis.
        hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")

        batch, _, height, width = hidden_states.shape
        residual = hidden_states

        hidden_states = self.norm(hidden_states)

        # Flatten the spatial grid into a token axis with channels last.
        channels = hidden_states.shape[1]
        tokens = hidden_states.transpose([0, 2, 3, 1])
        tokens = tokens.reshape((batch, height * width, channels))
        tokens = self.proj_in(tokens)

        # Transformer Blocks
        for block in self.transformer_blocks:
            tokens = block(
                tokens,
                encoder_hidden_states=encoder_hidden_states,
                video_length=video_length,
            )

        # output
        tokens = self.proj_out(tokens)
        restored = tokens.reshape((batch, height, width, channels)).transpose([0, 3, 1, 2])

        output = restored + residual
        return rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
172
+
173
+
174
class TemporalTransformerBlock(paddle.nn.Layer):
    """
    A sequence of pre-normed `VersatileAttention` layers (one per entry of `attention_block_types`)
    followed by a pre-normed feed-forward layer, each with a residual connection.
    """

    def __init__(
        self,
        dim,
        num_attention_heads,
        attention_head_dim,
        attention_block_types=(
            "Temporal_Self",
            "Temporal_Self",
        ),
        dropout=0.0,
        norm_num_groups=32,
        cross_attention_dim=768,
        activation_fn="geglu",
        attention_bias=False,
        upcast_attention=False,
        cross_frame_attention_mode=None,
        temporal_position_encoding=False,
        temporal_position_encoding_max_len=24,
    ):
        super().__init__()

        self.attention_blocks = paddle.nn.LayerList()
        self.norms = paddle.nn.LayerList()

        for block_name in attention_block_types:
            # Block names look like "<Mode>_<Self|Cross>": the prefix selects the attention mode,
            # and only "_Cross" blocks receive a cross-attention width.
            mode = block_name.split("_")[0]
            cross_dim = cross_attention_dim if block_name.endswith("_Cross") else None
            self.attention_blocks.append(
                VersatileAttention(
                    attention_mode=mode,
                    cross_attention_dim=cross_dim,
                    query_dim=dim,
                    heads=num_attention_heads,
                    dim_head=attention_head_dim,
                    dropout=dropout,
                    bias=attention_bias,
                    upcast_attention=upcast_attention,
                    cross_frame_attention_mode=cross_frame_attention_mode,
                    temporal_position_encoding=temporal_position_encoding,
                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
                )
            )
            self.norms.append(paddle.nn.LayerNorm(normalized_shape=dim))

        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
        self.ff_norm = paddle.nn.LayerNorm(normalized_shape=dim)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        video_length=None,
    ):
        """Apply every attention layer and then the feed-forward layer; `attention_mask` is not used here."""
        for attention_block, norm in zip(self.attention_blocks, self.norms):
            # Self-attention layers must not see the conditioning tensor.
            if attention_block.is_cross_attention:
                context = encoder_hidden_states
            else:
                context = None
            attended = attention_block(
                norm(hidden_states),
                encoder_hidden_states=context,
                video_length=video_length,
            )
            hidden_states = attended + hidden_states

        ff_out = self.ff(self.ff_norm(hidden_states))
        return ff_out + hidden_states
245
+
246
+
247
class PositionalEncoding(paddle.nn.Layer):
    """Adds a fixed sine/cosine table (stored as the `pe` buffer) to the input, then applies dropout."""

    def __init__(self, d_model, dropout=0.0, max_len=24):
        super().__init__()

        self.dropout = paddle.nn.Dropout(p=dropout)

        positions = paddle.arange(end=max_len, dtype="float32").unsqueeze(axis=1)
        even_indices = paddle.arange(start=0, end=d_model, step=2, dtype="float32")
        frequencies = paddle.exp(x=even_indices * (-math.log(10000.0) / d_model))
        angles = positions * frequencies

        # Even feature indices hold the sine terms, odd indices the cosine terms.
        table = paddle.zeros(shape=[1, max_len, d_model])
        table[0, :, 0::2] = paddle.sin(x=angles)
        table[0, :, 1::2] = paddle.cos(x=angles)
        self.register_buffer(name="pe", tensor=table)

    def forward(self, x):
        """Add the first `x.shape[1]` rows of the table to `x` and apply dropout."""
        length = x.shape[1]
        encoded = x + self.pe[:, :length]
        return self.dropout(encoded)
264
+
265
+
266
class VersatileAttention(Attention):
    """
    `Attention` variant that attends along the frame axis.

    Only `attention_mode="Temporal"` is supported. Remaining positional and keyword arguments are
    forwarded to the `Attention` base class; `query_dim` must be passed by keyword when
    `temporal_position_encoding` is enabled because it sizes the positional encoding.
    """

    def __init__(
        self,
        attention_mode=None,
        cross_frame_attention_mode=None,
        temporal_position_encoding=False,
        temporal_position_encoding_max_len=24,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        assert attention_mode == "Temporal"

        self.attention_mode = attention_mode
        # `.get` so that omitting `cross_attention_dim` means plain self-attention, not a KeyError.
        self.is_cross_attention = kwargs.get("cross_attention_dim") is not None

        self.pos_encoder = (
            PositionalEncoding(
                kwargs["query_dim"],
                dropout=0.0,
                max_len=temporal_position_encoding_max_len,
            )
            if (temporal_position_encoding and attention_mode == "Temporal")
            else None
        )

    def extra_repr(self):
        """Return a short description of the attention mode for the layer's printed representation."""
        return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}"

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        video_length=None,
        **cross_attention_kwargs,
    ):
        """
        Regroup tokens so attention runs over frames, call the attention processor, and restore the layout.

        Args:
            hidden_states: Tensor laid out as `(b f) d c`.
            encoder_hidden_states: Optional `b n c` conditioning tensor, repeated once per token position.
            attention_mask: Forwarded unchanged to the processor.
            video_length: Number of frames `f` used to split the leading axis.
            **cross_attention_kwargs: Forwarded unchanged to the processor.

        Raises:
            NotImplementedError: If the attention mode is not `"Temporal"`.
        """
        if self.attention_mode == "Temporal":
            d = hidden_states.shape[1]  # d means HxW
            hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)

            if self.pos_encoder is not None:
                hidden_states = self.pos_encoder(hidden_states)

            # Match the regrouped batch axis: one copy of the conditioning per token position.
            encoder_hidden_states = (
                repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d)
                if encoder_hidden_states is not None
                else encoder_hidden_states
            )

        else:
            raise NotImplementedError

        hidden_states = self.processor(
            self,
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )

        if self.attention_mode == "Temporal":
            hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)

        return hidden_states
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/mutual_self_attention.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/magic-research/magic-animate/blob/main/magicanimate/models/mutual_self_attention.py
16
+ from typing import Any, Dict, Optional
17
+
18
+ import paddle
19
+ from einops import rearrange
20
+
21
+ from ppdiffusers.models.animate_anyone.attention import (
22
+ BasicTransformerBlock,
23
+ TemporalBasicTransformerBlock,
24
+ )
25
+
26
+
27
def paddle_dfs(model: paddle.nn.Layer):
    """Return ``model`` and all of its descendants in depth-first pre-order.

    Each node is followed by the full subtree of its first child, then its
    second child, and so on, using ``children()`` to discover sub-layers.
    """
    ordered = []
    pending = [model]
    while pending:
        layer = pending.pop()
        ordered.append(layer)
        # Push children reversed so the first child is popped (visited) first.
        pending.extend(reversed(list(layer.children())))
    return ordered
32
+
33
+
34
class ReferenceAttentionControl:
    """Patch a UNet's transformer blocks so self-attention features can be shared.

    In ``"write"`` mode every patched block appends its normalized hidden states
    to a per-block ``bank`` list. In ``"read"`` mode every patched block
    concatenates the banked features to its own normalized hidden states and
    attends over the combined sequence. ``update`` copies banks from a writer
    controller into this (reader) controller; ``clear`` empties them.
    """

    def __init__(
        self,
        unet,
        mode="write",
        do_classifier_free_guidance=False,
        attention_auto_machine_weight=float("inf"),
        gn_auto_machine_weight=1.0,
        style_fidelity=1.0,
        reference_attn=True,
        reference_adain=False,
        fusion_blocks="midup",
        batch_size=1,
    ) -> None:
        """Store the configuration and patch the blocks of ``unet``.

        Args:
            unet: Model whose ``mid_block`` / ``up_blocks`` (or whole tree) are
                searched for transformer blocks.
            mode: ``"read"`` or ``"write"``.
            do_classifier_free_guidance: Enables the masked second attention
                pass in read mode.
            attention_auto_machine_weight, gn_auto_machine_weight,
            style_fidelity: Forwarded to ``register_reference_hooks``; not read
                by the hooked forward in this file.
            reference_attn: When falsy, no block is patched.
            reference_adain: Stored on the instance only.
            fusion_blocks: ``"midup"`` or ``"full"``.
            batch_size: Used to size the initial guidance mask.
        """
        # 10. Modify self attention and group norm
        self.unet = unet
        assert mode in ["read", "write"]
        assert fusion_blocks in ["midup", "full"]
        self.reference_attn = reference_attn
        self.reference_adain = reference_adain
        self.fusion_blocks = fusion_blocks
        self.register_reference_hooks(
            mode,
            do_classifier_free_guidance,
            attention_auto_machine_weight,
            gn_auto_machine_weight,
            style_fidelity,
            reference_attn,
            reference_adain,
            # Passed by keyword: the 8th positional slot of
            # register_reference_hooks is `dtype`, not `fusion_blocks`.
            fusion_blocks=fusion_blocks,
            batch_size=batch_size,
        )

    def _collect_attention_blocks(self, unet, block_types):
        """Return the blocks of ``unet`` matching ``block_types``, widest ``norm1`` first.

        The search covers ``mid_block`` and ``up_blocks`` when
        ``self.fusion_blocks`` is ``"midup"`` and the whole model when it is
        ``"full"``. The sort is stable, so blocks of equal width keep their
        depth-first discovery order.
        """
        if self.fusion_blocks == "midup":
            candidates = paddle_dfs(unet.mid_block) + paddle_dfs(unet.up_blocks)
        elif self.fusion_blocks == "full":
            candidates = paddle_dfs(unet)
        else:
            raise ValueError(f"Unsupported fusion_blocks: {self.fusion_blocks!r}")
        matching = [module for module in candidates if isinstance(module, block_types)]
        return sorted(matching, key=lambda x: -x.norm1._normalized_shape[0])

    def register_reference_hooks(
        self,
        mode,
        do_classifier_free_guidance,
        attention_auto_machine_weight,
        gn_auto_machine_weight,
        style_fidelity,
        reference_attn,
        reference_adain,
        dtype="float16",
        batch_size=1,
        num_images_per_prompt=1,
        fusion_blocks="midup",
    ):
        """Replace ``forward`` on the selected transformer blocks with the hooked version.

        Block selection is driven by ``self.reference_attn`` and
        ``self.fusion_blocks`` (the instance attributes), not by the
        same-named arguments. Each patched block keeps its previous forward in
        ``_original_inner_forward`` and receives an empty ``bank`` list and an
        ``attn_weight`` equal to its index divided by the number of patched
        blocks.
        """
        MODE = mode
        if do_classifier_free_guidance:
            # NOTE(review): the factor 16 is hard-coded; the hooked forward
            # rebuilds the mask whenever its length does not match the batch.
            uc_mask = paddle.to_tensor(
                data=[1] * batch_size * num_images_per_prompt * 16 + [0] * batch_size * num_images_per_prompt * 16,
                dtype="float32",
            ).astype(dtype="bool")
        else:
            uc_mask = paddle.to_tensor(data=[0] * batch_size * num_images_per_prompt * 2, dtype="float32").astype(
                dtype="bool"
            )

        def hacked_basic_transformer_inner_forward(
            self,
            hidden_states: paddle.Tensor,
            attention_mask: Optional[paddle.Tensor] = None,
            encoder_hidden_states: Optional[paddle.Tensor] = None,
            encoder_attention_mask: Optional[paddle.Tensor] = None,
            timestep: Optional[paddle.Tensor] = None,
            cross_attention_kwargs: Dict[str, Any] = None,
            class_labels: Optional[paddle.Tensor] = None,
            video_length=None,
        ):
            # `self` here is the patched transformer block, not the controller.
            if self.use_ada_layer_norm:
                norm_hidden_states = self.norm1(hidden_states, timestep)
            elif self.use_ada_layer_norm_zero:
                (norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp,) = self.norm1(
                    hidden_states,
                    timestep,
                    class_labels,
                    hidden_dtype=hidden_states.dtype,
                )
            else:
                norm_hidden_states = self.norm1(hidden_states)

            # 1. Self-Attention
            cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
            if self.only_cross_attention:
                attn_output = self.attn1(
                    norm_hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    **cross_attention_kwargs,
                )
            else:
                if MODE == "write":
                    # Record the features a reader controller will attend to.
                    self.bank.append(norm_hidden_states.clone())

                    attn_output = self.attn1(
                        norm_hidden_states,
                        encoder_hidden_states=None,
                        attention_mask=attention_mask,
                        **cross_attention_kwargs,
                    )
                if MODE == "read":
                    # Repeat each banked feature once per frame and fold the
                    # frame axis into the batch axis.
                    bank_fea = [
                        rearrange(
                            d.unsqueeze(1).tile((1, video_length, 1, 1)).astype(norm_hidden_states.dtype),
                            "b t l c -> (b t) l c",
                        )
                        for d in self.bank
                    ]

                    modify_norm_hidden_states = paddle.concat(x=[norm_hidden_states] + bank_fea, axis=1)

                    hidden_states_uc = (
                        self.attn1(
                            norm_hidden_states,
                            encoder_hidden_states=modify_norm_hidden_states,
                            attention_mask=attention_mask,
                        )
                        + hidden_states
                    )
                    if do_classifier_free_guidance:
                        hidden_states_c = hidden_states_uc.clone()
                        _uc_mask = uc_mask.clone()
                        if hidden_states.shape[0] != _uc_mask.shape[0]:
                            # Rebuild the mask for the actual batch: first half
                            # selected, second half not.
                            _uc_mask = paddle.to_tensor(
                                data=[1] * (hidden_states.shape[0] // 2) + [0] * (hidden_states.shape[0] // 2),
                                dtype="float32",
                            ).astype(dtype="bool")

                        # Masked rows attend only to themselves (no bank).
                        hidden_states_c[_uc_mask] = (
                            self.attn1(
                                norm_hidden_states[_uc_mask],
                                encoder_hidden_states=norm_hidden_states[_uc_mask],
                                attention_mask=attention_mask,
                            )
                            + hidden_states[_uc_mask]
                        )
                        hidden_states = hidden_states_c.clone()
                    else:
                        hidden_states = hidden_states_uc

                    if self.attn2 is not None:
                        # Cross-Attention
                        norm_hidden_states = (
                            self.norm2(hidden_states, timestep)
                            if self.use_ada_layer_norm
                            else self.norm2(hidden_states)
                        )
                        hidden_states = (
                            self.attn2(
                                norm_hidden_states,
                                encoder_hidden_states=encoder_hidden_states,
                                attention_mask=attention_mask,
                            )
                            + hidden_states
                        )

                    # Feed-forward
                    hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states

                    # Temporal-Attention
                    if self.unet_use_temporal_attention:
                        d = hidden_states.shape[1]
                        hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
                        norm_hidden_states = (
                            self.norm_temp(hidden_states, timestep)
                            if self.use_ada_layer_norm
                            else self.norm_temp(hidden_states)
                        )
                        hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
                        hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)

                    # Read mode finishes here; the generic path below is for
                    # write mode and only-cross-attention blocks.
                    return hidden_states

            if self.use_ada_layer_norm_zero:
                attn_output = gate_msa.unsqueeze(1) * attn_output
            hidden_states = attn_output + hidden_states

            if self.attn2 is not None:
                norm_hidden_states = (
                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
                )

                # 2. Cross-Attention
                attn_output = self.attn2(
                    norm_hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=encoder_attention_mask,
                    **cross_attention_kwargs,
                )
                hidden_states = attn_output + hidden_states

            # 3. Feed-forward
            norm_hidden_states = self.norm3(hidden_states)

            if self.use_ada_layer_norm_zero:
                norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

            ff_output = self.ff(norm_hidden_states)

            if self.use_ada_layer_norm_zero:
                ff_output = gate_mlp.unsqueeze(1) * ff_output

            hidden_states = ff_output + hidden_states

            return hidden_states

        if self.reference_attn:
            attn_modules = self._collect_attention_blocks(
                self.unet, (BasicTransformerBlock, TemporalBasicTransformerBlock)
            )

            for i, module in enumerate(attn_modules):
                module._original_inner_forward = module.forward
                if isinstance(module, BasicTransformerBlock):
                    module.forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock)
                if isinstance(module, TemporalBasicTransformerBlock):
                    module.forward = hacked_basic_transformer_inner_forward.__get__(
                        module, TemporalBasicTransformerBlock
                    )

                module.bank = []
                module.attn_weight = float(i) / float(len(attn_modules))

    def update(self, writer, dtype="float16"):
        """Copy every bank from ``writer``'s blocks into this controller's blocks.

        Reader blocks (``TemporalBasicTransformerBlock`` in ``self.unet``) and
        writer blocks (``BasicTransformerBlock`` in ``writer.unet``) are paired
        in sorted order; each banked tensor is cloned. ``dtype`` is not read.
        """
        if self.reference_attn:
            reader_attn_modules = self._collect_attention_blocks(self.unet, TemporalBasicTransformerBlock)
            writer_attn_modules = self._collect_attention_blocks(writer.unet, BasicTransformerBlock)
            for r, w in zip(reader_attn_modules, writer_attn_modules):
                r.bank = [v.clone() for v in w.bank]

    def clear(self):
        """Empty the bank of every patched block in ``self.unet``."""
        if self.reference_attn:
            reader_attn_modules = self._collect_attention_blocks(
                self.unet, (BasicTransformerBlock, TemporalBasicTransformerBlock)
            )
            for r in reader_attn_modules:
                r.bank.clear()
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/transformer_2d.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/huggingface/ppdiffusers/blob/main/src/ppdiffusers/models/transformer_2d.py
16
+ from dataclasses import dataclass
17
+ from typing import Any, Dict, Optional
18
+
19
+ import paddle
20
+
21
+ from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from ppdiffusers.models.embeddings import CaptionProjection
23
+ from ppdiffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
24
+ from ppdiffusers.models.modeling_utils import ModelMixin
25
+ from ppdiffusers.models.normalization import AdaLayerNormSingle
26
+ from ppdiffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate
27
+
28
+ from .attention import BasicTransformerBlock
29
+
30
+
31
@dataclass
class Transformer2DModelOutput(BaseOutput):
    """
    The output of [`Transformer2DModel`].

    Args:
        sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
            The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
            distributions for the unnoised latent pixels.
        ref_feature (`paddle.Tensor`):
            The second tensor returned by [`Transformer2DModel.forward`]. In that method it is the input-projected
            hidden states reshaped to `(batch, height, width, inner_dim)`, captured before the transformer blocks
            run.
    """

    sample: paddle.Tensor
    ref_feature: paddle.Tensor
44
+
45
+
46
class Transformer2DModel(ModelMixin, ConfigMixin):
    """
    A 2D Transformer model for image-like data.

    Besides the usual residual output, `forward` also returns the input-projected features (`ref_feature`) captured
    before the transformer blocks run.

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            The number of channels in the input and output (specify if the input is **continuous**).
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
            This is fixed during training since it is used to learn a number of position embeddings.
        num_vector_embeds (`int`, *optional*):
            The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
            Includes the class for the masked latent pixel.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
        num_embeds_ada_norm ( `int`, *optional*):
            The number of diffusion steps used during training. Pass if at least one of the norm_layers is
            `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
            added to the hidden states.

            During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
        attention_bias (`bool`, *optional*):
            Configure if the `TransformerBlocks` attention should contain a bias parameter.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        num_attention_heads: int = 16,
        attention_head_dim: int = 88,
        in_channels: Optional[int] = None,
        out_channels: Optional[int] = None,
        num_layers: int = 1,
        dropout: float = 0.0,
        norm_num_groups: int = 32,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        sample_size: Optional[int] = None,
        num_vector_embeds: Optional[int] = None,
        patch_size: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        use_linear_projection: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_type: str = "layer_norm",
        norm_elementwise_affine: bool = True,
        norm_eps: float = 1e-5,
        attention_type: str = "default",
        caption_channels: int = None,
    ):
        super().__init__()
        self.use_linear_projection = use_linear_projection
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        inner_dim = num_attention_heads * attention_head_dim

        # The LoRA-compatible layers are used when the PEFT backend is off;
        # forward() passes `scale=` to the projections only in that case.
        conv_cls = paddle.nn.Conv2D if USE_PEFT_BACKEND else LoRACompatibleConv
        linear_cls = paddle.nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear

        # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
        # Define whether input is continuous or discrete depending on configuration
        self.is_input_continuous = (in_channels is not None) and (patch_size is None)
        self.is_input_vectorized = num_vector_embeds is not None
        self.is_input_patches = in_channels is not None and patch_size is not None

        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
            deprecation_message = (
                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
                " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
            )
            deprecate(
                "norm_type!=num_embeds_ada_norm",
                "1.0.0",
                deprecation_message,
                standard_warn=False,
            )
            norm_type = "ada_norm"

        if self.is_input_continuous and self.is_input_vectorized:
            raise ValueError(
                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
                " sure that either `in_channels` or `num_vector_embeds` is None."
            )
        elif self.is_input_vectorized and self.is_input_patches:
            raise ValueError(
                f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
                " sure that either `num_vector_embeds` or `num_patches` is None."
            )
        elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
            raise ValueError(
                f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
                f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
            )

        # 2. Define input layers
        self.in_channels = in_channels

        self.norm = paddle.nn.GroupNorm(
            num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06, weight_attr=True, bias_attr=True
        )

        if use_linear_projection:
            self.proj_in = linear_cls(in_channels, inner_dim)
        else:
            self.proj_in = conv_cls(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)

        # 3. Define transformer blocks
        self.transformer_blocks = paddle.nn.LayerList(
            sublayers=[
                BasicTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
                    activation_fn=activation_fn,
                    num_embeds_ada_norm=num_embeds_ada_norm,
                    attention_bias=attention_bias,
                    only_cross_attention=only_cross_attention,
                    double_self_attention=double_self_attention,
                    upcast_attention=upcast_attention,
                    norm_type=norm_type,
                    norm_elementwise_affine=norm_elementwise_affine,
                    norm_eps=norm_eps,
                    attention_type=attention_type,
                )
                for _ in range(num_layers)
            ]
        )

        # 4. Define output layers
        self.out_channels = in_channels if out_channels is None else out_channels
        # TODO: should use out_channels for continuous projections
        if use_linear_projection:
            self.proj_out = linear_cls(inner_dim, in_channels)
        else:
            self.proj_out = conv_cls(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)

        # 5. PixArt-Alpha blocks.
        self.adaln_single = None
        self.use_additional_conditions = False
        if norm_type == "ada_norm_single":
            self.use_additional_conditions = self.config.sample_size == 128
            # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
            # additional conditions until we find better name
            self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)

        self.caption_projection = None
        if caption_channels is not None:
            self.caption_projection = CaptionProjection(in_features=caption_channels, hidden_size=inner_dim)

        self.gradient_checkpointing = False

    def _set_gradient_checkpointing(self, module, value=False):
        """Set `gradient_checkpointing` on `module` if it exposes that attribute."""
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

    def forward(
        self,
        hidden_states: paddle.Tensor,
        encoder_hidden_states: Optional[paddle.Tensor] = None,
        timestep: Optional[paddle.Tensor] = None,
        added_cond_kwargs: Dict[str, paddle.Tensor] = None,
        class_labels: Optional[paddle.Tensor] = None,
        cross_attention_kwargs: Dict[str, Any] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        encoder_attention_mask: Optional[paddle.Tensor] = None,
        return_dict: bool = True,
    ):
        """
        The [`Transformer2DModel`] forward method.

        Args:
            hidden_states (`paddle.Tensor` of shape `(batch size, num latent pixels)` if discrete, `paddle.Tensor` of shape `(batch size, channel, height, width)` if continuous):
                Input `hidden_states`.
            encoder_hidden_states ( `paddle.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
            timestep ( `paddle.Tensor`, *optional*):
                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
            added_cond_kwargs (`Dict[str, paddle.Tensor]`, *optional*):
                Accepted for interface compatibility; not read by this method.
            class_labels ( `paddle.Tensor` of shape `(batch size, num classes)`, *optional*):
                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
                `AdaLayerZeroNorm`.
            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [ppdiffusers.models.attention_processor](https://github.com/huggingface/ppdiffusers/blob/main/src/ppdiffusers/models/attention_processor.py).
                Its `"scale"` entry (default `1.0`) is used as the LoRA scale of the projections.
            attention_mask ( `paddle.Tensor`, *optional*):
                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
                negative values to the attention scores corresponding to "discard" tokens.
            encoder_attention_mask ( `paddle.Tensor`, *optional*):
                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:

                    * Mask `(batch, sequence_length)` True = keep, False = discard.
                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.

                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
                above. This bias will be added to the cross-attention scores.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`Transformer2DModelOutput`] instead of a plain tuple.

        Returns:
            If `return_dict` is True, a [`Transformer2DModelOutput`] with `sample` and `ref_feature`; otherwise the
            tuple `(sample, ref_feature)`.
        """
        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
        # expects mask of shape:
        #   [batch, key_tokens]
        # adds singleton query_tokens dimension:
        #   [batch,                    1, key_tokens]
        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
        if attention_mask is not None and attention_mask.ndim == 2:
            # assume that mask is expressed as:
            #   (1 = keep,      0 = discard)
            # convert mask into a bias that can be added to attention scores:
            #       (keep = +0,     discard = -10000.0)
            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # convert encoder_attention_mask to a bias the same way we do for attention_mask
        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

        # Retrieve lora scale.
        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0

        # 1. Input
        batch, _, height, width = hidden_states.shape
        residual = hidden_states

        hidden_states = self.norm(hidden_states)
        if not self.use_linear_projection:
            hidden_states = (
                self.proj_in(hidden_states, scale=lora_scale) if not USE_PEFT_BACKEND else self.proj_in(hidden_states)
            )
            inner_dim = hidden_states.shape[1]
            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape((batch, height * width, inner_dim))
        else:
            inner_dim = hidden_states.shape[1]
            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape((batch, height * width, inner_dim))
            hidden_states = (
                self.proj_in(hidden_states, scale=lora_scale) if not USE_PEFT_BACKEND else self.proj_in(hidden_states)
            )

        # 2. Blocks
        if self.caption_projection is not None:
            batch_size = hidden_states.shape[0]
            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])

        # Snapshot of the projected features before any transformer block runs.
        ref_feature = hidden_states.reshape(shape=(batch, height, width, inner_dim))
        for block in self.transformer_blocks:
            # Always run the block. The former `training and gradient_checkpointing`
            # branch only defined a wrapper and never called it, so every block
            # was skipped in that configuration.
            # NOTE(review): activation recomputation is not wired in here, so
            # `gradient_checkpointing` currently has no memory-saving effect.
            hidden_states = block(
                hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                timestep=timestep,
                cross_attention_kwargs=cross_attention_kwargs,
                class_labels=class_labels,
            )

        # 3. Output
        if self.is_input_continuous:
            if not self.use_linear_projection:
                hidden_states = (
                    hidden_states.reshape((batch, height, width, inner_dim)).permute(0, 3, 1, 2).contiguous()
                )
                hidden_states = (
                    self.proj_out(hidden_states, scale=lora_scale)
                    if not USE_PEFT_BACKEND
                    else self.proj_out(hidden_states)
                )
            else:
                hidden_states = (
                    self.proj_out(hidden_states, scale=lora_scale)
                    if not USE_PEFT_BACKEND
                    else self.proj_out(hidden_states)
                )
                hidden_states = (
                    hidden_states.reshape((batch, height, width, inner_dim)).permute(0, 3, 1, 2).contiguous()
                )

            output = hidden_states + residual
        if not return_dict:
            return (output, ref_feature)

        return Transformer2DModelOutput(sample=output, ref_feature=ref_feature)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_2d_blocks.py ADDED
@@ -0,0 +1,1031 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/huggingface/ppdiffusers/blob/main/src/ppdiffusers/models/unet_2d_blocks.py
16
+ from typing import Any, Dict, Optional, Tuple, Union
17
+
18
+ import paddle
19
+ from paddle.distributed.fleet.utils import recompute
20
+
21
+ from ppdiffusers.models.activations import get_activation
22
+ from ppdiffusers.models.attention_processor import Attention
23
+ from ppdiffusers.models.dual_transformer_2d import DualTransformer2DModel
24
+ from ppdiffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
25
+ from ppdiffusers.utils import logging, recompute_use_reentrant
26
+ from ppdiffusers.utils.paddle_utils import apply_freeu
27
+
28
+ from .transformer_2d import Transformer2DModel
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
def get_down_block(
    down_block_type: str,
    num_layers: int,
    in_channels: int,
    out_channels: int,
    temb_channels: int,
    add_downsample: bool,
    resnet_eps: float,
    resnet_act_fn: str,
    transformer_layers_per_block: int = 1,
    num_attention_heads: Optional[int] = None,
    resnet_groups: Optional[int] = None,
    cross_attention_dim: Optional[int] = None,
    downsample_padding: Optional[int] = None,
    dual_cross_attention: bool = False,
    use_linear_projection: bool = False,
    only_cross_attention: bool = False,
    upcast_attention: bool = False,
    resnet_time_scale_shift: str = "default",
    attention_type: str = "default",
    resnet_skip_time_act: bool = False,
    resnet_out_scale_factor: float = 1.0,
    cross_attention_norm: Optional[str] = None,
    attention_head_dim: Optional[int] = None,
    downsample_type: Optional[str] = None,
    dropout: float = 0.0,
):
    """
    Build the down-sampling block named by `down_block_type`.

    Supported names are `"DownBlock2D"` and `"CrossAttnDownBlock2D"`; a leading `"UNetRes"` prefix is stripped
    before the lookup. The remaining arguments are forwarded to the selected block's constructor. Several
    parameters (`resnet_skip_time_act`, `resnet_out_scale_factor`, `cross_attention_norm`, `downsample_type`) are
    accepted but not forwarded to either supported block.

    Returns:
        The constructed `DownBlock2D` or `CrossAttnDownBlock2D` instance.

    Raises:
        ValueError: If `down_block_type` is not supported, or if `"CrossAttnDownBlock2D"` is requested without
            `cross_attention_dim`.
    """
    # If attn head dim is not defined, we default it to the number of heads
    if attention_head_dim is None:
        # `Logger.warn` is a deprecated alias (removed in Python 3.13); use `warning`.
        logger.warning(
            f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
        )
        # NOTE: neither block built below receives `attention_head_dim`; the default is kept only so the
        # warning and local state match the previous behaviour.
        attention_head_dim = num_attention_heads

    # "UNetRes" is 7 characters long; drop it when present.
    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
    if down_block_type == "DownBlock2D":
        return DownBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            dropout=dropout,
            add_downsample=add_downsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            resnet_groups=resnet_groups,
            downsample_padding=downsample_padding,
            resnet_time_scale_shift=resnet_time_scale_shift,
        )
    elif down_block_type == "CrossAttnDownBlock2D":
        if cross_attention_dim is None:
            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
        return CrossAttnDownBlock2D(
            num_layers=num_layers,
            transformer_layers_per_block=transformer_layers_per_block,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            dropout=dropout,
            add_downsample=add_downsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            resnet_groups=resnet_groups,
            downsample_padding=downsample_padding,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads,
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention,
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
            attention_type=attention_type,
        )
    raise ValueError(f"{down_block_type} does not exist.")
107
+
108
+
109
def get_up_block(
    up_block_type: str,
    num_layers: int,
    in_channels: int,
    out_channels: int,
    prev_output_channel: int,
    temb_channels: int,
    add_upsample: bool,
    resnet_eps: float,
    resnet_act_fn: str,
    resolution_idx: Optional[int] = None,
    transformer_layers_per_block: int = 1,
    num_attention_heads: Optional[int] = None,
    resnet_groups: Optional[int] = None,
    cross_attention_dim: Optional[int] = None,
    dual_cross_attention: bool = False,
    use_linear_projection: bool = False,
    only_cross_attention: bool = False,
    upcast_attention: bool = False,
    resnet_time_scale_shift: str = "default",
    attention_type: str = "default",
    resnet_skip_time_act: bool = False,
    resnet_out_scale_factor: float = 1.0,
    cross_attention_norm: Optional[str] = None,
    attention_head_dim: Optional[int] = None,
    upsample_type: Optional[str] = None,
    dropout: float = 0.0,
) -> paddle.nn.Layer:
    """
    Build the up-sampling block named by `up_block_type`.

    Supported names are `"UpBlock2D"` and `"CrossAttnUpBlock2D"`; a leading `"UNetRes"` prefix is stripped before
    the lookup. The remaining arguments are forwarded to the selected block's constructor. Several parameters
    (`resnet_skip_time_act`, `resnet_out_scale_factor`, `cross_attention_norm`, `upsample_type`) are accepted but
    not forwarded to either supported block.

    Returns:
        The constructed `UpBlock2D` or `CrossAttnUpBlock2D` instance.

    Raises:
        ValueError: If `up_block_type` is not supported, or if `"CrossAttnUpBlock2D"` is requested without
            `cross_attention_dim`.
    """
    # If attn head dim is not defined, we default it to the number of heads
    if attention_head_dim is None:
        # `Logger.warn` is a deprecated alias (removed in Python 3.13); use `warning`.
        logger.warning(
            f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
        )
        # NOTE: neither block built below receives `attention_head_dim`; the default is kept only so the
        # warning and local state match the previous behaviour.
        attention_head_dim = num_attention_heads

    # "UNetRes" is 7 characters long; drop it when present.
    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
    if up_block_type == "UpBlock2D":
        return UpBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            prev_output_channel=prev_output_channel,
            temb_channels=temb_channels,
            resolution_idx=resolution_idx,
            dropout=dropout,
            add_upsample=add_upsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            resnet_groups=resnet_groups,
            resnet_time_scale_shift=resnet_time_scale_shift,
        )
    elif up_block_type == "CrossAttnUpBlock2D":
        if cross_attention_dim is None:
            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
        return CrossAttnUpBlock2D(
            num_layers=num_layers,
            transformer_layers_per_block=transformer_layers_per_block,
            in_channels=in_channels,
            out_channels=out_channels,
            prev_output_channel=prev_output_channel,
            temb_channels=temb_channels,
            resolution_idx=resolution_idx,
            dropout=dropout,
            add_upsample=add_upsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            resnet_groups=resnet_groups,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads,
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention,
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
            attention_type=attention_type,
        )

    raise ValueError(f"{up_block_type} does not exist.")
187
+
188
+
189
class AutoencoderTinyBlock(paddle.nn.Layer):
    """
    Small residual block used by [`AutoencoderTiny`]: three 3x3 convolutions with an activation between them, added
    to a skip path and passed through a ReLU.

    Args:
        in_channels (`int`): The number of input channels.
        out_channels (`int`): The number of output channels.
        act_fn (`str`):
            Name of the activation function, resolved through `get_activation` (e.g. `"swish"`, `"mish"`,
            `"gelu"`, `"relu"`).

    Returns:
        `paddle.Tensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to
        `out_channels`.
    """

    def __init__(self, in_channels: int, out_channels: int, act_fn: str):
        super().__init__()
        activation = get_activation(act_fn)

        def conv3x3(channels_in: int) -> paddle.nn.Conv2D:
            # 3x3 convolution with padding 1, always producing `out_channels` channels.
            return paddle.nn.Conv2D(
                in_channels=channels_in, out_channels=out_channels, kernel_size=3, padding=1
            )

        # The same activation instance is placed twice in the sequence, as in the reference layout.
        self.conv = paddle.nn.Sequential(
            conv3x3(in_channels),
            activation,
            conv3x3(out_channels),
            activation,
            conv3x3(out_channels),
        )

        # A bias-free 1x1 projection is only needed when the channel count changes.
        if in_channels != out_channels:
            self.skip = paddle.nn.Conv2D(
                in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias_attr=False
            )
        else:
            self.skip = paddle.nn.Identity()

        self.fuse = paddle.nn.ReLU()

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        main_path = self.conv(x)
        shortcut = self.skip(x)
        return self.fuse(main_path + shortcut)
225
+
226
+
227
class UNetMidBlock2D(paddle.nn.Layer):
    """
    A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks.

    The block always holds `num_layers + 1` resnets; between consecutive resnets sits either an `Attention` layer
    (when `add_attention` is true) or a `None` placeholder.

    Args:
        in_channels (`int`): The number of input channels.
        temb_channels (`int`): The number of temporal embedding channels.
        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
        resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
            The type of normalization to apply to the time embeddings. This can help to improve the performance of the
            model on tasks with long-range temporal dependencies.
        resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
        resnet_groups (`int`, *optional*, defaults to 32):
            The number of groups to use in the group normalization layers of the resnet blocks. If `None`,
            `min(in_channels // 4, 32)` is used.
        attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
            When `None`, it falls back to `resnet_groups` if `resnet_time_scale_shift == "default"`.
        resnet_pre_norm (`bool`, *optional*, defaults to `True`):
            Whether to use pre-normalization for the resnet blocks.
        add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
        attention_head_dim (`int`, *optional*, defaults to 1):
            Dimension of a single attention head. The number of attention heads is determined based on this value and
            the number of input channels. If `None`, `in_channels` is used and a warning is logged.
        output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.

    Returns:
        `paddle.Tensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
        in_channels, height, width)`.

    """

    def __init__(
        self,
        in_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",  # default, spatial
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        attn_groups: Optional[int] = None,
        resnet_pre_norm: bool = True,
        add_attention: bool = True,
        attention_head_dim: int = 1,
        output_scale_factor: float = 1.0,
    ):
        super().__init__()
        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
        self.add_attention = add_attention

        if attn_groups is None:
            attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None

        # there is always at least one resnet
        resnets = [
            ResnetBlock2D(
                in_channels=in_channels,
                out_channels=in_channels,
                temb_channels=temb_channels,
                eps=resnet_eps,
                groups=resnet_groups,
                dropout=dropout,
                time_embedding_norm=resnet_time_scale_shift,
                non_linearity=resnet_act_fn,
                output_scale_factor=output_scale_factor,
                pre_norm=resnet_pre_norm,
            )
        ]
        attentions = []

        if attention_head_dim is None:
            # `Logger.warn` is a deprecated alias (removed in Python 3.13); use `warning`.
            logger.warning(
                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
            )
            attention_head_dim = in_channels

        for _ in range(num_layers):
            if self.add_attention:
                attentions.append(
                    Attention(
                        in_channels,
                        heads=in_channels // attention_head_dim,
                        dim_head=attention_head_dim,
                        rescale_output_factor=output_scale_factor,
                        eps=resnet_eps,
                        norm_num_groups=attn_groups,
                        spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
                        residual_connection=True,
                        bias=True,
                        upcast_softmax=True,
                        _from_deprecated_attn_block=True,
                    )
                )
            else:
                # Placeholder keeps `attentions` aligned with `resnets[1:]` in `forward`.
                attentions.append(None)

            resnets.append(
                ResnetBlock2D(
                    in_channels=in_channels,
                    out_channels=in_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )

        self.attentions = paddle.nn.LayerList(sublayers=attentions)
        self.resnets = paddle.nn.LayerList(sublayers=resnets)

    def forward(self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None) -> paddle.Tensor:
        """Run the first resnet, then each (optional attention, resnet) pair in order."""
        hidden_states = self.resnets[0](hidden_states, temb)
        for attn, resnet in zip(self.attentions, self.resnets[1:]):
            if attn is not None:
                hidden_states = attn(hidden_states, temb=temb)
            hidden_states = resnet(hidden_states, temb)

        return hidden_states
349
+
350
+
351
class UNetMidBlock2DCrossAttn(paddle.nn.Layer):
    """
    UNet mid-block made of `num_layers + 1` resnets with a transformer (cross-attention) block between each
    consecutive pair.

    The attention blocks are `Transformer2DModel` instances, or `DualTransformer2DModel` when
    `dual_cross_attention` is true. Each attention call is expected to return a 2-tuple when invoked with
    `return_dict=False`; only the first element is used here.
    """

    def __init__(
        self,
        in_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        num_attention_heads: int = 1,
        output_scale_factor: float = 1.0,
        cross_attention_dim: int = 1280,
        dual_cross_attention: bool = False,
        use_linear_projection: bool = False,
        upcast_attention: bool = False,
        attention_type: str = "default",
    ):
        super().__init__()

        self.has_cross_attention = True
        self.num_attention_heads = num_attention_heads
        if resnet_groups is None:
            resnet_groups = min(in_channels // 4, 32)

        # A single int means "the same transformer depth for every layer".
        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * num_layers

        # Every resnet in this block shares the same configuration.
        resnet_config = dict(
            in_channels=in_channels,
            out_channels=in_channels,
            temb_channels=temb_channels,
            eps=resnet_eps,
            groups=resnet_groups,
            dropout=dropout,
            time_embedding_norm=resnet_time_scale_shift,
            non_linearity=resnet_act_fn,
            output_scale_factor=output_scale_factor,
            pre_norm=resnet_pre_norm,
        )

        # The block opens with a resnet; each layer then adds (attention, resnet).
        resnet_layers = [ResnetBlock2D(**resnet_config)]
        attention_layers = []

        for layer_idx in range(num_layers):
            if dual_cross_attention:
                attention_block = DualTransformer2DModel(
                    num_attention_heads,
                    in_channels // num_attention_heads,
                    in_channels=in_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                )
            else:
                attention_block = Transformer2DModel(
                    num_attention_heads,
                    in_channels // num_attention_heads,
                    in_channels=in_channels,
                    num_layers=transformer_layers_per_block[layer_idx],
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                    use_linear_projection=use_linear_projection,
                    upcast_attention=upcast_attention,
                    attention_type=attention_type,
                )
            attention_layers.append(attention_block)
            resnet_layers.append(ResnetBlock2D(**resnet_config))

        self.attentions = paddle.nn.LayerList(sublayers=attention_layers)
        self.resnets = paddle.nn.LayerList(sublayers=resnet_layers)

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: paddle.Tensor,
        temb: Optional[paddle.Tensor] = None,
        encoder_hidden_states: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        encoder_attention_mask: Optional[paddle.Tensor] = None,
    ) -> paddle.Tensor:
        """
        Apply the leading resnet, then every (attention, resnet) pair.

        The LoRA scale is read from `cross_attention_kwargs["scale"]` (default 1.0). When training with
        `gradient_checkpointing` enabled, the resnet of each pair goes through `recompute` and does not receive
        the LoRA scale; the attention block is called directly in both modes.
        """
        if cross_attention_kwargs is None:
            lora_scale = 1.0
        else:
            lora_scale = cross_attention_kwargs.get("scale", 1.0)

        def as_positional_call(layer):
            # Bind `layer` now so a later recompute still calls the right module.
            def run(*inputs):
                return layer(*inputs)

            return run

        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)

        for attn, resnet in zip(self.attentions, self.resnets[1:]):
            # The attention call is the same in both modes; its second output is unused.
            hidden_states, _ = attn(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                attention_mask=attention_mask,
                encoder_attention_mask=encoder_attention_mask,
                return_dict=False,
            )

            if self.training and self.gradient_checkpointing:
                ckpt_kwargs = {} if recompute_use_reentrant() else {"use_reentrant": False}
                hidden_states = recompute(
                    as_positional_call(resnet),
                    hidden_states,
                    temb,
                    **ckpt_kwargs,
                )
            else:
                hidden_states = resnet(hidden_states, temb, scale=lora_scale)

        return hidden_states
498
+
499
+
500
class CrossAttnDownBlock2D(paddle.nn.Layer):
    """
    Down-sampling UNet block of `num_layers` (resnet, cross-attention) pairs, optionally followed by a
    `Downsample2D` layer.

    The attention blocks are `Transformer2DModel` instances, or `DualTransformer2DModel` when
    `dual_cross_attention` is true. Each attention call is expected to return a 2-tuple when invoked with
    `return_dict=False`; only the first element is used here.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        num_attention_heads: int = 1,
        cross_attention_dim: int = 1280,
        output_scale_factor: float = 1.0,
        downsample_padding: int = 1,
        add_downsample: bool = True,
        dual_cross_attention: bool = False,
        use_linear_projection: bool = False,
        only_cross_attention: bool = False,
        upcast_attention: bool = False,
        attention_type: str = "default",
    ):
        super().__init__()

        self.has_cross_attention = True
        self.num_attention_heads = num_attention_heads

        # A single int means "the same transformer depth for every layer".
        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * num_layers

        # Resnet settings that do not change from layer to layer.
        shared_resnet_config = dict(
            out_channels=out_channels,
            temb_channels=temb_channels,
            eps=resnet_eps,
            groups=resnet_groups,
            dropout=dropout,
            time_embedding_norm=resnet_time_scale_shift,
            non_linearity=resnet_act_fn,
            output_scale_factor=output_scale_factor,
            pre_norm=resnet_pre_norm,
        )

        resnet_layers = []
        attention_layers = []
        for layer_idx in range(num_layers):
            # Only the first resnet sees the block's input width.
            layer_in_channels = in_channels if layer_idx == 0 else out_channels
            resnet_layers.append(ResnetBlock2D(in_channels=layer_in_channels, **shared_resnet_config))

            if dual_cross_attention:
                attention_block = DualTransformer2DModel(
                    num_attention_heads,
                    out_channels // num_attention_heads,
                    in_channels=out_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                )
            else:
                attention_block = Transformer2DModel(
                    num_attention_heads,
                    out_channels // num_attention_heads,
                    in_channels=out_channels,
                    num_layers=transformer_layers_per_block[layer_idx],
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                    use_linear_projection=use_linear_projection,
                    only_cross_attention=only_cross_attention,
                    upcast_attention=upcast_attention,
                    attention_type=attention_type,
                )
            attention_layers.append(attention_block)

        self.attentions = paddle.nn.LayerList(sublayers=attention_layers)
        self.resnets = paddle.nn.LayerList(sublayers=resnet_layers)

        self.downsamplers = None
        if add_downsample:
            self.downsamplers = paddle.nn.LayerList(
                sublayers=[
                    Downsample2D(
                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                    )
                ]
            )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: paddle.Tensor,
        temb: Optional[paddle.Tensor] = None,
        encoder_hidden_states: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        encoder_attention_mask: Optional[paddle.Tensor] = None,
        additional_residuals: Optional[paddle.Tensor] = None,
    ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
        """
        Run every (resnet, attention) pair and the optional downsampler.

        Returns the final hidden states together with a tuple holding the output of each pair and, if a
        downsampler exists, the downsampled output. `additional_residuals`, when given, is added to the output of
        the last pair before it is recorded. When training with `gradient_checkpointing` enabled, the resnet goes
        through `recompute` and does not receive the LoRA scale.
        """
        if cross_attention_kwargs is None:
            lora_scale = 1.0
        else:
            lora_scale = cross_attention_kwargs.get("scale", 1.0)

        def as_positional_call(layer):
            # Bind `layer` now so a later recompute still calls the right module.
            def run(*inputs):
                return layer(*inputs)

            return run

        layer_pairs = list(zip(self.resnets, self.attentions))
        final_idx = len(layer_pairs) - 1
        collected = []

        for idx, (resnet, attn) in enumerate(layer_pairs):
            if self.training and self.gradient_checkpointing:
                ckpt_kwargs = {} if recompute_use_reentrant() else {"use_reentrant": False}
                hidden_states = recompute(
                    as_positional_call(resnet),
                    hidden_states,
                    temb,
                    **ckpt_kwargs,
                )
            else:
                hidden_states = resnet(hidden_states, temb, scale=lora_scale)

            # The attention call is the same in both modes; its second output is unused.
            hidden_states, _ = attn(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                attention_mask=attention_mask,
                encoder_attention_mask=encoder_attention_mask,
                return_dict=False,
            )

            # apply additional residuals to the output of the last pair of resnet and attention blocks
            if idx == final_idx and additional_residuals is not None:
                hidden_states = hidden_states + additional_residuals

            collected.append(hidden_states)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states, scale=lora_scale)

            collected.append(hidden_states)

        return hidden_states, tuple(collected)
663
+
664
+
665
class DownBlock2D(paddle.nn.Layer):
    """
    Down-sampling UNet block made of `num_layers` resnets, optionally followed by a `Downsample2D` layer.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor: float = 1.0,
        add_downsample: bool = True,
        downsample_padding: int = 1,
    ):
        super().__init__()

        # Only the first resnet sees the block's input width; the rest map out_channels -> out_channels.
        resnet_layers = [
            ResnetBlock2D(
                in_channels=in_channels if layer_idx == 0 else out_channels,
                out_channels=out_channels,
                temb_channels=temb_channels,
                eps=resnet_eps,
                groups=resnet_groups,
                dropout=dropout,
                time_embedding_norm=resnet_time_scale_shift,
                non_linearity=resnet_act_fn,
                output_scale_factor=output_scale_factor,
                pre_norm=resnet_pre_norm,
            )
            for layer_idx in range(num_layers)
        ]
        self.resnets = paddle.nn.LayerList(sublayers=resnet_layers)

        self.downsamplers = None
        if add_downsample:
            self.downsamplers = paddle.nn.LayerList(
                sublayers=[
                    Downsample2D(
                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                    )
                ]
            )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: paddle.Tensor,
        temb: Optional[paddle.Tensor] = None,
        scale: float = 1.0,
    ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
        """
        Run every resnet and the optional downsampler.

        Returns the final hidden states together with a tuple holding each resnet's output and, if a downsampler
        exists, the downsampled output. When training with `gradient_checkpointing` enabled, the resnets go
        through `recompute` and do not receive `scale`.
        """

        def as_positional_call(layer):
            # Bind `layer` now so a later recompute still calls the right module.
            def run(*inputs):
                return layer(*inputs)

            return run

        collected = []
        for resnet in self.resnets:
            if self.training and self.gradient_checkpointing:
                ckpt_kwargs = {} if recompute_use_reentrant() else {"use_reentrant": False}
                hidden_states = recompute(as_positional_call(resnet), hidden_states, temb, **ckpt_kwargs)
            else:
                hidden_states = resnet(hidden_states, temb, scale=scale)

            collected.append(hidden_states)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states, scale=scale)

            collected.append(hidden_states)

        return hidden_states, tuple(collected)
749
+
750
+
751
class CrossAttnUpBlock2D(paddle.nn.Layer):
    """
    Up-sampling UNet block of `num_layers` (resnet, cross-attention) pairs, optionally followed by an
    `Upsample2D` layer.

    Before each resnet the current hidden states are concatenated along axis 1 with a skip tensor taken from the
    end of `res_hidden_states_tuple`. The attention blocks are `Transformer2DModel` instances, or
    `DualTransformer2DModel` when `dual_cross_attention` is true; only the first element of their 2-tuple output
    is used.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        prev_output_channel: int,
        temb_channels: int,
        resolution_idx: Optional[int] = None,
        dropout: float = 0.0,
        num_layers: int = 1,
        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        num_attention_heads: int = 1,
        cross_attention_dim: int = 1280,
        output_scale_factor: float = 1.0,
        add_upsample: bool = True,
        dual_cross_attention: bool = False,
        use_linear_projection: bool = False,
        only_cross_attention: bool = False,
        upcast_attention: bool = False,
        attention_type: str = "default",
    ):
        super().__init__()

        self.has_cross_attention = True
        self.num_attention_heads = num_attention_heads

        # A single int means "the same transformer depth for every layer".
        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * num_layers

        # Resnet settings that do not change from layer to layer.
        shared_resnet_config = dict(
            out_channels=out_channels,
            temb_channels=temb_channels,
            eps=resnet_eps,
            groups=resnet_groups,
            dropout=dropout,
            time_embedding_norm=resnet_time_scale_shift,
            non_linearity=resnet_act_fn,
            output_scale_factor=output_scale_factor,
            pre_norm=resnet_pre_norm,
        )

        resnet_layers = []
        attention_layers = []
        for layer_idx in range(num_layers):
            # The last layer consumes a skip of width `in_channels`; earlier ones use `out_channels`.
            skip_width = in_channels if layer_idx == num_layers - 1 else out_channels
            # The first layer is fed by the previous block; later ones by this block's own output.
            main_width = prev_output_channel if layer_idx == 0 else out_channels

            resnet_layers.append(ResnetBlock2D(in_channels=main_width + skip_width, **shared_resnet_config))

            if dual_cross_attention:
                attention_block = DualTransformer2DModel(
                    num_attention_heads,
                    out_channels // num_attention_heads,
                    in_channels=out_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                )
            else:
                attention_block = Transformer2DModel(
                    num_attention_heads,
                    out_channels // num_attention_heads,
                    in_channels=out_channels,
                    num_layers=transformer_layers_per_block[layer_idx],
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                    use_linear_projection=use_linear_projection,
                    only_cross_attention=only_cross_attention,
                    upcast_attention=upcast_attention,
                    attention_type=attention_type,
                )
            attention_layers.append(attention_block)

        self.attentions = paddle.nn.LayerList(sublayers=attention_layers)
        self.resnets = paddle.nn.LayerList(sublayers=resnet_layers)

        self.upsamplers = None
        if add_upsample:
            self.upsamplers = paddle.nn.LayerList(
                sublayers=[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
            )

        self.gradient_checkpointing = False
        self.resolution_idx = resolution_idx

    def forward(
        self,
        hidden_states: paddle.Tensor,
        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
        temb: Optional[paddle.Tensor] = None,
        encoder_hidden_states: Optional[paddle.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        upsample_size: Optional[int] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        encoder_attention_mask: Optional[paddle.Tensor] = None,
    ) -> paddle.Tensor:
        """
        Run every (resnet, attention) pair on the skip-concatenated input, then the optional upsampler.

        Skip tensors are consumed from the end of `res_hidden_states_tuple`, one per pair. FreeU is applied via
        `apply_freeu` only when the attributes `s1`, `s2`, `b1` and `b2` are all present and truthy. When training
        with `gradient_checkpointing` enabled, the resnet goes through `recompute` and does not receive the LoRA
        scale.
        """
        if cross_attention_kwargs is None:
            lora_scale = 1.0
        else:
            lora_scale = cross_attention_kwargs.get("scale", 1.0)

        freeu_active = all(getattr(self, attr, None) for attr in ("s1", "s2", "b1", "b2"))

        def as_positional_call(layer):
            # Bind `layer` now so a later recompute still calls the right module.
            def run(*inputs):
                return layer(*inputs)

            return run

        for step, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
            # Take skip connections from the end of the tuple, newest first.
            res_hidden_states = res_hidden_states_tuple[-1 - step]

            # FreeU: Only operate on the first two stages
            if freeu_active:
                hidden_states, res_hidden_states = apply_freeu(
                    self.resolution_idx,
                    hidden_states,
                    res_hidden_states,
                    s1=self.s1,
                    s2=self.s2,
                    b1=self.b1,
                    b2=self.b2,
                )

            hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)

            if self.training and self.gradient_checkpointing:
                ckpt_kwargs = {} if recompute_use_reentrant() else {"use_reentrant": False}
                hidden_states = recompute(
                    as_positional_call(resnet),
                    hidden_states,
                    temb,
                    **ckpt_kwargs,
                )
            else:
                hidden_states = resnet(hidden_states, temb, scale=lora_scale)

            # The attention call is the same in both modes; its second output is unused.
            hidden_states, _ = attn(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                attention_mask=attention_mask,
                encoder_attention_mask=encoder_attention_mask,
                return_dict=False,
            )

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)

        return hidden_states
926
+
927
+
928
class UpBlock2D(paddle.nn.Layer):
    """
    Up-sampling UNet block made of `num_layers` resnets, optionally followed by an `Upsample2D` layer.

    Before each resnet the current hidden states are concatenated along axis 1 with a skip tensor taken from the
    end of `res_hidden_states_tuple`.
    """

    def __init__(
        self,
        in_channels: int,
        prev_output_channel: int,
        out_channels: int,
        temb_channels: int,
        resolution_idx: Optional[int] = None,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor: float = 1.0,
        add_upsample: bool = True,
    ):
        super().__init__()

        # Input width of each resnet = main path + skip path. The first layer is fed by the previous block
        # (`prev_output_channel`); the last layer's skip has width `in_channels`.
        resnet_layers = [
            ResnetBlock2D(
                in_channels=(prev_output_channel if layer_idx == 0 else out_channels)
                + (in_channels if layer_idx == num_layers - 1 else out_channels),
                out_channels=out_channels,
                temb_channels=temb_channels,
                eps=resnet_eps,
                groups=resnet_groups,
                dropout=dropout,
                time_embedding_norm=resnet_time_scale_shift,
                non_linearity=resnet_act_fn,
                output_scale_factor=output_scale_factor,
                pre_norm=resnet_pre_norm,
            )
            for layer_idx in range(num_layers)
        ]
        self.resnets = paddle.nn.LayerList(sublayers=resnet_layers)

        self.upsamplers = None
        if add_upsample:
            self.upsamplers = paddle.nn.LayerList(
                sublayers=[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
            )

        self.gradient_checkpointing = False
        self.resolution_idx = resolution_idx

    def forward(
        self,
        hidden_states: paddle.Tensor,
        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
        temb: Optional[paddle.Tensor] = None,
        upsample_size: Optional[int] = None,
        scale: float = 1.0,
    ) -> paddle.Tensor:
        """
        Run every resnet on the skip-concatenated input, then the optional upsampler.

        Skip tensors are consumed from the end of `res_hidden_states_tuple`, one per resnet. FreeU is applied via
        `apply_freeu` only when the attributes `s1`, `s2`, `b1` and `b2` are all present and truthy. When training
        with `gradient_checkpointing` enabled, the resnets go through `recompute` and do not receive `scale`.
        """
        freeu_active = all(getattr(self, attr, None) for attr in ("s1", "s2", "b1", "b2"))

        def as_positional_call(layer):
            # Bind `layer` now so a later recompute still calls the right module.
            def run(*inputs):
                return layer(*inputs)

            return run

        for step, resnet in enumerate(self.resnets):
            # Take skip connections from the end of the tuple, newest first.
            res_hidden_states = res_hidden_states_tuple[-1 - step]

            # FreeU: Only operate on the first two stages
            if freeu_active:
                hidden_states, res_hidden_states = apply_freeu(
                    self.resolution_idx,
                    hidden_states,
                    res_hidden_states,
                    s1=self.s1,
                    s2=self.s2,
                    b1=self.b1,
                    b2=self.b2,
                )

            hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)

            if self.training and self.gradient_checkpointing:
                ckpt_kwargs = {} if recompute_use_reentrant() else {"use_reentrant": False}
                hidden_states = recompute(as_positional_call(resnet), hidden_states, temb, **ckpt_kwargs)
            else:
                hidden_states = resnet(hidden_states, temb, scale=scale)

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states, upsample_size, scale=scale)

        return hidden_states
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_2d_condition.py ADDED
@@ -0,0 +1,1189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/huggingface/ppdiffusers/blob/main/src/ppdiffusers/models/unet_2d_condition.py
16
+ from dataclasses import dataclass
17
+ from typing import Any, Dict, List, Optional, Tuple, Union
18
+
19
+ import paddle
20
+
21
+ from ppdiffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from ppdiffusers.loaders import UNet2DConditionLoadersMixin
23
+ from ppdiffusers.models.activations import get_activation
24
+ from ppdiffusers.models.attention_processor import (
25
+ ADDED_KV_ATTENTION_PROCESSORS,
26
+ CROSS_ATTENTION_PROCESSORS,
27
+ AttentionProcessor,
28
+ AttnAddedKVProcessor,
29
+ AttnProcessor,
30
+ )
31
+ from ppdiffusers.models.embeddings import (
32
+ GaussianFourierProjection,
33
+ ImageHintTimeEmbedding,
34
+ ImageProjection,
35
+ ImageTimeEmbedding,
36
+ PositionNet,
37
+ TextImageProjection,
38
+ TextImageTimeEmbedding,
39
+ TextTimeEmbedding,
40
+ TimestepEmbedding,
41
+ Timesteps,
42
+ )
43
+ from ppdiffusers.models.modeling_utils import ModelMixin
44
+ from ppdiffusers.utils import (
45
+ USE_PEFT_BACKEND,
46
+ BaseOutput,
47
+ deprecate,
48
+ logging,
49
+ scale_lora_layers,
50
+ unscale_lora_layers,
51
+ )
52
+
53
+ from .unet_2d_blocks import (
54
+ UNetMidBlock2D,
55
+ UNetMidBlock2DCrossAttn,
56
+ get_down_block,
57
+ get_up_block,
58
+ )
59
+
60
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
61
+
62
+
63
@dataclass
class UNet2DConditionOutput(BaseOutput):
    """
    The output of [`UNet2DConditionModel`].

    Args:
        sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
        ref_features (`Tuple[paddle.Tensor]`, *optional*):
            NOTE(review): this field is not described anywhere in the visible source. Presumably it carries
            intermediate feature maps collected by the model for later use; confirm against `forward`.
    """

    # Both fields default to None, so an output can be built with either one alone.
    sample: paddle.Tensor = None
    ref_features: Tuple[paddle.Tensor] = None
75
+
76
+
77
+ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
78
+ r"""
79
+ A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
80
+ shaped output.
81
+
82
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
83
+ for all models (such as downloading or saving).
84
+
85
+ Parameters:
86
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
87
+ Height and width of input/output sample.
88
+ in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
89
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
90
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
91
+ flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
92
+ Whether to flip the sin to cos in the time embedding.
93
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
94
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
95
+ The tuple of downsample blocks to use.
96
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
97
+ Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
98
+ `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
99
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
100
+ The tuple of upsample blocks to use.
101
+ only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
102
+ Whether to include self-attention in the basic transformer blocks, see
103
+ [`~models.attention.BasicTransformerBlock`].
104
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
105
+ The tuple of output channels for each block.
106
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
107
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
108
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
109
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
110
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
111
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
112
+ If `None`, normalization and activation layers are skipped in post-processing.
113
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
114
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
115
+ The dimension of the cross attention features.
116
+ transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
117
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
118
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
119
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
120
+ reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
121
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
122
+ blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
123
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
124
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
125
+ encoder_hid_dim (`int`, *optional*, defaults to None):
126
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
127
+ dimension to `cross_attention_dim`.
128
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
129
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
130
+ embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
131
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
132
+ num_attention_heads (`int`, *optional*):
133
+ The number of attention heads. If not defined, defaults to `attention_head_dim`
134
+ resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
135
+ for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
136
+ class_embed_type (`str`, *optional*, defaults to `None`):
137
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
138
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
139
+ addition_embed_type (`str`, *optional*, defaults to `None`):
140
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
141
+ "text". "text" will use the `TextTimeEmbedding` layer.
142
+ addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
143
+ Dimension for the timestep embeddings.
144
+ num_class_embeds (`int`, *optional*, defaults to `None`):
145
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
146
+ class conditioning with `class_embed_type` equal to `None`.
147
+ time_embedding_type (`str`, *optional*, defaults to `positional`):
148
+ The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
149
+ time_embedding_dim (`int`, *optional*, defaults to `None`):
150
+ An optional override for the dimension of the projected time embedding.
151
+ time_embedding_act_fn (`str`, *optional*, defaults to `None`):
152
+ Optional activation function to use only once on the time embeddings before they are passed to the rest of
153
+ the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
154
+ timestep_post_act (`str`, *optional*, defaults to `None`):
155
+ The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
156
+ time_cond_proj_dim (`int`, *optional*, defaults to `None`):
157
+ The dimension of `cond_proj` layer in the timestep embedding.
158
+ conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`,
159
+ *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`,
160
+ *optional*): The dimension of the `class_labels` input when
161
+ `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
162
+ class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
163
+ embeddings with the class embeddings.
164
+ mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
165
+ Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
166
+ `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
167
+ `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
168
+ otherwise.
169
+ """
170
+
171
+ _supports_gradient_checkpointing = True
172
+
173
@register_to_config
def __init__(
    self,
    sample_size: Optional[int] = None,
    in_channels: int = 4,
    out_channels: int = 4,
    center_input_sample: bool = False,
    flip_sin_to_cos: bool = True,
    freq_shift: int = 0,
    down_block_types: Tuple[str] = (
        "CrossAttnDownBlock2D",
        "CrossAttnDownBlock2D",
        "CrossAttnDownBlock2D",
        "DownBlock2D",
    ),
    mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
    up_block_types: Tuple[str] = (
        "UpBlock2D",
        "CrossAttnUpBlock2D",
        "CrossAttnUpBlock2D",
        "CrossAttnUpBlock2D",
    ),
    only_cross_attention: Union[bool, Tuple[bool]] = False,
    block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
    layers_per_block: Union[int, Tuple[int]] = 2,
    downsample_padding: int = 1,
    mid_block_scale_factor: float = 1,
    dropout: float = 0.0,
    act_fn: str = "silu",
    norm_num_groups: Optional[int] = 32,
    norm_eps: float = 1e-5,
    cross_attention_dim: Union[int, Tuple[int]] = 1280,
    transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
    reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
    encoder_hid_dim: Optional[int] = None,
    encoder_hid_dim_type: Optional[str] = None,
    attention_head_dim: Union[int, Tuple[int]] = 8,
    num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
    dual_cross_attention: bool = False,
    use_linear_projection: bool = False,
    class_embed_type: Optional[str] = None,
    addition_embed_type: Optional[str] = None,
    addition_time_embed_dim: Optional[int] = None,
    num_class_embeds: Optional[int] = None,
    upcast_attention: bool = False,
    resnet_time_scale_shift: str = "default",
    resnet_skip_time_act: bool = False,
    resnet_out_scale_factor: float = 1.0,
    time_embedding_type: str = "positional",
    time_embedding_dim: Optional[int] = None,
    time_embedding_act_fn: Optional[str] = None,
    timestep_post_act: Optional[str] = None,
    time_cond_proj_dim: Optional[int] = None,
    conv_in_kernel: int = 3,
    conv_out_kernel: int = 3,
    projection_class_embeddings_input_dim: Optional[int] = None,
    attention_type: str = "default",
    class_embeddings_concat: bool = False,
    mid_block_only_cross_attention: Optional[bool] = None,
    cross_attention_norm: Optional[str] = None,
    addition_embed_type_num_heads: int = 64,
):
    """Validate the configuration and build every sublayer of the UNet.

    The constructor creates, in this order: `conv_in`, `time_proj`, `time_embedding`,
    `encoder_hid_proj`, `class_embedding`, optionally `add_embedding` (and
    `add_time_proj`), `time_embed_act`, `down_blocks`, `mid_block`, `up_blocks`,
    `conv_norm_out` / `conv_act`, and optionally `position_net`.

    `out_channels`, `center_input_sample` and `conv_out_kernel` are accepted (and
    recorded by `register_to_config`) but are not read anywhere in this constructor
    body; no `conv_out` layer is created here.

    Raises:
        ValueError: if `num_attention_heads` is passed, if per-block arguments do not
            match `down_block_types` in length, or if an embedding / projection /
            mid-block type is unknown or missing a required companion argument.
        NotImplementedError: if `mid_block_type` is `"UNetMidBlock2DSimpleCrossAttn"`.
    """
    super().__init__()

    self.sample_size = sample_size

    # Passing `num_attention_heads` explicitly is rejected outright; see the message for the reason.
    if num_attention_heads is not None:
        raise ValueError(
            "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/ppdiffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in ppdiffusers v0.19."
        )

    # If `num_attention_heads` is not defined (which is the case for most models)
    # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
    # The reason for this behavior is to correct for incorrectly named variables that were introduced
    # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/ppdiffusers/issues/2011#issuecomment-1547958131
    # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
    # which is why we correct for the naming here.
    # Because of the guard above, this always evaluates to `attention_head_dim`.
    num_attention_heads = num_attention_heads or attention_head_dim

    # Check inputs
    if len(down_block_types) != len(up_block_types):
        raise ValueError(
            f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
        )

    if len(block_out_channels) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
        )

    # NOTE(review): only `list` is length-checked here; a tuple of the wrong length is not caught.
    if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
        )
    # Nested per-block transformer layer counts require the caller to spell out the up-path counts too.
    if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
        for layer_number_per_block in transformer_layers_per_block:
            if isinstance(layer_number_per_block, list):
                raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")

    # input
    conv_in_padding = (conv_in_kernel - 1) // 2

    self.conv_in = paddle.nn.Conv2D(
        in_channels=in_channels,
        out_channels=block_out_channels[0],
        kernel_size=conv_in_kernel,
        padding=conv_in_padding,
    )

    # time
    if time_embedding_type == "fourier":
        time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
        if time_embed_dim % 2 != 0:
            raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
        self.time_proj = GaussianFourierProjection(
            time_embed_dim // 2,
            set_W_to_weight=False,
            log=False,
            flip_sin_to_cos=flip_sin_to_cos,
        )
        timestep_input_dim = time_embed_dim
    elif time_embedding_type == "positional":
        time_embed_dim = time_embedding_dim or block_out_channels[0] * 4

        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
        timestep_input_dim = block_out_channels[0]
    else:
        raise ValueError(
            f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
        )

    self.time_embedding = TimestepEmbedding(
        timestep_input_dim,
        time_embed_dim,
        act_fn=act_fn,
        post_act_fn=timestep_post_act,
        cond_proj_dim=time_cond_proj_dim,
    )

    # Giving only `encoder_hid_dim` implies a text projection; the inferred type is written back to the config.
    if encoder_hid_dim_type is None and encoder_hid_dim is not None:
        encoder_hid_dim_type = "text_proj"
        self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
        logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")

    if encoder_hid_dim is None and encoder_hid_dim_type is not None:
        raise ValueError(
            f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
        )

    if encoder_hid_dim_type == "text_proj":
        self.encoder_hid_proj = paddle.nn.Linear(in_features=encoder_hid_dim, out_features=cross_attention_dim)
    elif encoder_hid_dim_type == "text_image_proj":
        # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
        # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
        # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)
        self.encoder_hid_proj = TextImageProjection(
            text_embed_dim=encoder_hid_dim,
            image_embed_dim=cross_attention_dim,
            cross_attention_dim=cross_attention_dim,
        )
    elif encoder_hid_dim_type == "image_proj":
        # Kandinsky 2.2
        self.encoder_hid_proj = ImageProjection(
            image_embed_dim=encoder_hid_dim,
            cross_attention_dim=cross_attention_dim,
        )
    elif encoder_hid_dim_type is not None:
        # NOTE(review): the message below does not list 'image_proj', which is accepted above.
        raise ValueError(
            f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
        )
    else:
        self.encoder_hid_proj = None

    # class embedding
    if class_embed_type is None and num_class_embeds is not None:
        self.class_embedding = paddle.nn.Embedding(num_embeddings=num_class_embeds, embedding_dim=time_embed_dim)
    elif class_embed_type == "timestep":
        self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
    elif class_embed_type == "identity":
        # NOTE(review): "identity" builds the same Embedding as the `None` + `num_class_embeds` case above
        # and needs `num_class_embeds` to be set; confirm this is intended.
        self.class_embedding = paddle.nn.Embedding(num_embeddings=num_class_embeds, embedding_dim=time_embed_dim)
    elif class_embed_type == "projection":
        if projection_class_embeddings_input_dim is None:
            raise ValueError(
                "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
            )
        # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
        # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
        # 2. it projects from an arbitrary input dimension.
        #
        # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
        # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
        # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
        self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
    elif class_embed_type == "simple_projection":
        if projection_class_embeddings_input_dim is None:
            raise ValueError(
                "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
            )
        self.class_embedding = paddle.nn.Linear(
            in_features=projection_class_embeddings_input_dim, out_features=time_embed_dim
        )
    else:
        self.class_embedding = None

    # Optional additional embedding. When `addition_embed_type` is None no `add_embedding` attribute is set.
    if addition_embed_type == "text":
        if encoder_hid_dim is not None:
            text_time_embedding_from_dim = encoder_hid_dim
        else:
            text_time_embedding_from_dim = cross_attention_dim

        self.add_embedding = TextTimeEmbedding(
            text_time_embedding_from_dim,
            time_embed_dim,
            num_heads=addition_embed_type_num_heads,
        )
    elif addition_embed_type == "text_image":
        # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
        # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
        # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
        self.add_embedding = TextImageTimeEmbedding(
            text_embed_dim=cross_attention_dim,
            image_embed_dim=cross_attention_dim,
            time_embed_dim=time_embed_dim,
        )
    elif addition_embed_type == "text_time":
        self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
        self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
    elif addition_embed_type == "image":
        # Kandinsky 2.2
        self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
    elif addition_embed_type == "image_hint":
        # Kandinsky 2.2 ControlNet
        self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
    elif addition_embed_type is not None:
        # NOTE(review): the message below lists only 'text' and 'text_image', although 'text_time',
        # 'image' and 'image_hint' are accepted above.
        raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")

    if time_embedding_act_fn is None:
        self.time_embed_act = None
    else:
        self.time_embed_act = get_activation(time_embedding_act_fn)

    self.down_blocks = paddle.nn.LayerList(sublayers=[])
    self.up_blocks = paddle.nn.LayerList(sublayers=[])

    # Broadcast scalar settings to one entry per down block.
    if isinstance(only_cross_attention, bool):
        if mid_block_only_cross_attention is None:
            mid_block_only_cross_attention = only_cross_attention

        only_cross_attention = [only_cross_attention] * len(down_block_types)

    # NOTE(review): `mid_block_only_cross_attention` is resolved here but not read again in this constructor.
    if mid_block_only_cross_attention is None:
        mid_block_only_cross_attention = False

    if isinstance(num_attention_heads, int):
        num_attention_heads = (num_attention_heads,) * len(down_block_types)

    if isinstance(attention_head_dim, int):
        attention_head_dim = (attention_head_dim,) * len(down_block_types)

    if isinstance(cross_attention_dim, int):
        cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

    if isinstance(layers_per_block, int):
        layers_per_block = [layers_per_block] * len(down_block_types)

    if isinstance(transformer_layers_per_block, int):
        transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)

    if class_embeddings_concat:
        # The time embeddings are concatenated with the class embeddings. The dimension of the
        # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
        # regular time embeddings
        blocks_time_embed_dim = time_embed_dim * 2
    else:
        blocks_time_embed_dim = time_embed_dim

    # down
    output_channel = block_out_channels[0]
    for i, down_block_type in enumerate(down_block_types):
        # Each block's input width is the previous block's output width.
        input_channel = output_channel
        output_channel = block_out_channels[i]
        is_final_block = i == len(block_out_channels) - 1

        down_block = get_down_block(
            down_block_type,
            num_layers=layers_per_block[i],
            transformer_layers_per_block=transformer_layers_per_block[i],
            in_channels=input_channel,
            out_channels=output_channel,
            temb_channels=blocks_time_embed_dim,
            add_downsample=not is_final_block,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            resnet_groups=norm_num_groups,
            cross_attention_dim=cross_attention_dim[i],
            num_attention_heads=num_attention_heads[i],
            downsample_padding=downsample_padding,
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention[i],
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
            attention_type=attention_type,
            resnet_skip_time_act=resnet_skip_time_act,
            resnet_out_scale_factor=resnet_out_scale_factor,
            cross_attention_norm=cross_attention_norm,
            attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
            dropout=dropout,
        )
        self.down_blocks.append(down_block)

    # mid
    if mid_block_type == "UNetMidBlock2DCrossAttn":
        self.mid_block = UNetMidBlock2DCrossAttn(
            transformer_layers_per_block=transformer_layers_per_block[-1],
            in_channels=block_out_channels[-1],
            temb_channels=blocks_time_embed_dim,
            dropout=dropout,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            resnet_time_scale_shift=resnet_time_scale_shift,
            cross_attention_dim=cross_attention_dim[-1],
            num_attention_heads=num_attention_heads[-1],
            resnet_groups=norm_num_groups,
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            upcast_attention=upcast_attention,
            attention_type=attention_type,
        )
    elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
        raise NotImplementedError(f"Unsupport mid_block_type: {mid_block_type}")
    elif mid_block_type == "UNetMidBlock2D":
        self.mid_block = UNetMidBlock2D(
            in_channels=block_out_channels[-1],
            temb_channels=blocks_time_embed_dim,
            dropout=dropout,
            num_layers=0,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            resnet_groups=norm_num_groups,
            resnet_time_scale_shift=resnet_time_scale_shift,
            add_attention=False,
        )
    elif mid_block_type is None:
        self.mid_block = None
    else:
        raise ValueError(f"unknown mid_block_type : {mid_block_type}")

    # count how many layers upsample the images
    self.num_upsamplers = 0

    # up
    # The up path walks the per-block settings in reverse order.
    reversed_block_out_channels = list(reversed(block_out_channels))
    reversed_num_attention_heads = list(reversed(num_attention_heads))
    reversed_layers_per_block = list(reversed(layers_per_block))
    reversed_cross_attention_dim = list(reversed(cross_attention_dim))
    reversed_transformer_layers_per_block = (
        list(reversed(transformer_layers_per_block))
        if reverse_transformer_layers_per_block is None
        else reverse_transformer_layers_per_block
    )
    only_cross_attention = list(reversed(only_cross_attention))

    output_channel = reversed_block_out_channels[0]
    for i, up_block_type in enumerate(up_block_types):
        is_final_block = i == len(block_out_channels) - 1

        prev_output_channel = output_channel
        output_channel = reversed_block_out_channels[i]
        # Clamped so the last up block reuses the final reversed entry instead of indexing past the end.
        input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

        # add upsample block for all BUT final layer
        if not is_final_block:
            add_upsample = True
            self.num_upsamplers += 1
        else:
            add_upsample = False

        # NOTE(review): `attention_head_dim[i]` below is indexed in forward order, unlike the other
        # per-block settings which use their reversed copies; confirm before changing.
        up_block = get_up_block(
            up_block_type,
            num_layers=reversed_layers_per_block[i] + 1,
            transformer_layers_per_block=reversed_transformer_layers_per_block[i],
            in_channels=input_channel,
            out_channels=output_channel,
            prev_output_channel=prev_output_channel,
            temb_channels=blocks_time_embed_dim,
            add_upsample=add_upsample,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            resolution_idx=i,
            resnet_groups=norm_num_groups,
            cross_attention_dim=reversed_cross_attention_dim[i],
            num_attention_heads=reversed_num_attention_heads[i],
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention[i],
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
            attention_type=attention_type,
            resnet_skip_time_act=resnet_skip_time_act,
            resnet_out_scale_factor=resnet_out_scale_factor,
            cross_attention_norm=cross_attention_norm,
            attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
            dropout=dropout,
        )
        self.up_blocks.append(up_block)
        # This value is overwritten at the top of the next iteration from `output_channel`.
        prev_output_channel = output_channel

    # out
    if norm_num_groups is not None:

        self.conv_norm_out = paddle.nn.GroupNorm(
            num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps
        )

        self.conv_act = get_activation(act_fn)

    else:
        self.conv_norm_out = None
        self.conv_act = None
    # NOTE(review): this unconditionally discards the GroupNorm built above, so `conv_norm_out` is
    # always None after construction. It looks deliberate for this adapted model; confirm before removing.
    self.conv_norm_out = None

    if attention_type in ["gated", "gated-text-image"]:
        positive_len = 768
        # NOTE(review): an int `cross_attention_dim` was already expanded to a tuple earlier in this
        # constructor, so the int branch cannot be taken and `out_dim` below receives a sequence; verify.
        if isinstance(cross_attention_dim, int):
            positive_len = cross_attention_dim
        elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
            positive_len = cross_attention_dim[0]

        feature_type = "text-only" if attention_type == "gated" else "text-image"
        self.position_net = PositionNet(
            positive_len=positive_len,
            out_dim=cross_attention_dim,
            feature_type=feature_type,
        )
630
+
631
@property
def attn_processors(self) -> Dict[str, AttentionProcessor]:
    r"""
    Gather the attention processors of every sub-layer of the model.

    Returns:
        `dict`: one entry per layer that exposes `get_processor`. The key is the dotted path of the layer
        (built from `named_children` names) followed by `.processor`; the value is whatever
        `get_processor(return_deprecated_lora=True)` returns for that layer.
    """
    collected = {}

    def _visit(path: str, layer: paddle.nn.Layer) -> None:
        # A layer contributes its own entry first, then its descendants (pre-order walk).
        if hasattr(layer, "get_processor"):
            collected[f"{path}.processor"] = layer.get_processor(return_deprecated_lora=True)

        for child_name, child_layer in layer.named_children():
            _visit(f"{path}.{child_name}", child_layer)

    for top_name, top_layer in self.named_children():
        _visit(top_name, top_layer)

    return collected
658
+
659
def set_attn_processor(
    self,
    processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]],
    _remove_lora=False,
):
    r"""
    Install the attention processor(s) used to compute attention.

    Parameters:
        processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
            Either a single instantiated processor, which is handed to **every** layer exposing
            `set_processor`, or a dictionary with one processor per such layer.

            When a dictionary is given, each key must be the dotted path of the layer followed by
            `.processor`. Entries are popped from the dictionary as they are assigned, so the passed
            dictionary is consumed by this call.
        _remove_lora (`bool`, defaults to `False`):
            Forwarded unchanged to each layer's `set_processor`.

    Raises:
        ValueError: if a dictionary is passed whose size differs from the number of entries in
            `self.attn_processors`.
    """
    count = len(self.attn_processors.keys())
    per_layer = isinstance(processor, dict)

    if per_layer and len(processor) != count:
        raise ValueError(
            f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
            f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
        )

    def _assign(path: str, layer: paddle.nn.Layer) -> None:
        if hasattr(layer, "set_processor"):
            # Dict mode: take (and remove) the entry registered under this layer's path.
            chosen = processor.pop(f"{path}.processor") if per_layer else processor
            layer.set_processor(chosen, _remove_lora=_remove_lora)

        for child_name, child_layer in layer.named_children():
            _assign(f"{path}.{child_name}", child_layer)

    for top_name, top_layer in self.named_children():
        _assign(top_name, top_layer)
696
+
697
def set_default_attn_processor(self):
    """
    Disables custom attention processors and sets the default attention implementation.

    The default is chosen from the classes of the processors currently installed:
    `AttnAddedKVProcessor` when every one of them belongs to `ADDED_KV_ATTENTION_PROCESSORS`,
    `AttnProcessor` when every one of them belongs to `CROSS_ATTENTION_PROCESSORS`.

    Raises:
        ValueError: if the installed processors fit neither group.
    """
    # `attn_processors` rebuilds its dict by walking the module tree on every access,
    # so read it once and reuse the values for both checks and the error message.
    current = list(self.attn_processors.values())

    if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in current):
        processor = AttnAddedKVProcessor()
    elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in current):
        processor = AttnProcessor()
    else:
        # `current` cannot be empty here: an empty list satisfies the first `all(...)`.
        raise ValueError(
            f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(current))}"
        )

    self.set_attn_processor(processor, _remove_lora=True)
711
+
712
def set_attention_slice(self, slice_size):
    r"""
    Enable sliced attention computation.

    When this option is enabled, the attention module splits the input tensor in slices to compute attention in
    several steps. This is useful for saving some memory in exchange for a small decrease in speed.

    Args:
        slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
            When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
            `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
            provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
            must be a multiple of `slice_size`. A list supplies one value per sliceable layer, in traversal order.

    Raises:
        ValueError: if a list of the wrong length is given, or if a slice size exceeds the
            `sliceable_head_dim` of the layer it is meant for.
    """
    head_dims = []

    def _gather_dims(layer: paddle.nn.Layer) -> None:
        if hasattr(layer, "set_attention_slice"):
            head_dims.append(layer.sliceable_head_dim)

        for sub_layer in layer.children():
            _gather_dims(sub_layer)

    # first pass: record the head dim of every sliceable layer, in traversal order
    for top_layer in self.children():
        _gather_dims(top_layer)

    layer_count = len(head_dims)

    if slice_size == "auto":
        # half the attention head size is usually a good trade-off between speed and memory
        slice_size = [dim // 2 for dim in head_dims]
    elif slice_size == "max":
        # smallest slice possible
        slice_size = layer_count * [1]

    # a scalar applies to every sliceable layer
    if not isinstance(slice_size, list):
        slice_size = layer_count * [slice_size]

    if len(slice_size) != len(head_dims):
        raise ValueError(
            f"You have provided {len(slice_size)}, but {self.config} has {len(head_dims)} different"
            f" attention layers. Make sure to match `len(slice_size)` to be {len(head_dims)}."
        )

    for size, dim in zip(slice_size, head_dims):
        if size is not None and size > dim:
            raise ValueError(f"size {size} has to be smaller or equal to {dim}.")

    # second pass: hand each sliceable layer its value. The list is reversed so that
    # `pop()` (which takes from the end) yields the values in traversal order.
    pending = list(reversed(slice_size))

    def _apply(layer: paddle.nn.Layer) -> None:
        if hasattr(layer, "set_attention_slice"):
            layer.set_attention_slice(pending.pop())

        for sub_layer in layer.children():
            _apply(sub_layer)

    for top_layer in self.children():
        _apply(top_layer)
776
+
777
def _set_gradient_checkpointing(self, module, value=False):
    """Set `module.gradient_checkpointing` to `value`; modules without that attribute are left untouched."""
    if not hasattr(module, "gradient_checkpointing"):
        return
    module.gradient_checkpointing = value
780
+
781
def enable_freeu(self, s1, s2, b1, b2):
    r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.

    The four factors are stored as attributes `s1`, `s2`, `b1` and `b2` on every block in `self.up_blocks`.
    The suffixes after the scaling factors represent the stage blocks where they are being applied.

    Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
    are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

    Args:
        s1 (`float`):
            Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
            mitigate the "oversmoothing effect" in the enhanced denoising process.
        s2 (`float`):
            Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
            mitigate the "oversmoothing effect" in the enhanced denoising process.
        b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
        b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
    """
    for block in self.up_blocks:
        block.s1 = s1
        block.s2 = s2
        block.b1 = b1
        block.b2 = b2
804
+
805
def disable_freeu(self):
    """Disables the FreeU mechanism.

    Every FreeU attribute (`s1`, `s2`, `b1`, `b2`) that is present on a block of `self.up_blocks` is reset
    to `None`; attributes that a block does not have are not created.
    """
    # A tuple keeps the reset order fixed; the original iterated a set.
    freeu_keys = ("s1", "s2", "b1", "b2")
    for upsample_block in self.up_blocks:
        for key in freeu_keys:
            # `hasattr` alone is sufficient: a non-None `getattr` result implies the attribute exists.
            if hasattr(upsample_block, key):
                setattr(upsample_block, key, None)
812
+
813
def forward(
    self,
    sample: paddle.Tensor,
    timestep: Union[paddle.Tensor, float, int],
    encoder_hidden_states: paddle.Tensor,
    class_labels: Optional[paddle.Tensor] = None,
    timestep_cond: Optional[paddle.Tensor] = None,
    attention_mask: Optional[paddle.Tensor] = None,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    added_cond_kwargs: Optional[Dict[str, paddle.Tensor]] = None,
    down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
    mid_block_additional_residual: Optional[paddle.Tensor] = None,
    down_intrablock_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
    encoder_attention_mask: Optional[paddle.Tensor] = None,
    return_dict: bool = True,
) -> Union[UNet2DConditionOutput, Tuple]:
    r"""
    The [`UNet2DConditionModel`] forward method.

    Note that in this variant the final `conv_norm_out` / `conv_act` / `conv_out` post-processing is disabled
    (see step 6 in the body): the returned sample is the output of the last up block.

    Args:
        sample (`paddle.Tensor`):
            The noisy input tensor with the following shape `(batch, channel, height, width)`.
        timestep (`paddle.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
        encoder_hidden_states (`paddle.Tensor`):
            The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
        class_labels (`paddle.Tensor`, *optional*, defaults to `None`):
            Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
        timestep_cond: (`paddle.Tensor`, *optional*, defaults to `None`):
            Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
            through the `self.time_embedding` layer to obtain the timestep embeddings.
        attention_mask (`paddle.Tensor`, *optional*, defaults to `None`):
            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
            is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
            negative values to the attention scores corresponding to "discard" tokens.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [ppdiffusers.models.attention_processor](https://github.com/huggingface/ppdiffusers/blob/main/src/ppdiffusers/models/attention_processor.py).
            A `"scale"` entry is read as the LoRA scale and a `"gligen"` entry is routed through
            `self.position_net`.
        added_cond_kwargs: (`dict`, *optional*):
            A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
            are passed along to the UNet blocks.
        down_block_additional_residuals (`tuple` of `paddle.Tensor`, *optional*):
            Additional residuals to be added to UNet long skip connections from down blocks to up blocks, for
            example from ControlNet side model(s).
        mid_block_additional_residual (`paddle.Tensor`, *optional*):
            Additional residual to be added to UNet mid block output, for example from ControlNet side model.
        down_intrablock_additional_residuals (`tuple` of `paddle.Tensor`, *optional*):
            Additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s).
            Items are consumed with `pop(0)`, so the container must support that call.
        encoder_attention_mask (`paddle.Tensor`):
            A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
            `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
            which adds large negative values to the attention scores corresponding to "discard" tokens.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
            tuple.

    Returns:
        [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
            If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
            a `tuple` is returned where the first element is the sample tensor.

    Raises:
        ValueError: if class conditioning is configured but `class_labels` is missing, or if the configured
            `addition_embed_type` / `encoder_hid_dim_type` requires an entry that is absent from
            `added_cond_kwargs`.
    """
    # By default samples have to be AT least a multiple of the overall upsampling factor.
    # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
    # However, the upsampling interpolation output size can be forced to fit any upsampling size
    # on the fly if necessary.
    default_overall_up_factor = 2**self.num_upsamplers

    # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
    forward_upsample_size = False
    upsample_size = None

    for dim in sample.shape[-2:]:
        if dim % default_overall_up_factor != 0:
            # Forward upsample size to force interpolation output size.
            forward_upsample_size = True
            break

    # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
    # expects mask of shape:
    #   [batch, key_tokens]
    # adds singleton query_tokens dimension:
    #   [batch,                    1, key_tokens]
    # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
    #   [batch,  heads, query_tokens, key_tokens]
    #   [batch * heads, query_tokens, key_tokens]
    if attention_mask is not None:
        # assume that mask is expressed as:
        #   (1 = keep,      0 = discard)
        # convert mask into a bias that can be added to attention scores:
        #   (keep = +0,     discard = -10000.0)
        attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
        attention_mask = attention_mask.unsqueeze(1)

    # convert encoder_attention_mask to a bias the same way we do for attention_mask
    if encoder_attention_mask is not None:
        encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
        encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

    # 0. center input if necessary
    if self.config.center_input_sample:
        sample = 2 * sample - 1.0

    # 1. time
    timesteps = timestep
    if not paddle.is_tensor(x=timesteps):
        # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
        # The torch-style MPS probe (`sample.device.type == "mps"`) is dropped: Paddle has no MPS
        # backend, so the non-MPS (64-bit) dtypes always apply.
        dtype = "float64" if isinstance(timestep, float) else "int64"
        # `paddle.to_tensor` is the supported way to build a tensor from Python data.
        timesteps = paddle.to_tensor([timesteps], dtype=dtype)
    elif len(timesteps.shape) == 0:
        timesteps = timesteps[None]

    # broadcast to batch dimension; paddle's `expand` takes the target shape as a list
    timesteps = timesteps.expand([sample.shape[0]])

    t_emb = self.time_proj(timesteps)

    # `Timesteps` does not contain any weights and will always return f32 tensors
    # but time_embedding might actually be running in fp16. so we need to cast here.
    # there might be better ways to encapsulate this.
    t_emb = t_emb.to(dtype=sample.dtype)

    emb = self.time_embedding(t_emb, timestep_cond)
    aug_emb = None

    if self.class_embedding is not None:
        if class_labels is None:
            raise ValueError("class_labels should be provided when num_class_embeds > 0")

        if self.config.class_embed_type == "timestep":
            class_labels = self.time_proj(class_labels)

            # `Timesteps` does not contain any weights and will always return f32 tensors
            # there might be better ways to encapsulate this.
            class_labels = class_labels.to(dtype=sample.dtype)

        class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)

        if self.config.class_embeddings_concat:
            emb = paddle.concat(x=[emb, class_emb], axis=-1)
        else:
            emb = emb + class_emb

    if self.config.addition_embed_type == "text":
        aug_emb = self.add_embedding(encoder_hidden_states)
    elif self.config.addition_embed_type == "text_image":
        # Kandinsky 2.1 - style
        if "image_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
            )

        image_embs = added_cond_kwargs.get("image_embeds")
        text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
        aug_emb = self.add_embedding(text_embs, image_embs)
    elif self.config.addition_embed_type == "text_time":
        # SDXL - style
        if "text_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
            )
        text_embeds = added_cond_kwargs.get("text_embeds")
        if "time_ids" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
            )
        time_ids = added_cond_kwargs.get("time_ids")
        time_embeds = self.add_time_proj(time_ids.flatten())
        time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
        add_embeds = paddle.concat(x=[text_embeds, time_embeds], axis=-1)
        add_embeds = add_embeds.to(emb.dtype)
        aug_emb = self.add_embedding(add_embeds)
    elif self.config.addition_embed_type == "image":
        # Kandinsky 2.2 - style
        if "image_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
            )
        image_embs = added_cond_kwargs.get("image_embeds")
        aug_emb = self.add_embedding(image_embs)
    elif self.config.addition_embed_type == "image_hint":
        # Kandinsky 2.2 - style
        if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
            )
        image_embs = added_cond_kwargs.get("image_embeds")
        hint = added_cond_kwargs.get("hint")
        aug_emb, hint = self.add_embedding(image_embs, hint)
        # the hint is concatenated to the sample along the channel axis
        sample = paddle.concat(x=[sample, hint], axis=1)

    emb = emb + aug_emb if aug_emb is not None else emb

    if self.time_embed_act is not None:
        emb = self.time_embed_act(emb)

    if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
        encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
    elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
        # Kandinsky 2.1 - style
        if "image_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
            )

        image_embeds = added_cond_kwargs.get("image_embeds")
        encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
    elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
        # Kandinsky 2.2 - style
        if "image_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
            )
        image_embeds = added_cond_kwargs.get("image_embeds")
        encoder_hidden_states = self.encoder_hid_proj(image_embeds)
    elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
        if "image_embeds" not in added_cond_kwargs:
            raise ValueError(
                f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
            )
        image_embeds = added_cond_kwargs.get("image_embeds")
        image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype)

        # projected image embeddings are appended to the text tokens along the sequence axis
        encoder_hidden_states = paddle.concat(x=[encoder_hidden_states, image_embeds], axis=1)

    # 2. pre-process
    sample = self.conv_in(sample)

    # 2.5 GLIGEN position net
    if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
        # work on a copy so the caller's dict is not modified
        cross_attention_kwargs = cross_attention_kwargs.copy()
        gligen_args = cross_attention_kwargs.pop("gligen")
        cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}

    # 3. down
    lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
    if USE_PEFT_BACKEND:
        # weight the lora layers by setting `lora_scale` for each PEFT layer
        scale_lora_layers(self, lora_scale)

    is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
    # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
    is_adapter = down_intrablock_additional_residuals is not None
    # maintain backward compatibility for legacy usage, where
    #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
    #       but can only use one or the other
    if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
        deprecate(
            "T2I should not use down_block_additional_residuals",
            "1.3.0",
            "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated "
            "and will be removed in ppdiffusers 1.3.0. `down_block_additional_residuals` should only be used "
            "for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
            standard_warn=False,
        )
        down_intrablock_additional_residuals = down_block_additional_residuals
        is_adapter = True

    down_block_res_samples = (sample,)
    for downsample_block in self.down_blocks:
        if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
            # For t2i-adapter CrossAttnDownBlock2D
            additional_residuals = {}
            if is_adapter and len(down_intrablock_additional_residuals) > 0:
                additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)

            sample, res_samples = downsample_block(
                hidden_states=sample,
                temb=emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
                encoder_attention_mask=encoder_attention_mask,
                **additional_residuals,
            )
        else:
            sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
            if is_adapter and len(down_intrablock_additional_residuals) > 0:
                sample += down_intrablock_additional_residuals.pop(0)

        down_block_res_samples += res_samples

    if is_controlnet:
        new_down_block_res_samples = ()

        for down_block_res_sample, down_block_additional_residual in zip(
            down_block_res_samples, down_block_additional_residuals
        ):
            down_block_res_sample = down_block_res_sample + down_block_additional_residual
            new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)

        down_block_res_samples = new_down_block_res_samples

    # 4. mid
    if self.mid_block is not None:
        if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
                encoder_attention_mask=encoder_attention_mask,
            )
        else:
            sample = self.mid_block(sample, emb)

        # To support T2I-Adapter-XL
        if (
            is_adapter
            and len(down_intrablock_additional_residuals) > 0
            and sample.shape == down_intrablock_additional_residuals[0].shape
        ):
            sample += down_intrablock_additional_residuals.pop(0)

    if is_controlnet:
        sample = sample + mid_block_additional_residual

    # 5. up
    for i, upsample_block in enumerate(self.up_blocks):
        is_final_block = i == len(self.up_blocks) - 1

        # each up block consumes as many skip tensors as it has resnets, taken from the end
        res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
        down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

        # if we have not reached the final block and need to forward the
        # upsample size, we do it here
        if not is_final_block and forward_upsample_size:
            upsample_size = down_block_res_samples[-1].shape[2:]

        if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
            sample = upsample_block(
                hidden_states=sample,
                temb=emb,
                res_hidden_states_tuple=res_samples,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                upsample_size=upsample_size,
                attention_mask=attention_mask,
                encoder_attention_mask=encoder_attention_mask,
            )
        else:
            sample = upsample_block(
                hidden_states=sample,
                temb=emb,
                res_hidden_states_tuple=res_samples,
                upsample_size=upsample_size,
                scale=lora_scale,
            )

    # 6. post-process
    # NOTE(review): the output head is left disabled in this variant, so the sample returned
    # below is the raw output of the last up block. Confirm with callers before re-enabling.
    # if self.conv_norm_out:
    #     sample = self.conv_norm_out(sample)
    #     sample = self.conv_act(sample)
    # sample = self.conv_out(sample)

    if USE_PEFT_BACKEND:
        # remove `lora_scale` from each PEFT layer
        unscale_lora_layers(self, lora_scale)

    if not return_dict:
        return (sample,)

    return UNet2DConditionOutput(sample=sample)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/animate_anyone/unet_3d_blocks.py ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/huggingface/ppdiffusers/blob/main/src/ppdiffusers/models/unet_2d_blocks.py
16
+
17
+ import paddle
18
+
19
+ from .motion_module import get_motion_module
20
+ from .resnet import Downsample3D, ResnetBlock3D, Upsample3D
21
+ from .transformer_3d import Transformer3DModel
22
+
23
+
24
def get_down_block(
    down_block_type,
    num_layers,
    in_channels,
    out_channels,
    temb_channels,
    add_downsample,
    resnet_eps,
    resnet_act_fn,
    attn_num_head_channels,
    resnet_groups=None,
    cross_attention_dim=None,
    downsample_padding=None,
    dual_cross_attention=False,
    use_linear_projection=False,
    only_cross_attention=False,
    upcast_attention=False,
    resnet_time_scale_shift="default",
    unet_use_cross_frame_attention=None,
    unet_use_temporal_attention=None,
    use_inflated_groupnorm=None,
    use_motion_module=None,
    motion_module_type=None,
    motion_module_kwargs=None,
):
    """Instantiate the down block selected by `down_block_type`.

    A leading `"UNetRes"` in the type name is ignored. `"DownBlock3D"` yields a `DownBlock3D`;
    `"CrossAttnDownBlock3D"` yields a `CrossAttnDownBlock3D`, which additionally receives the
    attention-related arguments. All arguments are forwarded to the block constructor by keyword.

    Raises:
        ValueError: if `down_block_type` names no known block, or if a `CrossAttnDownBlock3D` is requested
            without `cross_attention_dim`.
    """
    if down_block_type.startswith("UNetRes"):
        down_block_type = down_block_type[7:]

    # keyword arguments accepted by both block classes
    shared_kwargs = dict(
        num_layers=num_layers,
        in_channels=in_channels,
        out_channels=out_channels,
        temb_channels=temb_channels,
        add_downsample=add_downsample,
        resnet_eps=resnet_eps,
        resnet_act_fn=resnet_act_fn,
        resnet_groups=resnet_groups,
        downsample_padding=downsample_padding,
        resnet_time_scale_shift=resnet_time_scale_shift,
        use_inflated_groupnorm=use_inflated_groupnorm,
        use_motion_module=use_motion_module,
        motion_module_type=motion_module_type,
        motion_module_kwargs=motion_module_kwargs,
    )

    if down_block_type == "DownBlock3D":
        return DownBlock3D(**shared_kwargs)

    if down_block_type == "CrossAttnDownBlock3D":
        if cross_attention_dim is None:
            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
        return CrossAttnDownBlock3D(
            cross_attention_dim=cross_attention_dim,
            attn_num_head_channels=attn_num_head_channels,
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention,
            upcast_attention=upcast_attention,
            unet_use_cross_frame_attention=unet_use_cross_frame_attention,
            unet_use_temporal_attention=unet_use_temporal_attention,
            **shared_kwargs,
        )

    raise ValueError(f"{down_block_type} does not exist.")
95
+
96
+
97
def get_up_block(
    up_block_type,
    num_layers,
    in_channels,
    out_channels,
    prev_output_channel,
    temb_channels,
    add_upsample,
    resnet_eps,
    resnet_act_fn,
    attn_num_head_channels,
    resnet_groups=None,
    cross_attention_dim=None,
    dual_cross_attention=False,
    use_linear_projection=False,
    only_cross_attention=False,
    upcast_attention=False,
    resnet_time_scale_shift="default",
    unet_use_cross_frame_attention=None,
    unet_use_temporal_attention=None,
    use_inflated_groupnorm=None,
    use_motion_module=None,
    motion_module_type=None,
    motion_module_kwargs=None,
):
    """Build the up block named by ``up_block_type``.

    A leading ``"UNetRes"`` prefix on the name is stripped before matching.
    Supported names are ``"UpBlock3D"`` and ``"CrossAttnUpBlock3D"``.

    Raises:
        ValueError: if the name is not supported, or if
            ``"CrossAttnUpBlock3D"`` is requested without ``cross_attention_dim``.
    """
    block_name = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type

    # Keyword arguments forwarded to both block variants.
    shared_kwargs = dict(
        num_layers=num_layers,
        in_channels=in_channels,
        out_channels=out_channels,
        prev_output_channel=prev_output_channel,
        temb_channels=temb_channels,
        add_upsample=add_upsample,
        resnet_eps=resnet_eps,
        resnet_act_fn=resnet_act_fn,
        resnet_groups=resnet_groups,
        resnet_time_scale_shift=resnet_time_scale_shift,
        use_inflated_groupnorm=use_inflated_groupnorm,
        use_motion_module=use_motion_module,
        motion_module_type=motion_module_type,
        motion_module_kwargs=motion_module_kwargs,
    )

    if block_name == "UpBlock3D":
        return UpBlock3D(**shared_kwargs)

    if block_name == "CrossAttnUpBlock3D":
        if cross_attention_dim is None:
            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
        # Attention-specific arguments only apply to the cross-attention variant.
        return CrossAttnUpBlock3D(
            cross_attention_dim=cross_attention_dim,
            attn_num_head_channels=attn_num_head_channels,
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention,
            upcast_attention=upcast_attention,
            unet_use_cross_frame_attention=unet_use_cross_frame_attention,
            unet_use_temporal_attention=unet_use_temporal_attention,
            **shared_kwargs,
        )

    raise ValueError(f"{block_name} does not exist.")
168
+
169
+
170
class UNetMidBlock3DCrossAttn(paddle.nn.Layer):
    """Middle UNet block.

    Holds one leading ``ResnetBlock3D`` followed by ``num_layers`` groups of
    (``Transformer3DModel``, optional motion module, ``ResnetBlock3D``).
    Channel count is ``in_channels`` throughout.
    """

    def __init__(
        self,
        in_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        output_scale_factor=1.0,
        cross_attention_dim=1280,
        dual_cross_attention=False,
        use_linear_projection=False,
        upcast_attention=False,
        unet_use_cross_frame_attention=None,
        unet_use_temporal_attention=None,
        use_inflated_groupnorm=None,
        use_motion_module=None,
        motion_module_type=None,
        motion_module_kwargs=None,
    ):
        super().__init__()

        self.has_cross_attention = True
        self.attn_num_head_channels = attn_num_head_channels
        if resnet_groups is None:
            resnet_groups = min(in_channels // 4, 32)

        def build_resnet():
            # Every resnet in this block shares the same configuration.
            return ResnetBlock3D(
                in_channels=in_channels,
                out_channels=in_channels,
                temb_channels=temb_channels,
                eps=resnet_eps,
                groups=resnet_groups,
                dropout=dropout,
                time_embedding_norm=resnet_time_scale_shift,
                non_linearity=resnet_act_fn,
                output_scale_factor=output_scale_factor,
                pre_norm=resnet_pre_norm,
                use_inflated_groupnorm=use_inflated_groupnorm,
            )

        # There is always at least one resnet, even when num_layers is 0.
        resnet_layers = [build_resnet()]
        attention_layers = []
        motion_layers = []

        for _ in range(num_layers):
            if dual_cross_attention:
                raise NotImplementedError
            attention_layers.append(
                Transformer3DModel(
                    attn_num_head_channels,
                    in_channels // attn_num_head_channels,
                    in_channels=in_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                    use_linear_projection=use_linear_projection,
                    upcast_attention=upcast_attention,
                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,
                    unet_use_temporal_attention=unet_use_temporal_attention,
                )
            )
            if use_motion_module:
                motion_layers.append(
                    get_motion_module(
                        in_channels=in_channels,
                        motion_module_type=motion_module_type,
                        motion_module_kwargs=motion_module_kwargs,
                    )
                )
            else:
                # Placeholder keeps the three lists aligned for zip() in forward().
                motion_layers.append(None)
            resnet_layers.append(build_resnet())

        self.attentions = paddle.nn.LayerList(sublayers=attention_layers)
        self.resnets = paddle.nn.LayerList(sublayers=resnet_layers)
        self.motion_modules = paddle.nn.LayerList(sublayers=motion_layers)

    def forward(
        self,
        hidden_states,
        temb=None,
        encoder_hidden_states=None,
        attention_mask=None,
    ):
        """Run the leading resnet, then each (attention, motion, resnet) group.

        ``attention_mask`` is accepted but not used by this block.
        """
        hidden_states = self.resnets[0](hidden_states, temb)
        groups = zip(self.attentions, self.resnets[1:], self.motion_modules)
        for attn, resnet, motion_module in groups:
            attn_out = attn(hidden_states, encoder_hidden_states=encoder_hidden_states)
            hidden_states = attn_out.sample
            if motion_module is not None:
                hidden_states = motion_module(
                    hidden_states, temb, encoder_hidden_states=encoder_hidden_states
                )
            hidden_states = resnet(hidden_states, temb)

        return hidden_states
287
+
288
+
289
class CrossAttnDownBlock3D(paddle.nn.Layer):
    """Down block of ``num_layers`` (resnet, cross-attention, optional motion module) stages.

    The first resnet maps ``in_channels`` to ``out_channels``; later stages
    keep ``out_channels``. An optional ``Downsample3D`` follows the stages.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        cross_attention_dim=1280,
        output_scale_factor=1.0,
        downsample_padding=1,
        add_downsample=True,
        dual_cross_attention=False,
        use_linear_projection=False,
        only_cross_attention=False,
        upcast_attention=False,
        unet_use_cross_frame_attention=None,
        unet_use_temporal_attention=None,
        use_inflated_groupnorm=None,
        use_motion_module=None,
        motion_module_type=None,
        motion_module_kwargs=None,
    ):
        super().__init__()

        self.has_cross_attention = True
        self.attn_num_head_channels = attn_num_head_channels

        resnet_layers = []
        attention_layers = []
        motion_layers = []

        for layer_idx in range(num_layers):
            # Only the first resnet sees the block's input width.
            stage_in_channels = in_channels if layer_idx == 0 else out_channels
            resnet_layers.append(
                ResnetBlock3D(
                    in_channels=stage_in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                    use_inflated_groupnorm=use_inflated_groupnorm,
                )
            )
            if dual_cross_attention:
                raise NotImplementedError
            attention_layers.append(
                Transformer3DModel(
                    attn_num_head_channels,
                    out_channels // attn_num_head_channels,
                    in_channels=out_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                    use_linear_projection=use_linear_projection,
                    only_cross_attention=only_cross_attention,
                    upcast_attention=upcast_attention,
                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,
                    unet_use_temporal_attention=unet_use_temporal_attention,
                )
            )
            if use_motion_module:
                motion_layers.append(
                    get_motion_module(
                        in_channels=out_channels,
                        motion_module_type=motion_module_type,
                        motion_module_kwargs=motion_module_kwargs,
                    )
                )
            else:
                # Placeholder keeps the three lists aligned for zip() in forward().
                motion_layers.append(None)

        self.attentions = paddle.nn.LayerList(sublayers=attention_layers)
        self.resnets = paddle.nn.LayerList(sublayers=resnet_layers)
        self.motion_modules = paddle.nn.LayerList(sublayers=motion_layers)

        self.downsamplers = None
        if add_downsample:
            downsample = Downsample3D(
                out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
            )
            self.downsamplers = paddle.nn.LayerList(sublayers=[downsample])

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        temb=None,
        encoder_hidden_states=None,
        attention_mask=None,
    ):
        """Return ``(hidden_states, output_states)``.

        ``output_states`` is a tuple with the output of every stage plus, when
        downsamplers exist, the downsampled result. ``attention_mask`` is
        accepted but not used by this block.
        """
        collected = []

        for resnet, attn, motion_module in zip(self.resnets, self.attentions, self.motion_modules):
            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample

            # Optional motion module.
            if motion_module is not None:
                hidden_states = motion_module(
                    hidden_states, temb, encoder_hidden_states=encoder_hidden_states
                )

            collected.append(hidden_states)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)
            collected.append(hidden_states)

        return hidden_states, tuple(collected)
421
+
422
+
423
class DownBlock3D(paddle.nn.Layer):
    """Down block of ``num_layers`` (resnet, optional motion module) stages.

    The first resnet maps ``in_channels`` to ``out_channels``; later stages
    keep ``out_channels``. An optional ``Downsample3D`` follows the stages.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor=1.0,
        add_downsample=True,
        downsample_padding=1,
        use_inflated_groupnorm=None,
        use_motion_module=None,
        motion_module_type=None,
        motion_module_kwargs=None,
    ):
        super().__init__()
        resnet_layers = []
        motion_layers = []

        for layer_idx in range(num_layers):
            # Only the first resnet sees the block's input width.
            stage_in_channels = in_channels if layer_idx == 0 else out_channels
            resnet_layers.append(
                ResnetBlock3D(
                    in_channels=stage_in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                    use_inflated_groupnorm=use_inflated_groupnorm,
                )
            )
            if use_motion_module:
                motion_layers.append(
                    get_motion_module(
                        in_channels=out_channels,
                        motion_module_type=motion_module_type,
                        motion_module_kwargs=motion_module_kwargs,
                    )
                )
            else:
                # Placeholder keeps both lists aligned for zip() in forward().
                motion_layers.append(None)

        self.resnets = paddle.nn.LayerList(sublayers=resnet_layers)
        self.motion_modules = paddle.nn.LayerList(sublayers=motion_layers)

        self.downsamplers = None
        if add_downsample:
            downsample = Downsample3D(
                out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
            )
            self.downsamplers = paddle.nn.LayerList(sublayers=[downsample])

        self.gradient_checkpointing = False

    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
        """Return ``(hidden_states, output_states)``.

        ``output_states`` is a tuple with the output of every stage plus, when
        downsamplers exist, the downsampled result.
        """
        collected = []

        for resnet, motion_module in zip(self.resnets, self.motion_modules):
            hidden_states = resnet(hidden_states, temb)

            # Optional motion module.
            if motion_module is not None:
                hidden_states = motion_module(
                    hidden_states, temb, encoder_hidden_states=encoder_hidden_states
                )

            collected.append(hidden_states)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)
            collected.append(hidden_states)

        return hidden_states, tuple(collected)
514
+
515
+
516
class CrossAttnUpBlock3D(paddle.nn.Layer):
    """Up block of ``num_layers`` (resnet, cross-attention, optional motion module) stages.

    Each resnet consumes the running hidden states concatenated with one skip
    tensor, so its input width is the sum of both channel counts. An optional
    ``Upsample3D`` follows the stages.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        prev_output_channel: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        cross_attention_dim=1280,
        output_scale_factor=1.0,
        add_upsample=True,
        dual_cross_attention=False,
        use_linear_projection=False,
        only_cross_attention=False,
        upcast_attention=False,
        unet_use_cross_frame_attention=None,
        unet_use_temporal_attention=None,
        use_motion_module=None,
        use_inflated_groupnorm=None,
        motion_module_type=None,
        motion_module_kwargs=None,
    ):
        super().__init__()

        self.has_cross_attention = True
        self.attn_num_head_channels = attn_num_head_channels

        resnet_layers = []
        attention_layers = []
        motion_layers = []
        last_idx = num_layers - 1

        for layer_idx in range(num_layers):
            # The last stage receives a skip of width in_channels; the first
            # stage receives the previous block's output width.
            skip_channels = in_channels if layer_idx == last_idx else out_channels
            stage_in_channels = prev_output_channel if layer_idx == 0 else out_channels

            resnet_layers.append(
                ResnetBlock3D(
                    in_channels=stage_in_channels + skip_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                    use_inflated_groupnorm=use_inflated_groupnorm,
                )
            )
            if dual_cross_attention:
                raise NotImplementedError
            attention_layers.append(
                Transformer3DModel(
                    attn_num_head_channels,
                    out_channels // attn_num_head_channels,
                    in_channels=out_channels,
                    num_layers=1,
                    cross_attention_dim=cross_attention_dim,
                    norm_num_groups=resnet_groups,
                    use_linear_projection=use_linear_projection,
                    only_cross_attention=only_cross_attention,
                    upcast_attention=upcast_attention,
                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,
                    unet_use_temporal_attention=unet_use_temporal_attention,
                )
            )
            if use_motion_module:
                motion_layers.append(
                    get_motion_module(
                        in_channels=out_channels,
                        motion_module_type=motion_module_type,
                        motion_module_kwargs=motion_module_kwargs,
                    )
                )
            else:
                # Placeholder keeps the three lists aligned for zip() in forward().
                motion_layers.append(None)

        self.attentions = paddle.nn.LayerList(sublayers=attention_layers)
        self.resnets = paddle.nn.LayerList(sublayers=resnet_layers)
        self.motion_modules = paddle.nn.LayerList(sublayers=motion_layers)

        self.upsamplers = None
        if add_upsample:
            upsample = Upsample3D(out_channels, use_conv=True, out_channels=out_channels)
            self.upsamplers = paddle.nn.LayerList(sublayers=[upsample])

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        res_hidden_states_tuple,
        temb=None,
        encoder_hidden_states=None,
        upsample_size=None,
        attention_mask=None,
    ):
        """Consume skip tensors from the end of ``res_hidden_states_tuple``, one per stage.

        ``attention_mask`` is accepted but not used by this block.
        """
        remaining_skips = res_hidden_states_tuple
        for resnet, attn, motion_module in zip(self.resnets, self.attentions, self.motion_modules):
            # Pop the most recent skip tensor and join it on the channel axis.
            skip = remaining_skips[-1]
            remaining_skips = remaining_skips[:-1]
            hidden_states = paddle.concat(x=[hidden_states, skip], axis=1)

            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample

            # Optional motion module.
            if motion_module is not None:
                hidden_states = motion_module(
                    hidden_states, temb, encoder_hidden_states=encoder_hidden_states
                )

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states, upsample_size)

        return hidden_states
645
+
646
+
647
class UpBlock3D(paddle.nn.Layer):
    """Up block of ``num_layers`` (resnet, optional motion module) stages.

    Each resnet consumes the running hidden states concatenated with one skip
    tensor, so its input width is the sum of both channel counts. An optional
    ``Upsample3D`` follows the stages.
    """

    def __init__(
        self,
        in_channels: int,
        prev_output_channel: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor=1.0,
        add_upsample=True,
        use_inflated_groupnorm=None,
        use_motion_module=None,
        motion_module_type=None,
        motion_module_kwargs=None,
    ):
        super().__init__()
        resnet_layers = []
        motion_layers = []
        last_idx = num_layers - 1

        for layer_idx in range(num_layers):
            # The last stage receives a skip of width in_channels; the first
            # stage receives the previous block's output width.
            skip_channels = in_channels if layer_idx == last_idx else out_channels
            stage_in_channels = prev_output_channel if layer_idx == 0 else out_channels

            resnet_layers.append(
                ResnetBlock3D(
                    in_channels=stage_in_channels + skip_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                    use_inflated_groupnorm=use_inflated_groupnorm,
                )
            )
            if use_motion_module:
                motion_layers.append(
                    get_motion_module(
                        in_channels=out_channels,
                        motion_module_type=motion_module_type,
                        motion_module_kwargs=motion_module_kwargs,
                    )
                )
            else:
                # Placeholder keeps both lists aligned for zip() in forward().
                motion_layers.append(None)

        self.resnets = paddle.nn.LayerList(sublayers=resnet_layers)
        self.motion_modules = paddle.nn.LayerList(sublayers=motion_layers)

        self.upsamplers = None
        if add_upsample:
            upsample = Upsample3D(out_channels, use_conv=True, out_channels=out_channels)
            self.upsamplers = paddle.nn.LayerList(sublayers=[upsample])

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        res_hidden_states_tuple,
        temb=None,
        upsample_size=None,
        encoder_hidden_states=None,
    ):
        """Consume skip tensors from the end of ``res_hidden_states_tuple``, one per stage."""
        remaining_skips = res_hidden_states_tuple
        for resnet, motion_module in zip(self.resnets, self.motion_modules):
            # Pop the most recent skip tensor and join it on the channel axis.
            skip = remaining_skips[-1]
            remaining_skips = remaining_skips[:-1]
            hidden_states = paddle.concat(x=[hidden_states, skip], axis=1)

            hidden_states = resnet(hidden_states, temb)
            if motion_module is not None:
                hidden_states = motion_module(
                    hidden_states, temb, encoder_hidden_states=encoder_hidden_states
                )

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states, upsample_size)

        return hidden_states
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/hotshot_xl/transformer_temporal.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from dataclasses import dataclass
17
+ from typing import Optional
18
+
19
+ import paddle
20
+ from einops import rearrange, repeat
21
+
22
+ import ppdiffusers
23
+
24
+
25
class PositionalEncoding(paddle.nn.Layer):
    """
    Sinusoidal positional encoding as described in "Attention Is All You Need".

    A table of shape ``[1, max_length, dim]`` is precomputed and registered as
    the ``positional_encoding`` buffer; ``forward`` adds its first ``length``
    positions to the input and applies dropout.
    """

    _SCALE_FACTOR = 10000.0

    def __init__(self, dim: int, dropout: float = 0.0, max_length: int = 24):
        super().__init__()
        self.dropout = paddle.nn.Dropout(p=dropout)

        table = paddle.zeros(shape=[1, max_length, dim])
        positions = paddle.arange(end=max_length).unsqueeze(axis=1)
        frequencies = paddle.exp(x=paddle.arange(start=0, end=dim, step=2) * (-math.log(self._SCALE_FACTOR) / dim))
        angles = positions.astype(frequencies.dtype) * frequencies

        # Even feature indices get the sine, odd indices the cosine.
        table[0, :, 0::2] = paddle.sin(x=angles)
        table[0, :, 1::2] = paddle.cos(x=angles)
        self.register_buffer(name="positional_encoding", tensor=table)

    def forward(self, hidden_states: paddle.Tensor, length: int) -> paddle.Tensor:
        """Add the first ``length`` encodings to ``hidden_states`` and apply dropout."""
        encoded = hidden_states + self.positional_encoding[:, :length]
        return self.dropout(encoded)
46
+
47
+
48
class TemporalAttention(ppdiffusers.models.attention.Attention):
    """Attention over the frame axis, with positional encoding added to the frames first.

    ``query_dim`` must be passed as a keyword argument; it sizes the
    positional encoder.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_encoder = PositionalEncoding(kwargs["query_dim"], dropout=0)

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, number_of_frames=8):
        """Attend across frames and return a tensor laid out like the input.

        The input is regrouped from ``(b f) s c`` to ``(b s) f c`` so that the
        attention sequence axis is the frame axis, then restored afterwards.
        """
        sequence_length = tuple(hidden_states.shape)[1]
        hidden_states = rearrange(hidden_states, "(b f) s c -> (b s) f c", f=number_of_frames)
        hidden_states = self.pos_encoder(hidden_states, length=number_of_frames)
        # Compare against None explicitly: taking the truth value of a tensor
        # with more than one element is an error.
        if encoder_hidden_states is not None:
            encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b s) n c", s=sequence_length)
        hidden_states = super().forward(hidden_states, encoder_hidden_states, attention_mask=attention_mask)
        return rearrange(hidden_states, "(b s) f c -> (b f) s c", s=sequence_length)
61
+
62
+
63
@dataclass
class TransformerTemporalOutput(ppdiffusers.utils.BaseOutput):
    """Output container holding a single ``sample`` tensor."""

    # Annotated as a tensor; the previous annotation was a dtype object, not a type.
    sample: paddle.Tensor
66
+
67
+
68
class TransformerTemporal(paddle.nn.Layer):
    """Temporal transformer over a ``b c f h w`` tensor with a residual connection.

    Frames are folded into the batch axis, group-normalised, projected to the
    attention width, run through ``num_layers`` ``TransformerBlock`` layers,
    projected back and added to the folded input.
    """

    def __init__(
        self,
        num_attention_heads: int,
        attention_head_dim: int,
        in_channels: int,
        num_layers: int = 1,
        dropout: float = 0.0,
        norm_num_groups: int = 32,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        activation_fn: str = "geglu",
        upcast_attention: bool = False,
    ):
        super().__init__()
        inner_dim = num_attention_heads * attention_head_dim
        self.norm = paddle.nn.GroupNorm(
            num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06, weight_attr=True, bias_attr=True
        )
        self.proj_in = paddle.nn.Linear(in_features=in_channels, out_features=inner_dim)

        blocks = []
        for _ in range(num_layers):
            blocks.append(
                TransformerBlock(
                    dim=inner_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    dropout=dropout,
                    activation_fn=activation_fn,
                    attention_bias=attention_bias,
                    upcast_attention=upcast_attention,
                    cross_attention_dim=cross_attention_dim,
                )
            )
        self.transformer_blocks = paddle.nn.LayerList(sublayers=blocks)
        self.proj_out = paddle.nn.Linear(in_features=inner_dim, out_features=in_channels)

    def forward(self, hidden_states, encoder_hidden_states=None):
        """Return a tensor with the same ``b c f h w`` layout as the input."""
        _, _, frames, height, width = tuple(hidden_states.shape)

        # Fold frames into the batch axis; keep this copy for the residual add.
        folded = rearrange(hidden_states, "b c f h w -> (b f) c h w")
        residual = folded

        tokens = self.norm(folded)
        tokens = rearrange(tokens, "bf c h w -> bf (h w) c")
        tokens = self.proj_in(tokens)
        for block in self.transformer_blocks:
            tokens = block(tokens, encoder_hidden_states=encoder_hidden_states, number_of_frames=frames)
        tokens = self.proj_out(tokens)

        restored = rearrange(tokens, "bf (h w) c -> bf c h w", h=height, w=width).contiguous()
        summed = restored + residual
        return rearrange(summed, "(b f) c h w -> b c f h w", f=frames)
119
+
120
+
121
class TransformerBlock(paddle.nn.Layer):
    """``depth`` pre-norm ``TemporalAttention`` layers followed by a pre-norm feed-forward.

    Every sub-layer is wrapped in a residual connection.
    """

    def __init__(
        self,
        dim,
        num_attention_heads,
        attention_head_dim,
        dropout=0.0,
        activation_fn="geglu",
        attention_bias=False,
        upcast_attention=False,
        depth=2,
        cross_attention_dim: Optional[int] = None,
    ):
        super().__init__()
        self.is_cross = cross_attention_dim is not None

        attn_layers = []
        norm_layers = []
        for _ in range(depth):
            attn_layers.append(
                TemporalAttention(
                    query_dim=dim,
                    cross_attention_dim=cross_attention_dim,
                    heads=num_attention_heads,
                    dim_head=attention_head_dim,
                    dropout=dropout,
                    bias=attention_bias,
                    upcast_attention=upcast_attention,
                )
            )
            norm_layers.append(paddle.nn.LayerNorm(normalized_shape=dim))

        self.attention_blocks = paddle.nn.LayerList(sublayers=attn_layers)
        self.norms = paddle.nn.LayerList(sublayers=norm_layers)
        self.ff = ppdiffusers.models.attention.FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
        self.ff_norm = paddle.nn.LayerNorm(normalized_shape=dim)

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, number_of_frames=None):
        """Apply the attention layers, then the feed-forward, each with a residual add."""
        # Without a cross-attention dimension, any supplied context is ignored.
        context = encoder_hidden_states if self.is_cross else None

        for attention, norm in zip(self.attention_blocks, self.norms):
            normed = norm(hidden_states)
            attended = attention(
                normed,
                encoder_hidden_states=context,
                attention_mask=attention_mask,
                number_of_frames=number_of_frames,
            )
            hidden_states = attended + hidden_states

        normed = self.ff_norm(hidden_states)
        return self.ff(normed) + hidden_states
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/noise_conditions.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import paddle
17
+ import paddle_aux # noqa: F401
18
+
19
+
20
class BaseNoiseCond:
    """Base class for noise conditionings computed from a logSNR value.

    Subclasses implement ``cond`` and may implement ``setup``, which receives
    every constructor argument other than ``shift`` and ``clamp_range``.
    """

    def __init__(self, *args, shift=1, clamp_range=None, **kwargs):
        if clamp_range is None:
            clamp_range = [-1000000000.0, 1000000000.0]
        self.shift = shift
        self.clamp_range = clamp_range
        self.setup(*args, **kwargs)

    def setup(self, *args, **kwargs):
        """Hook for subclass-specific initialisation; does nothing by default."""
        pass

    def cond(self, logSNR):
        """Map ``logSNR`` to the conditioning value; must be overridden."""
        raise NotImplementedError("this method needs to be overriden")

    def __call__(self, logSNR):
        """Apply the optional shift, compute ``cond`` and clip it to ``clamp_range``."""
        if self.shift != 1:
            logSNR = logSNR.clone() + 2 * np.log(self.shift)
        return paddle.clip(
            self.cond(logSNR),
            min=self.clamp_range[0],
            max=self.clamp_range[1],
        )
37
+
38
+
39
class CosineTNoiseCond(BaseNoiseCond):
    """Conditioning that maps logSNR to a cosine-schedule style value ``t``."""

    def setup(self, s=0.008, clamp_range=None):
        """Store the offset ``s``, the clamp range and the derived ``min_var``.

        ``clamp_range`` defaults to a fresh ``[0, 1]`` list per instance, so
        instances never share one mutable default object.
        """
        self.s = paddle.to_tensor(data=[s])
        self.clamp_range = [0, 1] if clamp_range is None else clamp_range
        self.min_var = paddle.square(paddle.cos(x=self.s / (1 + self.s) * np.pi * 0.5))

    def cond(self, logSNR):
        """Return ``t`` computed from ``sigmoid(logSNR)`` clipped to ``clamp_range``."""
        var = paddle.nn.functional.sigmoid(logSNR)
        var = paddle.clip(var, min=self.clamp_range[0], max=self.clamp_range[1])
        s, min_var = self.s, self.min_var
        t = ((var * min_var) ** 0.5).acos() / (np.pi * 0.5) * (1 + s) - s
        return t
51
+
52
+
53
class EDMNoiseCond(BaseNoiseCond):
    """Conditioning equal to the negated logSNR divided by 8."""

    def cond(self, logSNR):
        negated = -logSNR
        return negated / 8
56
+
57
+
58
class SigmoidNoiseCond(BaseNoiseCond):
    """Conditioning equal to the sigmoid of the negated logSNR."""

    def cond(self, logSNR):
        negated = -logSNR
        return negated.sigmoid()
61
+
62
+
63
class LogSNRNoiseCond(BaseNoiseCond):
    """Identity conditioning: the logSNR itself is the conditioning value."""

    def cond(self, logSNR):
        # No transformation is applied here.
        return logSNR
66
+
67
+
68
class EDMSigmaNoiseCond(BaseNoiseCond):
    """Conditioning equal to ``exp(-logSNR / 2)`` scaled by ``sigma_data``."""

    def setup(self, sigma_data=1):
        """Store the multiplicative scale applied in ``cond``."""
        self.sigma_data = sigma_data

    def cond(self, logSNR):
        unscaled = paddle.exp(x=-logSNR / 2)
        return unscaled * self.sigma_data
74
+
75
+
76
class RectifiedFlowsNoiseCond(BaseNoiseCond):
    """Conditioning computed in closed form from ``exp(logSNR) - 1``."""

    def cond(self, logSNR):
        ratio = logSNR.exp() - 1
        # Entries that are exactly zero are replaced so the division below is defined.
        ratio[ratio == 0] = 0.001
        result = 1 + (2 - (2**2 + 4 * ratio) ** 0.5) / (2 * ratio)
        return result
82
+
83
+
84
class PiecewiseLinearNoiseCond(BaseNoiseCond):
    """Conditioning obtained by piecewise-linear interpolation over tabulated ``(x, y)`` points.

    ``setup`` leaves ``x`` and ``y`` as ``None``; ``cond`` calls ``.to`` on both,
    so they have to be set (e.g. by a subclass) before use.
    """

    def setup(self):
        self.x = None
        self.y = None

    def piecewise_linear(self, y, xs, ys):
        """Interpolate ``x`` for the given ``y`` between neighbouring table entries.

        The segment index is found by searching the reversed ``ys`` table.
        """
        reversed_ys = ys.flip(axis=(-1,))[:-2]
        indices = len(xs) - 2 - paddle.searchsorted(sorted_sequence=reversed_ys, values=y)
        x_lo, x_hi = xs[indices], xs[indices + 1]
        y_lo, y_hi = ys[indices], ys[indices + 1]
        return x_lo + (x_hi - x_lo) * (y - y_lo) / (y_hi - y_lo)

    def cond(self, logSNR):
        """Interpolate over ``sigmoid(logSNR)`` with the tables moved to the input's place."""
        var = logSNR.sigmoid()
        xs = self.x.to(var.place)
        ys = self.y.to(var.place)
        return self.piecewise_linear(var, xs, ys)
100
+
101
+
102
class StableDiffusionNoiseCond(PiecewiseLinearNoiseCond):
    """Piecewise-linear conditioning whose tables come from a linear range in square-root space."""

    def setup(self, linear_range=[0.00085, 0.012], total_steps=1000):
        """Build ``x`` as ``total_steps + 1`` points on [0, 1] and ``y`` as the cumulative product of alphas."""
        self.total_steps = total_steps
        sqrt_bounds = [(bound**0.5) for bound in linear_range]
        sqrt_start = sqrt_bounds[0]
        sqrt_end = sqrt_bounds[1]
        self.x = paddle.linspace(start=0, stop=1, num=total_steps + 1)
        alphas = 1 - (sqrt_start * (1 - self.x) + sqrt_end * self.x) ** 2
        self.y = alphas.cumprod(dim=-1)

    def cond(self, logSNR):
        """Return the interpolated value clipped to [0, 1]."""
        interpolated = super().cond(logSNR)
        return interpolated.clip(min=0, max=1)
112
+
113
+
114
class DiscreteNoiseCond(BaseNoiseCond):
    """Wrap another noise conditioning and map its output to int64 step indices."""

    def setup(self, noise_cond, steps=1000, continuous_range=None):
        """Store the wrapped conditioning, the step count and its continuous output range.

        ``continuous_range`` defaults to a fresh ``[0, 1]`` list per instance,
        so instances never share one mutable default object.
        """
        self.noise_cond = noise_cond
        self.steps = steps
        self.continuous_range = [0, 1] if continuous_range is None else continuous_range

    def cond(self, logSNR):
        """Rescale the wrapped output by ``continuous_range``, multiply by ``steps`` and cast to int64."""
        cond = self.noise_cond(logSNR)
        cond = (cond - self.continuous_range[0]) / (self.continuous_range[1] - self.continuous_range[0])
        return cond.mul(self.steps).astype(dtype="int64")
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/samplers.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import paddle
16
+
17
+ import ppdiffusers # noqa: F401
18
+
19
+
20
class SimpleSampler:
    """Base class for samplers; subclasses implement :meth:`step`.

    Calling the sampler increments ``current_step`` (which starts at ``-1``)
    and then forwards all arguments to :meth:`step`.
    """

    def __init__(self, gdf):
        # `gdf` is stored as-is; subclasses in this file call `gdf.input_scaler`.
        self.gdf = gdf
        self.current_step = -1

    def __call__(self, *args, **kwargs):
        """Advance the step counter and run one sampling step."""
        self.current_step += 1
        return self.step(*args, **kwargs)

    def init_x(self, shape):
        """Return standard-normal noise of ``shape`` drawn from a generator seeded with 1."""
        generator = paddle.Generator().manual_seed(1)
        return paddle.randn(shape=shape, generator=generator)

    def step(self, x, x0, epsilon, logSNR, logSNR_prev):
        """Compute the next sample; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # The hook subclasses override is `step`; the message previously said 'apply'.
        raise NotImplementedError("You should override the 'step' function.")
35
+
36
+
37
def expand_to_match(tensor, target_shape):
    """Append two trailing singleton axes to ``tensor`` and expand it to
    ``[target_shape[0], target_shape[1], 1, 1]``.

    Assumes ``tensor`` initially has shape ``[batch_size, 1]`` and that
    ``target_shape`` looks like ``[batch_size, channels, height, width]``.
    """
    # Tensor.expand takes the target shape as a single list, not as separate ints.
    expanded_shape = [target_shape[0], target_shape[1], 1, 1]
    return tensor.unsqueeze(-1).unsqueeze(-1).expand(expanded_shape)
41
+
42
+
43
class DDIMSampler(SimpleSampler):
    """Sampler whose stochasticity is controlled by ``eta`` (``eta=0`` adds no fresh noise)."""

    def step(self, x, x0, epsilon, logSNR, logSNR_prev, eta=0):
        """Return ``a_prev * x0 + sqrt(b_prev**2 - sigma**2) * epsilon + sigma * noise``.

        ``(a, b)`` and ``(a_prev, b_prev)`` come from ``self.gdf.input_scaler`` at
        ``logSNR`` and ``logSNR_prev``. One-dimensional scalers are reshaped to
        ``[-1, 1, ..., 1]`` so they broadcast against ``x0``. ``x`` is unused.
        """
        broadcast_shape = [-1] + [1] * (len(x0.shape) - 1)

        a, b = self.gdf.input_scaler(logSNR)
        if len(a.shape) == 1:
            a = a.reshape(broadcast_shape)
            b = b.reshape(broadcast_shape)

        a_prev, b_prev = self.gdf.input_scaler(logSNR_prev)
        if len(a_prev.shape) == 1:
            a_prev = a_prev.reshape(broadcast_shape)
            b_prev = b_prev.reshape(broadcast_shape)

        if eta > 0:
            sigma_tau = eta * paddle.sqrt(b_prev**2 / b**2) * paddle.sqrt(1 - a**2 / a_prev**2)
        else:
            sigma_tau = paddle.zeros_like(x0)

        fresh_noise = paddle.randn(x0.shape, dtype=x0.dtype)
        deterministic = a_prev * x0 + paddle.sqrt(b_prev**2 - sigma_tau**2) * epsilon
        return deterministic + sigma_tau * fresh_noise
67
+
68
+
69
class DDPMSampler(DDIMSampler):
    """``DDIMSampler`` with ``eta`` defaulting to 1."""

    def step(self, x, x0, epsilon, logSNR, logSNR_prev, eta=1):
        """Delegate to ``DDIMSampler.step`` with this class's ``eta`` default."""
        return super().step(x, x0, epsilon, logSNR, logSNR_prev, eta=eta)
72
+
73
+
74
class LCMSampler(SimpleSampler):
    """Sampler that re-noises the predicted ``x0`` to the previous noise level."""

    def step(self, x, x0, epsilon, logSNR, logSNR_prev):
        """Return ``x0 * a_prev + noise * b_prev``.

        ``(a_prev, b_prev)`` come from ``self.gdf.input_scaler(logSNR_prev)``;
        ``noise`` is fresh standard-normal noise with the shape and dtype of
        ``epsilon``. ``x`` and ``logSNR`` are unused.
        """
        a_prev, b_prev = self.gdf.input_scaler(logSNR_prev)
        if len(a_prev.shape) == 1:
            # Reshape [B] -> [B, 1, ..., 1] (same rule as DDIMSampler) so the
            # per-sample scalers broadcast over every non-batch axis of x0.
            # `unsqueeze(-1).expand_as(x0)` only yields [B, 1], which does not
            # line up with the batch axis of a higher-rank x0.
            broadcast_shape = [-1] + [1] * (len(x0.shape) - 1)
            a_prev = a_prev.reshape(broadcast_shape)
            b_prev = b_prev.reshape(broadcast_shape)
        noise = paddle.randn(epsilon.shape, dtype=epsilon.dtype)
        return x0 * a_prev + noise * b_prev
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/models/stable_cascade/gdf/schedulers.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import paddle
17
+ import paddle_aux # noqa
18
+
19
+
20
class BaseSchedule:
    """Base class for logSNR schedules.

    Subclasses implement ``setup`` (configuration) and ``schedule`` (the actual
    mapping). Calling an instance with a tensor ``t`` evaluates the schedule at
    those timesteps; calling it with anything else treats the argument as a
    batch size and lets the subclass sample its own timesteps.
    """

    def __init__(self, *args, force_limits=True, discrete_steps=None, shift=1, **kwargs):
        """Forward ``*args``/``**kwargs`` to ``setup`` and optionally compute clipping limits.

        Args:
            force_limits: when true, ``reset_limits()`` is called at the end.
            discrete_steps: if set, tensor timesteps are snapped to a grid of
                this many points in ``__call__``.
            shift: multiplied with the per-call ``shift`` in ``__call__``.
        """
        self.setup(*args, **kwargs)
        self.limits = None
        self.discrete_steps = discrete_steps
        self.shift = shift
        if force_limits:
            self.reset_limits()

    def reset_limits(self, shift=1, disable=False):
        """Recompute ``self.limits`` as the schedule values at ``t = 1`` and ``t = 0``.

        Returns the new limits, or ``None`` when ``disable`` is true or the
        schedule raises for explicit timesteps (a warning is printed then).
        """
        try:
            self.limits = None if disable else self(paddle.to_tensor(data=[1.0, 0.0]), shift=shift).tolist()
            return self.limits
        except Exception:
            # Deliberate best effort: schedules that reject explicit `t` stay unbounded.
            print("WARNING: this schedule doesn't support t and will be unbounded")
            return None

    def setup(self, *args, **kwargs):
        """Configure the schedule; must be overridden."""
        raise NotImplementedError("this method needs to be overriden")

    def schedule(self, *args, **kwargs):
        """Map ``(t, batch_size)`` to logSNR; must be overridden."""
        raise NotImplementedError("this method needs to be overriden")

    def __call__(self, t, *args, shift=1, **kwargs):
        """Evaluate the schedule and apply shift and limit clipping.

        Args:
            t: a ``paddle.Tensor`` of timesteps, or a batch size.
            shift: combined multiplicatively with ``self.shift``; when the
                product differs from 1, ``2 * log(1 / product)`` is added.
        """
        if isinstance(t, paddle.Tensor):
            batch_size = None
            if self.discrete_steps is not None:
                # Compare against the dtype object; a string literal is not a
                # reliable match for a Paddle dtype.
                if t.dtype != paddle.int64:
                    t = (t * (self.discrete_steps - 1)).round().astype(dtype="int64")
                t = t / (self.discrete_steps - 1)
            t = t.clip(min=0, max=1)
        else:
            batch_size = t
            t = None
        logSNR = self.schedule(t, batch_size, *args, **kwargs)
        if shift * self.shift != 1:
            logSNR += 2 * np.log(1 / (shift * self.shift))
        if self.limits is not None:
            logSNR = paddle.clip(logSNR, *self.limits)

        return logSNR
61
+
62
+
63
class CosineSchedule(BaseSchedule):
    """Schedule whose variance follows a squared cosine of the timestep."""

    def setup(self, s=0.008, clamp_range=[0.0001, 0.9999], norm_instead=False):
        """Store the offset ``s``, the variance range and the normalisation mode.

        ``min_var`` is the squared cosine at ``t = 0`` and is used to normalise
        the curve in ``schedule``.
        """
        self.s = paddle.to_tensor(data=[s])
        self.clamp_range = clamp_range
        self.norm_instead = norm_instead
        self.min_var = paddle.cos(x=self.s / (1 + self.s) * np.pi * 0.5) ** 2

    def schedule(self, t, batch_size):
        """Return ``log(var / (1 - var))`` for the cosine variance at ``t``.

        When ``t`` is ``None``, ``batch_size`` timesteps are sampled and kept
        within ``[0.001, 1.0]``.
        """
        if t is None:
            t = (1 - paddle.rand(shape=[batch_size])).add(0.001).clip(min=0.001, max=1.0)
        offset = self.s
        angle = (offset + t) / (1 + offset) * np.pi * 0.5
        var = (paddle.cos(x=angle).clip(min=0, max=1) ** 2) / self.min_var

        low, high = self.clamp_range[0], self.clamp_range[1]
        if self.norm_instead:
            # Affinely squeeze the variance into the range instead of clipping it.
            var = var * (high - low) + low
        else:
            var = paddle.clip(var, min=low, max=high)

        return (var / (1 - var)).log()
83
+
84
+
85
class CosineSchedule2(BaseSchedule):
    """Schedule defined as ``-2 * log(tan(angle))`` with the angle linear in ``t``."""

    def setup(self, logsnr_range=[-15, 15]):
        """Precompute the angles ``arctan(exp(-logsnr / 2))`` for both ends of ``logsnr_range``."""
        self.t_min = np.arctan(np.exp(-0.5 * logsnr_range[1]))
        self.t_max = np.arctan(np.exp(-0.5 * logsnr_range[0]))

    def schedule(self, t, batch_size):
        """Return logSNR at ``t``; samples ``batch_size`` timesteps when ``t`` is ``None``."""
        if t is None:
            # paddle.rand expects a list/tuple shape, as CosineSchedule already passes.
            t = 1 - paddle.rand(shape=[batch_size])
        return -2 * paddle.tan(self.t_min + t * (self.t_max - self.t_min)).log()
94
+
95
+
96
class SqrtSchedule(BaseSchedule):
    """Schedule whose variance is ``1 - sqrt(t + s)``."""

    def setup(self, s=0.0001, clamp_range=[0.0001, 0.9999], norm_instead=False):
        """Store the offset ``s``, the variance range and the normalisation mode."""
        self.s = s
        self.clamp_range = clamp_range
        self.norm_instead = norm_instead

    def schedule(self, t, batch_size):
        """Return ``log(var / (1 - var))``; samples ``batch_size`` timesteps when ``t`` is ``None``."""
        if t is None:
            # paddle.rand expects a list/tuple shape, as CosineSchedule already passes.
            t = 1 - paddle.rand(shape=[batch_size])
        var = 1 - (t + self.s) ** 0.5
        if self.norm_instead:
            # Affinely squeeze the variance into the range instead of clipping it.
            var = var * (self.clamp_range[1] - self.clamp_range[0]) + self.clamp_range[0]
        else:
            var = paddle.clip(var, min=self.clamp_range[0], max=self.clamp_range[1])
        logSNR = (var / (1 - var)).log()

        return logSNR
113
+
114
+
115
class RectifiedFlowsSchedule(BaseSchedule):
    """Schedule with ``logSNR = log((1 - t)**2 / t**2)`` clipped to ``logsnr_range``."""

    def setup(self, logsnr_range=[-15, 15]):
        """Store the ``[min, max]`` range used to clip the result."""
        self.logsnr_range = logsnr_range

    def schedule(self, t, batch_size):
        """Return clipped logSNR at ``t``; samples ``batch_size`` timesteps when ``t`` is ``None``."""
        if t is None:
            # paddle.rand expects a list/tuple shape, as CosineSchedule already passes.
            t = 1 - paddle.rand(shape=[batch_size])
        logSNR = ((1 - t) ** 2 / t**2).log()
        # Clipping also bounds the infinities produced at t = 0 and t = 1.
        logSNR = paddle.clip(logSNR, min=self.logsnr_range[0], max=self.logsnr_range[1])
        return logSNR
125
+
126
+
127
class EDMSampleSchedule(BaseSchedule):
    """Schedule that interpolates sigma between the range ends in ``1/p``-power space."""

    def setup(self, sigma_range=[0.002, 80], p=7):
        """Store the ``[sigma_min, sigma_max]`` range and the exponent ``p``."""
        self.sigma_range = sigma_range
        self.p = p

    def schedule(self, t, batch_size):
        """Return ``log(1 / sigma**2)``; samples ``batch_size`` timesteps when ``t`` is ``None``."""
        if t is None:
            # paddle.rand expects a list/tuple shape, as CosineSchedule already passes.
            t = 1 - paddle.rand(shape=[batch_size])
        smin, smax, p = *self.sigma_range, self.p
        # sigma equals smin at t = 0 and smax at t = 1.
        sigma = (smax ** (1 / p) + (1 - t) * (smin ** (1 / p) - smax ** (1 / p))) ** p
        logSNR = (1 / sigma**2).log()
        return logSNR
139
+
140
+
141
class EDMTrainSchedule(BaseSchedule):
    """Training-only schedule that draws logSNR from a scaled, shifted normal."""

    def setup(self, mu=-1.2, std=1.2):
        """Store the ``mu`` and ``std`` used in ``schedule``."""
        self.mu = mu
        self.std = std

    def schedule(self, t, batch_size):
        """Return ``-2 * (randn * std - mu)`` for ``batch_size`` samples.

        Raises:
            Exception: if explicit timesteps ``t`` are passed.
        """
        if t is not None:
            raise Exception("EDMTrainSchedule doesn't support passing timesteps: t")
        # paddle.randn expects a list/tuple shape rather than a bare int.
        logSNR = -2 * (paddle.randn(shape=[batch_size]) * self.std - self.mu)
        return logSNR
151
+
152
+
153
class LinearSchedule(BaseSchedule):
    """Schedule whose logSNR is linear in ``t``."""

    def setup(self, logsnr_range=[-10, 10]):
        """Store the ``[min, max]`` logSNR range."""
        self.logsnr_range = logsnr_range

    def schedule(self, t, batch_size):
        """Return logSNR going from ``logsnr_range[1]`` at ``t = 0`` to
        ``logsnr_range[0]`` at ``t = 1``; samples timesteps when ``t`` is ``None``."""
        if t is None:
            # paddle.rand expects a list/tuple shape, as CosineSchedule already passes.
            t = 1 - paddle.rand(shape=[batch_size])
        logSNR = t * (self.logsnr_range[0] - self.logsnr_range[1]) + self.logsnr_range[1]
        return logSNR
162
+
163
+
164
class PiecewiseLinearSchedule(BaseSchedule):
    """Schedule whose variance is interpolated from a tabulated curve ``(self.x, self.y)``.

    ``setup`` only initialises both tables to ``None``; a subclass is expected
    to fill them in before ``schedule`` is called.
    """

    def setup(self):
        """Initialise the knot tables as empty."""
        self.x = None
        self.y = None

    def piecewise_linear(self, x, xs, ys):
        """Linearly interpolate ``ys`` over the knots ``xs`` at the query points ``x``."""
        indices = paddle.searchsorted(sorted_sequence=xs[:-1], values=x) - 1
        x_min, x_max = xs[indices], xs[indices + 1]
        y_min, y_max = ys[indices], ys[indices + 1]
        var = y_min + (y_max - y_min) * (x - x_min) / (x_max - x_min)
        return var

    def schedule(self, t, batch_size):
        """Return ``log(var / (1 - var))``; samples ``batch_size`` timesteps when ``t`` is ``None``."""
        if t is None:
            # paddle.rand expects a list/tuple shape, as CosineSchedule already passes.
            t = 1 - paddle.rand(shape=[batch_size])
        # Move both tables to the device of the query before indexing.
        var = self.piecewise_linear(t, self.x.to(t.place), self.y.to(t.place))
        logSNR = (var / (1 - var)).log()
        return logSNR
182
+
183
+
184
class StableDiffusionSchedule(PiecewiseLinearSchedule):
    """Piecewise-linear schedule whose tables are built from a beta range."""

    def setup(self, linear_range=[0.00085, 0.012], total_steps=1000):
        """Build the ``x`` grid (``total_steps + 1`` points in ``[0, 1]``) and the
        cumulative-product ``y`` table from the square roots of ``linear_range``."""
        roots = [(r**0.5) for r in linear_range]
        grid = paddle.linspace(start=0, stop=1, num=total_steps + 1)
        self.x = grid
        blended = roots[0] * (1 - grid) + roots[1] * grid
        alphas = 1 - blended**2
        self.y = alphas.cumprod(dim=-1)
190
+
191
+
192
class AdaptiveTrainSchedule(BaseSchedule):
    """Training-only schedule that samples logSNR from weighted buckets.

    The logSNR range is split into equal-width buckets; each bucket has a
    weight in ``bucket_probs`` that ``update_buckets`` moves towards the
    observed loss.
    """

    def setup(self, logsnr_range=[-10, 10], buckets=100, min_probs=0.0):
        """Create ``buckets`` equal-width ``[low, high)`` ranges with weight 1 each."""
        th = paddle.linspace(start=logsnr_range[0], stop=logsnr_range[1], num=buckets + 1)
        # Pair consecutive thresholds into a [buckets, 2] table without going
        # through a Python list of 0-d tensors.
        self.bucket_ranges = paddle.stack([th[:-1], th[1:]], axis=-1)
        # paddle.ones expects a list/tuple shape rather than a bare int.
        self.bucket_probs = paddle.ones(shape=[buckets])
        self.min_probs = min_probs

    def schedule(self, t, batch_size):
        """Sample ``batch_size`` logSNR values: pick a bucket by weight, then draw
        uniformly inside it.

        Raises:
            Exception: if explicit timesteps ``t`` are passed.
        """
        if t is not None:
            raise Exception("AdaptiveTrainSchedule doesn't support passing timesteps: t")
        norm_probs = (self.bucket_probs + self.min_probs) / (self.bucket_probs + self.min_probs).sum()
        buckets = paddle.multinomial(x=norm_probs, num_samples=batch_size, replacement=True)
        ranges = self.bucket_ranges[buckets]
        logSNR = paddle.rand(shape=[batch_size]) * (ranges[:, 1] - ranges[:, 0]) + ranges[:, 0]
        return logSNR

    def update_buckets(self, logSNR, loss, beta=0.99):
        """Blend ``loss`` into the weight of the bucket containing each ``logSNR``.

        The new weight is ``old * beta + loss * (1 - beta)``.
        """
        range_mtx = self.bucket_ranges.unsqueeze(axis=0).expand(shape=[logSNR.shape[0], -1, -1]).to(logSNR.place)
        levels = logSNR[:, None]
        # Combine both comparisons as booleans first, then cast once; the
        # previous form multiplied a bool tensor by a float32 tensor.
        in_bucket = paddle.logical_and(range_mtx[:, :, 0] <= levels, range_mtx[:, :, 1] > levels)
        range_mask = in_bucket.astype(dtype="float32")
        range_idx = range_mask.argmax(axis=-1).cpu()
        self.bucket_probs[range_idx] = self.bucket_probs[range_idx] * beta + loss.detach().cpu() * (1 - beta)
215
+
216
+
217
class InterpolatedSchedule(BaseSchedule):
    """Blend two schedules, weighting the first by ``t`` and the second by ``1 - t``."""

    def setup(self, scheduler1, scheduler2, shifts=[1.0, 1.0]):
        """Store both schedules and the per-schedule ``shift`` passed to each call."""
        self.scheduler1 = scheduler1
        self.scheduler2 = scheduler2
        self.shifts = shifts

    def schedule(self, t, batch_size):
        """Return ``scheduler1(t) * t + scheduler2(t) * (1 - t)``.

        ``t`` is sampled when ``None`` and is always clipped to
        ``[1e-07, 1 - 1e-07]`` before either schedule is evaluated.
        """
        if t is None:
            # paddle.rand expects a list/tuple shape, as CosineSchedule already passes.
            t = 1 - paddle.rand(shape=[batch_size])
        t = t.clip(min=1e-07, max=1 - 1e-07)
        low_logSNR = self.scheduler1(t, shift=self.shifts[0])
        high_logSNR = self.scheduler2(t, shift=self.shifts[1])
        return low_logSNR * t + high_logSNR * (1 - t)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py ADDED
@@ -0,0 +1,925 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+
19
+ import paddle
20
+ from packaging import version
21
+
22
+ from ppdiffusers.transformers import (
23
+ CLIPImageProcessor,
24
+ CLIPVisionModelWithProjection,
25
+ XLMRobertaTokenizer,
26
+ )
27
+
28
+ from ...configuration_utils import FrozenDict
29
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
30
+ from ...loaders import (
31
+ FromSingleFileMixin,
32
+ IPAdapterMixin,
33
+ LoraLoaderMixin,
34
+ TextualInversionLoaderMixin,
35
+ )
36
+ from ...models import AutoencoderKL, UNet2DConditionModel
37
+ from ...models.lora import adjust_lora_scale_text_encoder
38
+ from ...schedulers import KarrasDiffusionSchedulers
39
+ from ...utils import (
40
+ USE_PEFT_BACKEND,
41
+ deprecate,
42
+ logging,
43
+ replace_example_docstring,
44
+ scale_lora_layers,
45
+ unscale_lora_layers,
46
+ )
47
+ from ...utils.paddle_utils import randn_tensor
48
+ from ..pipeline_utils import DiffusionPipeline
49
+ from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
50
+ from .modeling_roberta_series import RobertaSeriesModelWithTransformation
51
+ from .pipeline_output import AltDiffusionPipelineOutput
52
+
53
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
54
+
55
+ EXAMPLE_DOC_STRING = """
56
+ Examples:
57
+ ```py
58
+ >>> import paddle
59
+ >>> from ppdiffusers import AltDiffusionPipeline
60
+
61
+ >>> pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9", paddle_dtype=paddle.float16)
62
+
63
+ >>> # "dark elf princess, highly detailed, d & d, fantasy, highly detailed, digital painting, trending on artstation, concept art, sharp focus, illustration, art by artgerm and greg rutkowski and fuji choko and viktoria gavrilenko and hoang lap"
64
+ >>> prompt = "黑暗精灵公主,非常详细,幻想,非常详细,数字绘画,概念艺术,敏锐的焦点,插图"
65
+ >>> image = pipe(prompt).images[0]
66
+ ```
67
+ """
68
+
69
+
70
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
71
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4

    The guided prediction is scaled so that its standard deviation (taken over every axis except the first) matches
    that of `noise_pred_text`, then blended with the unscaled prediction using `guidance_rescale` as the weight.
    """
    text_axes = list(range(1, noise_pred_text.ndim))
    cfg_axes = list(range(1, noise_cfg.ndim))
    std_text = noise_pred_text.std(axis=text_axes, keepdim=True)
    std_cfg = noise_cfg.std(axis=cfg_axes, keepdim=True)
    # Matching the standard deviations fixes overexposure caused by guidance.
    rescaled = noise_cfg * (std_text / std_cfg)
    # Blending with the original guided result avoids "plain looking" images.
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
83
+
84
+
85
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
86
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    timesteps: Optional[List[int]] = None,
    **kwargs,
):
    """
    Configure the scheduler through `set_timesteps` and return the resulting timestep schedule. Custom timesteps
    are supported; any extra kwargs are forwarded to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used,
            `timesteps` must be `None`.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
            timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
            must be `None`.

    Returns:
        `Tuple[paddle.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.

    Raises:
        `ValueError`: If `timesteps` is passed but the scheduler's `set_timesteps` has no `timesteps` parameter.
    """
    if timesteps is None:
        # Default path: let the scheduler derive the schedule from the step count.
        scheduler.set_timesteps(num_inference_steps, **kwargs)
        return scheduler.timesteps, num_inference_steps

    supported_params = inspect.signature(scheduler.set_timesteps).parameters
    if "timesteps" not in supported_params:
        raise ValueError(
            f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
            f" timestep schedules. Please check whether you are using the correct scheduler."
        )
    scheduler.set_timesteps(timesteps=timesteps, **kwargs)
    resolved = scheduler.timesteps
    # With custom timesteps the step count is whatever the scheduler ended up with.
    return resolved, len(resolved)
125
+
126
+
127
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker
128
+ class AltDiffusionPipeline(
129
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
130
+ ):
131
+ r"""
132
+ Pipeline for text-to-image generation using Alt Diffusion.
133
+
134
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
135
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
136
+
137
+ The pipeline also inherits the following loading methods:
138
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
139
+ - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
140
+ - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
141
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
142
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
143
+
144
+ Args:
145
+ vae ([`AutoencoderKL`]):
146
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
147
+ text_encoder ([`~transformers.RobertaSeriesModelWithTransformation`]):
148
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
149
+ tokenizer ([`~transformers.XLMRobertaTokenizer`]):
150
+ A `XLMRobertaTokenizer` to tokenize text.
151
+ unet ([`UNet2DConditionModel`]):
152
+ A `UNet2DConditionModel` to denoise the encoded image latents.
153
+ scheduler ([`SchedulerMixin`]):
154
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
155
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
156
+ safety_checker ([`StableDiffusionSafetyChecker`]):
157
+ Classification module that estimates whether generated images could be considered offensive or harmful.
158
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
159
+ about a model's potential harms.
160
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
161
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
162
+ """
163
+
164
+ model_cpu_offload_seq = "text_encoder->unet->vae"
165
+ _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
166
+ _exclude_from_cpu_offload = ["safety_checker"]
167
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
168
+
169
+ def __init__(
170
+ self,
171
+ vae: AutoencoderKL,
172
+ text_encoder: RobertaSeriesModelWithTransformation,
173
+ tokenizer: XLMRobertaTokenizer,
174
+ unet: UNet2DConditionModel,
175
+ scheduler: KarrasDiffusionSchedulers,
176
+ safety_checker: StableDiffusionSafetyChecker,
177
+ feature_extractor: CLIPImageProcessor,
178
+ image_encoder: CLIPVisionModelWithProjection = None,
179
+ requires_safety_checker: bool = True,
180
+ ):
181
+ super().__init__()
182
+
183
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
184
+ deprecation_message = (
185
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
186
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
187
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
188
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
189
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
190
+ " file"
191
+ )
192
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
193
+ new_config = dict(scheduler.config)
194
+ new_config["steps_offset"] = 1
195
+ scheduler._internal_dict = FrozenDict(new_config)
196
+
197
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
198
+ deprecation_message = (
199
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
200
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
201
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
202
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
203
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
204
+ )
205
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
206
+ new_config = dict(scheduler.config)
207
+ new_config["clip_sample"] = False
208
+ scheduler._internal_dict = FrozenDict(new_config)
209
+
210
+ if safety_checker is None and requires_safety_checker:
211
+ logger.warning(
212
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
213
+ " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered"
214
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
215
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
216
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
217
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
218
+ )
219
+
220
+ if safety_checker is not None and feature_extractor is None:
221
+ raise ValueError(
222
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
223
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
224
+ )
225
+
226
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse(
227
+ version.parse(unet.config._ppdiffusers_version).base_version
228
+ ) < version.parse("0.9.0.dev0")
229
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
230
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
231
+ deprecation_message = (
232
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
233
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
234
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
235
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
236
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
237
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
238
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
239
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
240
+ " the `unet/config.json` file"
241
+ )
242
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
243
+ new_config = dict(unet.config)
244
+ new_config["sample_size"] = 64
245
+ unet._internal_dict = FrozenDict(new_config)
246
+
247
+ self.register_modules(
248
+ vae=vae,
249
+ text_encoder=text_encoder,
250
+ tokenizer=tokenizer,
251
+ unet=unet,
252
+ scheduler=scheduler,
253
+ safety_checker=safety_checker,
254
+ feature_extractor=feature_extractor,
255
+ image_encoder=image_encoder,
256
+ )
257
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
258
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
259
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
260
+
261
+ def _encode_prompt(
262
+ self,
263
+ prompt,
264
+ num_images_per_prompt,
265
+ do_classifier_free_guidance,
266
+ negative_prompt=None,
267
+ prompt_embeds: Optional[paddle.Tensor] = None,
268
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
269
+ lora_scale: Optional[float] = None,
270
+ **kwargs,
271
+ ):
272
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
273
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
274
+
275
+ prompt_embeds_tuple = self.encode_prompt(
276
+ prompt=prompt,
277
+ num_images_per_prompt=num_images_per_prompt,
278
+ do_classifier_free_guidance=do_classifier_free_guidance,
279
+ negative_prompt=negative_prompt,
280
+ prompt_embeds=prompt_embeds,
281
+ negative_prompt_embeds=negative_prompt_embeds,
282
+ lora_scale=lora_scale,
283
+ **kwargs,
284
+ )
285
+
286
+ # concatenate for backwards comp
287
+ prompt_embeds = paddle.concat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
288
+
289
+ return prompt_embeds
290
+
291
+ def encode_prompt(
292
+ self,
293
+ prompt,
294
+ num_images_per_prompt,
295
+ do_classifier_free_guidance,
296
+ negative_prompt=None,
297
+ prompt_embeds: Optional[paddle.Tensor] = None,
298
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
299
+ lora_scale: Optional[float] = None,
300
+ clip_skip: Optional[int] = None,
301
+ ):
302
+ r"""
303
+ Encodes the prompt into text encoder hidden states.
304
+
305
+ Args:
306
+ prompt (`str` or `List[str]`, *optional*):
307
+ prompt to be encoded
308
+ num_images_per_prompt (`int`):
309
+ number of images that should be generated per prompt
310
+ do_classifier_free_guidance (`bool`):
311
+ whether to use classifier free guidance or not
312
+ negative_prompt (`str` or `List[str]`, *optional*):
313
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
314
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
315
+ less than `1`).
316
+ prompt_embeds (`paddle.Tensor`, *optional*):
317
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
318
+ provided, text embeddings will be generated from `prompt` input argument.
319
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
320
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
321
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
322
+ argument.
323
+ lora_scale (`float`, *optional*):
324
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
325
+ clip_skip (`int`, *optional*):
326
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
327
+ the output of the pre-final layer will be used for computing the prompt embeddings.
328
+ """
329
+ # set lora scale so that monkey patched LoRA
330
+ # function of text encoder can correctly access it
331
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
332
+ self._lora_scale = lora_scale
333
+
334
+ # dynamically adjust the LoRA scale
335
+ if not USE_PEFT_BACKEND:
336
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
337
+ else:
338
+ scale_lora_layers(self.text_encoder, lora_scale)
339
+
340
+ if prompt is not None and isinstance(prompt, str):
341
+ batch_size = 1
342
+ elif prompt is not None and isinstance(prompt, list):
343
+ batch_size = len(prompt)
344
+ else:
345
+ batch_size = prompt_embeds.shape[0]
346
+
347
+ if prompt_embeds is None:
348
+ # textual inversion: process multi-vector tokens if necessary
349
+ if isinstance(self, TextualInversionLoaderMixin):
350
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
351
+
352
+ text_inputs = self.tokenizer(
353
+ prompt,
354
+ padding="max_length",
355
+ max_length=self.tokenizer.model_max_length,
356
+ truncation=True,
357
+ return_tensors="pd",
358
+ )
359
+ text_input_ids = text_inputs.input_ids
360
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
361
+
362
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
363
+ text_input_ids, untruncated_ids
364
+ ):
365
+ removed_text = self.tokenizer.batch_decode(
366
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
367
+ )
368
+ logger.warning(
369
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
370
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
371
+ )
372
+
373
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
374
+ attention_mask = text_inputs.attention_mask
375
+ else:
376
+ attention_mask = None
377
+
378
+ if clip_skip is None:
379
+ prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
380
+ prompt_embeds = prompt_embeds[0]
381
+ else:
382
+ prompt_embeds = self.text_encoder(
383
+ text_input_ids, attention_mask=attention_mask, output_hidden_states=True
384
+ )
385
+ # Access the `hidden_states` first, that contains a tuple of
386
+ # all the hidden states from the encoder layers. Then index into
387
+ # the tuple to access the hidden states from the desired layer.
388
+ prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
389
+ # We also need to apply the final LayerNorm here to not mess with the
390
+ # representations. The `last_hidden_states` that we typically use for
391
+ # obtaining the final prompt representations passes through the LayerNorm
392
+ # layer.
393
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
394
+
395
+ if self.text_encoder is not None:
396
+ prompt_embeds_dtype = self.text_encoder.dtype
397
+ elif self.unet is not None:
398
+ prompt_embeds_dtype = self.unet.dtype
399
+ else:
400
+ prompt_embeds_dtype = prompt_embeds.dtype
401
+
402
+ prompt_embeds = prompt_embeds.cast(dtype=prompt_embeds_dtype)
403
+
404
+ bs_embed, seq_len, _ = prompt_embeds.shape
405
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
406
+ prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
407
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
408
+
409
+ # get unconditional embeddings for classifier free guidance
410
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
411
+ uncond_tokens: List[str]
412
+ if negative_prompt is None:
413
+ uncond_tokens = [""] * batch_size
414
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
415
+ raise TypeError(
416
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
417
+ f" {type(prompt)}."
418
+ )
419
+ elif isinstance(negative_prompt, str):
420
+ uncond_tokens = [negative_prompt]
421
+ elif batch_size != len(negative_prompt):
422
+ raise ValueError(
423
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
424
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
425
+ " the batch size of `prompt`."
426
+ )
427
+ else:
428
+ uncond_tokens = negative_prompt
429
+
430
+ # textual inversion: process multi-vector tokens if necessary
431
+ if isinstance(self, TextualInversionLoaderMixin):
432
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
433
+
434
+ max_length = prompt_embeds.shape[1]
435
+ uncond_input = self.tokenizer(
436
+ uncond_tokens,
437
+ padding="max_length",
438
+ max_length=max_length,
439
+ truncation=True,
440
+ return_tensors="pd",
441
+ )
442
+
443
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
444
+ attention_mask = uncond_input.attention_mask
445
+ else:
446
+ attention_mask = None
447
+
448
+ negative_prompt_embeds = self.text_encoder(
449
+ uncond_input.input_ids,
450
+ attention_mask=attention_mask,
451
+ )
452
+ negative_prompt_embeds = negative_prompt_embeds[0]
453
+
454
+ if do_classifier_free_guidance:
455
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
456
+ seq_len = negative_prompt_embeds.shape[1]
457
+
458
+ negative_prompt_embeds = negative_prompt_embeds.cast(dtype=prompt_embeds_dtype)
459
+
460
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
461
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
462
+
463
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
464
+ # Retrieve the original scale by scaling back the LoRA layers
465
+ unscale_lora_layers(self.text_encoder, lora_scale)
466
+
467
+ return prompt_embeds, negative_prompt_embeds
468
+
469
def encode_image(self, image, num_images_per_prompt):
    """
    Encode an image into image embeddings plus an all-zero unconditional counterpart.

    Args:
        image:
            Either a `paddle.Tensor` (used as-is) or any input accepted by `self.feature_extractor`.
        num_images_per_prompt (`int`):
            How many times each embedding row is repeated along axis 0.

    Returns:
        `tuple`: `(image_embeds, uncond_image_embeds)`, where the second element is a zero tensor shaped
        like the first.
    """
    # Use the dtype of the first parameter of the image encoder as the working dtype.
    first_named_param = next(self.image_encoder.named_parameters())
    encoder_dtype = first_named_param[1].dtype

    # Non-tensor inputs are run through the feature extractor to obtain pixel values.
    if not isinstance(image, paddle.Tensor):
        extracted = self.feature_extractor(image, return_tensors="pd")
        image = extracted.pixel_values

    pixel_values = image.cast(dtype=encoder_dtype)
    encoded = self.image_encoder(pixel_values)
    image_embeds = encoded.image_embeds.repeat_interleave(num_images_per_prompt, axis=0)

    return image_embeds, paddle.zeros_like(image_embeds)
481
+
482
def run_safety_checker(self, image, dtype):
    """
    Run the optional safety checker over generated images.

    Args:
        image:
            Generated images, either a paddle tensor or an array accepted by
            `self.image_processor.numpy_to_pil`.
        dtype:
            Dtype the feature-extractor pixel values are cast to before being fed to the checker.

    Returns:
        `tuple`: `(image, has_nsfw_concept)`. When no safety checker is configured the image is returned
        unchanged and `has_nsfw_concept` is `None`.
    """
    # Nothing to check: hand the image back untouched.
    if self.safety_checker is None:
        return image, None

    # The feature extractor works on PIL images, so convert first.
    if paddle.is_tensor(image):
        pil_images = self.image_processor.postprocess(image, output_type="pil")
    else:
        pil_images = self.image_processor.numpy_to_pil(image)

    checker_inputs = self.feature_extractor(pil_images, return_tensors="pd")
    clip_input = checker_inputs.pixel_values.cast(dtype=dtype)
    image, has_nsfw_concept = self.safety_checker(images=image, clip_input=clip_input)
    return image, has_nsfw_concept
495
+
496
def decode_latents(self, latents):
    """
    Decode latents into a float32 numpy image batch (deprecated).

    Emits a deprecation notice, undoes the VAE scaling factor, decodes with the VAE, maps the result
    with `x / 2 + 0.5`, clips to `[0, 1]`, and returns the result transposed with `[0, 2, 3, 1]`.

    Args:
        latents: Latent tensor to decode.

    Returns:
        `numpy.ndarray`: The decoded images.
    """
    deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
    deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)

    inverse_scale = 1 / self.vae.config.scaling_factor
    decoded = self.vae.decode(inverse_scale * latents, return_dict=False)[0]
    rescaled = (decoded / 2 + 0.5).clip(0, 1)

    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
    as_float = rescaled.cast(dtype=paddle.float32)
    channels_last = as_float.transpose([0, 2, 3, 1])
    return channels_last.cpu().numpy()
506
+
507
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Collect the optional keyword arguments that `self.scheduler.step` accepts.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are forwarded only
    when the scheduler declares a parameter of that name. `eta` (η) corresponds to η in the DDIM paper
    (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

    Args:
        generator: Random generator(s) to forward to the scheduler step, if supported.
        eta: The η value to forward to the scheduler step, if supported.

    Returns:
        `dict`: Keyword arguments to splat into `self.scheduler.step(...)`.
    """
    step_parameters = inspect.signature(self.scheduler.step).parameters

    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_parameters}
523
+
524
def check_inputs(
    self,
    prompt,
    height,
    width,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    callback_on_step_end_tensor_inputs=None,
):
    """
    Validate the arguments of a pipeline call.

    Checks, in order: image size divisibility by 8, `callback_steps`, the requested callback tensor
    names, the mutual exclusivity and type of `prompt` / `prompt_embeds`, the mutual exclusivity of
    `negative_prompt` / `negative_prompt_embeds`, and shape agreement of the two embedding tensors.

    Raises:
        ValueError: On the first violated constraint.
    """
    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    if callback_steps is not None:
        is_positive_int = isinstance(callback_steps, int) and callback_steps > 0
        if not is_positive_int:
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

    if callback_on_step_end_tensor_inputs is not None:
        # Names the callback asked for that this pipeline does not expose.
        unknown_inputs = [
            k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs
        ]
        if unknown_inputs:
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {unknown_inputs}"
            )

    has_prompt = prompt is not None
    has_prompt_embeds = prompt_embeds is not None

    if has_prompt and has_prompt_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_prompt_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if has_prompt_embeds and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )
575
+
576
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
    """
    Create (or adopt) the initial latents and scale them for the scheduler.

    Args:
        batch_size: Effective batch size of the latents.
        num_channels_latents: Number of latent channels.
        height, width: Image size in pixels; each is integer-divided by `self.vae_scale_factor`.
        dtype: Dtype of the returned latents.
        generator: Random generator, or a list with one generator per batch element.
        latents: Optional pre-made latents; when given they are cast to `dtype` instead of sampled.

    Returns:
        The latents multiplied by `self.scheduler.init_noise_sigma`.

    Raises:
        ValueError: If `generator` is a list whose length differs from `batch_size`.
    """
    latent_height = height // self.vae_scale_factor
    latent_width = width // self.vae_scale_factor
    shape = (batch_size, num_channels_latents, latent_height, latent_width)

    if isinstance(generator, list):
        if len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

    if latents is not None:
        initial = latents.cast(dtype)
    else:
        initial = randn_tensor(shape, generator=generator, dtype=dtype)

    # scale the initial noise by the standard deviation required by the scheduler
    return initial * self.scheduler.init_noise_sigma
592
+
593
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=paddle.float32):
    """
    Build sinusoidal embeddings of the guidance weights `w`.

    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

    Args:
        w (`paddle.Tensor`):
            1-D tensor of guidance weights to embed (it is multiplied by 1000 before embedding).
        embedding_dim (`int`, *optional*, defaults to 512):
            dimension of the embeddings to generate
        dtype:
            data type of the generated embeddings

    Returns:
        `paddle.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`
    """
    assert len(w.shape) == 1
    w = w * 1000.0

    half_dim = embedding_dim // 2
    emb = paddle.log(paddle.to_tensor(10000.0)) / (half_dim - 1)
    emb = paddle.exp(paddle.arange(half_dim, dtype=dtype) * -emb)
    emb = w.cast(dtype=dtype)[:, None] * emb[None, :]
    emb = paddle.concat([paddle.sin(emb), paddle.cos(emb)], axis=1)
    if embedding_dim % 2 == 1:
        # Odd target width: sin/cos only produce 2 * half_dim columns, so append one zero column.
        # `paddle.concat` takes the tensors as a single list; the padding uses emb's dtype so the
        # concatenated operands agree.
        zero_column = paddle.zeros([emb.shape[0], 1], dtype=emb.dtype)
        emb = paddle.concat([emb, zero_column], axis=-1)
    assert emb.shape == [w.shape[0], embedding_dim]
    return emb
620
+
621
@property
def guidance_scale(self):
    """Return the value stored in `self._guidance_scale`."""
    return self._guidance_scale
624
+
625
@property
def guidance_rescale(self):
    """Return the value stored in `self._guidance_rescale`."""
    return self._guidance_rescale
628
+
629
@property
def clip_skip(self):
    """Return the value stored in `self._clip_skip`."""
    return self._clip_skip
632
+
633
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
@property
def do_classifier_free_guidance(self):
    """
    Whether classifier-free guidance is active.

    True only when the stored guidance scale is greater than 1 and the UNet config has no
    `time_cond_proj_dim`.
    """
    if not (self._guidance_scale > 1):
        return False
    return self.unet.config.time_cond_proj_dim is None
639
+
640
@property
def cross_attention_kwargs(self):
    """Return the value stored in `self._cross_attention_kwargs`."""
    return self._cross_attention_kwargs
643
+
644
@property
def num_timesteps(self):
    """Return the value stored in `self._num_timesteps`."""
    return self._num_timesteps
647
+
648
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 50,
    timesteps: List[int] = None,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
    latents: Optional[paddle.Tensor] = None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    ip_adapter_image: Optional[PipelineImageInput] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    guidance_rescale: float = 0.0,
    clip_skip: Optional[int] = None,
    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    **kwargs,
):
    r"""
    The call function to the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
        height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The width in pixels of the generated image.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        timesteps (`List[int]`, *optional*):
            Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
            in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
            passed will be used. Must be in descending order.
        guidance_scale (`float`, *optional*, defaults to 7.5):
            A higher guidance scale value encourages the model to generate images closely linked to the text
            `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide what to not include in image generation. If not defined, you need to
            pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale <= 1`).
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
            to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
        generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
            A [`paddle.Generator`] to make generation deterministic.
        latents (`paddle.Tensor`, *optional*):
            Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor is generated by sampling using the supplied random `generator`.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
            provided, text embeddings are generated from the `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
            not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
        ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between `PIL.Image` or `np.array`.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] instead of a
            plain tuple.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
            [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
            using zero terminal SNR.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.
        callback_on_step_end (`Callable`, *optional*):
            A function that calls at the end of each denoising steps during the inference. The function is called
            with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
            callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
            `callback_on_step_end_tensor_inputs`. It may return a dict whose `latents`, `prompt_embeds` and
            `negative_prompt_embeds` entries replace the current values; returning `None` leaves them unchanged.
        callback_on_step_end_tensor_inputs (`List`, *optional*):
            The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
            will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
            `._callback_tensor_inputs` attribute of your pipeline class.
        **kwargs:
            Accepts the deprecated `callback` and `callback_steps` arguments. When `callback` is given without
            `callback_steps`, it is invoked on every loop iteration.

    Examples:

    Returns:
        [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] or `tuple`:
            If `return_dict` is `True`, [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] is returned,
            otherwise a `tuple` is returned where the first element is a list with the generated images and the
            second element is a list of `bool`s indicating whether the corresponding generated image contains
            "not-safe-for-work" (nsfw) content.
    """

    callback = kwargs.pop("callback", None)
    callback_steps = kwargs.pop("callback_steps", None)

    if callback is not None:
        deprecate(
            "callback",
            "1.0.0",
            "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
        )
    if callback_steps is not None:
        deprecate(
            "callback_steps",
            "1.0.0",
            "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
        )

    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        height,
        width,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
        callback_on_step_end_tensor_inputs,
    )

    # `callback_steps` is optional; without this fallback `i % None` would raise a TypeError
    # when only the deprecated `callback` is supplied.
    callback_every = callback_steps if callback_steps is not None else 1

    self._guidance_scale = guidance_scale
    self._guidance_rescale = guidance_rescale
    self._clip_skip = clip_skip
    self._cross_attention_kwargs = cross_attention_kwargs

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # 3. Encode input prompt
    lora_scale = (
        self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
    )

    prompt_embeds, negative_prompt_embeds = self.encode_prompt(
        prompt,
        num_images_per_prompt,
        self.do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=lora_scale,
        clip_skip=self.clip_skip,
    )

    # For classifier free guidance, we need to do two forward passes.
    # Here we concatenate the unconditional and text embeddings into a single batch
    # to avoid doing two forward passes
    if self.do_classifier_free_guidance:
        prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])

    if ip_adapter_image is not None:
        image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, num_images_per_prompt)
        if self.do_classifier_free_guidance:
            image_embeds = paddle.concat([negative_image_embeds, image_embeds])

    # 4. Prepare timesteps
    timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, timesteps)

    # 5. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 6.1 Add image embeds for IP-Adapter
    added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

    # 6.2 Optionally get Guidance Scale Embedding
    timestep_cond = None
    if self.unet.config.time_cond_proj_dim is not None:
        guidance_scale_tensor = paddle.to_tensor([self.guidance_scale - 1]).tile(
            [
                batch_size * num_images_per_prompt,
            ]
        )
        timestep_cond = self.get_guidance_scale_embedding(
            guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
        ).cast(dtype=latents.dtype)

    # 7. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    self._num_timesteps = len(timesteps)
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = paddle.concat([latents] * 2) if self.do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                timestep_cond=timestep_cond,
                cross_attention_kwargs=self.cross_attention_kwargs,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if self.do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

            if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
                # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            if callback_on_step_end is not None:
                # The requested tensors are looked up by name among this function's locals,
                # so the local variable names above must stay as they are.
                callback_kwargs = {}
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                # A callback that returns nothing simply leaves the tensors untouched.
                if callback_outputs is not None:
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

            # advance the progress bar and call the deprecated callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_every == 0:
                    step_idx = i // getattr(self.scheduler, "order", 1)
                    callback(step_idx, t, latents)

    if not output_type == "latent":
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
            0
        ]
        image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
    else:
        image = latents
        has_nsfw_concept = None

    # Images flagged by the safety checker are not denormalized.
    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

    if not return_dict:
        return (image, has_nsfw_concept)

    return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/animatediff/pipeline_animatediff.py ADDED
@@ -0,0 +1,657 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from dataclasses import dataclass
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import paddle
21
+
22
+ from ppdiffusers.transformers import (
23
+ CLIPImageProcessor,
24
+ CLIPTextModel,
25
+ CLIPTokenizer,
26
+ CLIPVisionModelWithProjection,
27
+ )
28
+
29
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
30
+ from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
31
+ from ...models import AutoencoderKL, UNet2DConditionModel, UNetMotionModel
32
+ from ...models.lora import adjust_lora_scale_text_encoder
33
+ from ...models.unet_motion_model import MotionAdapter
34
+ from ...schedulers import (
35
+ DDIMScheduler,
36
+ DPMSolverMultistepScheduler,
37
+ EulerAncestralDiscreteScheduler,
38
+ EulerDiscreteScheduler,
39
+ LMSDiscreteScheduler,
40
+ PNDMScheduler,
41
+ )
42
+ from ...utils import USE_PEFT_BACKEND, BaseOutput, logging
43
+ from ...utils.paddle_utils import randn_tensor
44
+ from ..pipeline_utils import DiffusionPipeline
45
+
46
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
47
+
48
+ EXAMPLE_DOC_STRING = """
49
+ Examples:
50
+ ```py
51
+ >>> import paddle
52
+ >>> from ppdiffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler
53
+ >>> from ppdiffusers.utils import export_to_gif
54
+
55
+ >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
56
+ >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
57
+ >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)
58
+ >>> output = pipe(prompt="A corgi walking in the park")
59
+ >>> frames = output.frames[0]
60
+ >>> export_to_gif(frames, "animation.gif")
61
+ ```
62
+ """
63
+
64
+
65
def tensor2vid(video: paddle.Tensor, processor, output_type="np"):
    """
    Post-process a batch of videos one batch element at a time.

    Based on:
    https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78

    Args:
        video: 5-D tensor; its first axis is iterated over, and each element has its first two axes
            swapped before post-processing.
        processor: Object exposing `postprocess(frames, output_type)`.
        output_type: Forwarded to `processor.postprocess`.

    Returns:
        `list`: One post-processed entry per batch element.
    """
    # Unpacking five values also rejects inputs that are not 5-D.
    batch_size, _, _, _, _ = video.shape

    return [
        processor.postprocess(video[batch_idx].transpose([1, 0, 2, 3]), output_type)
        for batch_idx in range(batch_size)
    ]
78
+
79
+
80
@dataclass
class AnimateDiffPipelineOutput(BaseOutput):
    """
    Output container with a single `frames` field.

    NOTE(review): presumably holds the generated video frames of the AnimateDiff pipeline, as a paddle
    tensor or a numpy array per the annotation - confirm against the pipeline's `__call__`.
    """

    frames: Union[paddle.Tensor, np.ndarray]
83
+
84
+
85
+ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
86
+ r"""
87
+ Pipeline for text-to-video generation.
88
+
89
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
90
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
91
+
92
+ Args:
93
+ vae ([`AutoencoderKL`]):
94
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
95
+ text_encoder ([`CLIPTextModel`]):
96
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
97
+ tokenizer (`CLIPTokenizer`):
98
+ A [`~transformers.CLIPTokenizer`] to tokenize text.
99
+ unet ([`UNet2DConditionModel`]):
100
+ A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
101
+ motion_adapter ([`MotionAdapter`]):
102
+ A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
103
+ scheduler ([`SchedulerMixin`]):
104
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
105
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
106
+ """
107
+
108
+ model_cpu_offload_seq = "text_encoder->unet->vae"
109
+ _optional_components = ["feature_extractor", "image_encoder"]
110
+
111
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    motion_adapter: MotionAdapter,
    scheduler: Union[
        DDIMScheduler,
        PNDMScheduler,
        LMSDiscreteScheduler,
        EulerDiscreteScheduler,
        EulerAncestralDiscreteScheduler,
        DPMSolverMultistepScheduler,
    ],
    feature_extractor: CLIPImageProcessor = None,
    image_encoder: CLIPVisionModelWithProjection = None,
):
    """Register the pipeline components and derive the VAE scale factor.

    Args:
        vae: Autoencoder used to decode latents into frames.
        text_encoder: Text encoder used by `encode_prompt`.
        tokenizer: Tokenizer paired with `text_encoder`.
        unet: 2D UNet that is combined with `motion_adapter` into a `UNetMotionModel`.
        motion_adapter: Motion adapter merged into `unet`; also registered on its own.
        scheduler: Scheduler driving the denoising loop.
        feature_extractor: Optional image pre-processor used by `encode_image`.
        image_encoder: Optional image encoder used by `encode_image`.
    """
    super().__init__()
    # The registered `unet` is the motion-aware model built from the 2D UNet and the
    # adapter, not the 2D UNet that was passed in.
    unet = UNetMotionModel.from_unet2d(unet, motion_adapter)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        motion_adapter=motion_adapter,
        scheduler=scheduler,
        feature_extractor=feature_extractor,
        image_encoder=image_encoder,
    )
    # Pixel-to-latent downsampling factor: one factor of 2 per VAE block after the first.
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
144
+
145
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
146
def encode_prompt(
    self,
    prompt,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    lora_scale: Optional[float] = None,
    clip_skip: Optional[int] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        num_images_per_prompt (`int`):
            number of generations per prompt; every embedding is repeated this many times. This pipeline's
            `__call__` passes its `num_videos_per_prompt` value here.
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Only used when `do_classifier_free_guidance` is `True` and
            `negative_prompt_embeds` is not given.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        lora_scale (`float`, *optional*):
            A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.

    Returns:
        `tuple`: `(prompt_embeds, negative_prompt_embeds)`. The second element is returned exactly as it was
        passed in (possibly `None`) when `do_classifier_free_guidance` is `False`.

    Raises:
        TypeError: If `negative_prompt` and `prompt` are of different types.
        ValueError: If a list `negative_prompt` does not have one entry per prompt.
    """
    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, LoraLoaderMixin):
        self._lora_scale = lora_scale

        # dynamically adjust the LoRA scale
        if not USE_PEFT_BACKEND:
            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)

    # The batch size comes from the prompt when one is given, otherwise from the
    # leading dimension of the pre-computed embeddings.
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pd",
        )
        text_input_ids = text_inputs.input_ids
        # Tokenize a second time without truncation, only to detect and report dropped text.
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids

        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
            text_input_ids, untruncated_ids
        ):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        # Only pass an attention mask when the text encoder config asks for one.
        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = text_inputs.attention_mask
        else:
            attention_mask = None

        if clip_skip is None:
            prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
            prompt_embeds = prompt_embeds[0]
        else:
            prompt_embeds = self.text_encoder(
                text_input_ids, attention_mask=attention_mask, output_hidden_states=True
            )
            # Access the `hidden_states` first, that contains a tuple of
            # all the hidden states from the encoder layers. Then index into
            # the tuple to access the hidden states from the desired layer.
            prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
            # We also need to apply the final LayerNorm here to not mess with the
            # representations. The `last_hidden_states` that we typically use for
            # obtaining the final prompt representations passes through the LayerNorm
            # layer.
            prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)

    # Target dtype: prefer the text encoder's, then the UNet's, else keep the embeddings' own.
    if self.text_encoder is not None:
        prompt_embeds_dtype = self.text_encoder.dtype
    elif self.unet is not None:
        prompt_embeds_dtype = self.unet.dtype
    else:
        prompt_embeds_dtype = prompt_embeds.dtype

    prompt_embeds = prompt_embeds.cast(dtype=prompt_embeds_dtype)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt: tiling along the sequence
    # axis and reshaping places the copies of each prompt next to each other in the batch
    prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
    prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            # No negative prompt: use one empty string per prompt.
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

        # Pad/truncate the negative prompt to the sequence length of the positive embeddings.
        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pd",
        )

        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = uncond_input.attention_mask
        else:
            attention_mask = None

        negative_prompt_embeds = self.text_encoder(
            uncond_input.input_ids,
            attention_mask=attention_mask,
        )
        negative_prompt_embeds = negative_prompt_embeds[0]

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, same tile + reshape as above
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.cast(dtype=prompt_embeds_dtype)

        negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
        negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])

    return prompt_embeds, negative_prompt_embeds
317
+
318
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
319
def encode_image(self, image, num_images_per_prompt):
    """Embed an image with `self.image_encoder` and build a zero "unconditional" twin.

    Args:
        image: A `paddle.Tensor`, or any other input, which is first run through
            `self.feature_extractor` to obtain pixel values.
        num_images_per_prompt: Repetition count applied to every embedding along axis 0.

    Returns:
        `(image_embeds, uncond_image_embeds)`, where the second tensor is all zeros
        with the same shape as the first.
    """
    # Run the encoder in the dtype of its first registered parameter.
    first_named_param = next(self.image_encoder.named_parameters())
    encoder_dtype = first_named_param[1].dtype

    pixel_values = image
    if not isinstance(pixel_values, paddle.Tensor):
        extracted = self.feature_extractor(pixel_values, return_tensors="pd")
        pixel_values = extracted.pixel_values

    encoder_output = self.image_encoder(pixel_values.cast(dtype=encoder_dtype))
    cond_embeds = encoder_output.image_embeds.repeat_interleave(num_images_per_prompt, axis=0)

    return cond_embeds, paddle.zeros_like(cond_embeds)
331
+
332
+ # Copied from ppdiffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
333
def decode_latents(self, latents):
    """Decode video latents frame by frame with the VAE.

    Args:
        latents: Five-dimensional latents whose shape is unpacked as
            `(batch_size, channels, num_frames, height, width)`.

    Returns:
        The decoded video cast to float32, laid out as
        `(batch_size, channels, num_frames, height, width)`.
    """
    unscaled = 1 / self.vae.config.scaling_factor * latents

    n_videos, n_channels, n_frames, lat_height, lat_width = unscaled.shape
    # Fold the frame axis into the batch axis so the VAE sees a batch of single frames.
    frames_first = unscaled.transpose([0, 2, 1, 3, 4])
    flat_latents = frames_first.reshape([n_videos * n_frames, n_channels, lat_height, lat_width])

    decoded = self.vae.decode(flat_latents).sample

    # Split the batch axis back into (video, frame), then put channels ahead of frames again.
    unfolded_shape = [n_videos, n_frames, -1] + decoded.shape[2:]
    video = decoded[None, :].reshape(unfolded_shape).transpose([0, 2, 1, 3, 4])

    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
    return video.cast("float32")
355
+
356
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
357
def prepare_extra_step_kwargs(self, generator, eta):
    """Collect the optional keyword arguments that `self.scheduler.step` accepts.

    Not all schedulers share the same `step` signature, so `eta` and `generator`
    are only forwarded when the signature names them. eta (η) corresponds to η in
    the DDIM paper (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

    Args:
        generator: Value stored under `"generator"` when `step` accepts it.
        eta: Value stored under `"eta"` when `step` accepts it.

    Returns:
        A dict containing the accepted subset of `{"eta", "generator"}`.
    """
    step_parameters = set(inspect.signature(self.scheduler.step).parameters.keys())
    candidates = (("eta", eta), ("generator", generator))
    return {key: value for key, value in candidates if key in step_parameters}
373
+
374
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
375
def check_inputs(
    self,
    prompt,
    height,
    width,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    callback_on_step_end_tensor_inputs=None,
):
    """Validate the user-facing call arguments; returns nothing on success.

    Checks run in this order, and the first failure raises:
    spatial size, `callback_steps`, `callback_on_step_end_tensor_inputs`,
    prompt / `prompt_embeds` exclusivity and type, negative prompt exclusivity,
    and finally the shapes of the two embedding tensors.

    Raises:
        ValueError: On the first argument combination that is not accepted.
    """
    if not (height % 8 == 0 and width % 8 == 0):
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    # `None` is accepted here; anything else must be a strictly positive int.
    if callback_steps is not None:
        is_positive_int = isinstance(callback_steps, int) and callback_steps > 0
        if not is_positive_int:
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

    if callback_on_step_end_tensor_inputs is not None:
        if any(k not in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

    has_prompt = prompt is not None
    has_embeds = prompt_embeds is not None

    # Exactly one of `prompt` / `prompt_embeds` must be supplied.
    if has_prompt and has_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    has_negative_embeds = negative_prompt_embeds is not None

    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if has_embeds and has_negative_embeds and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )
426
+
427
+ # Copied from ppdiffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
428
def prepare_latents(
    self, batch_size, num_channels_latents, num_frames, height, width, dtype, generator, latents=None
):
    """Create (or adopt) the initial latents and scale them for the scheduler.

    Args:
        batch_size: Effective batch size; also the required length of a generator list.
        num_channels_latents: Channel count of the latent tensor.
        num_frames: Number of frames of the latent video.
        height: Pixel height, divided by `self.vae_scale_factor` for the latent shape.
        width: Pixel width, divided by `self.vae_scale_factor` for the latent shape.
        dtype: Dtype of the sampled latents, or the cast target for supplied ones.
        generator: Single generator or list of generators forwarded to `randn_tensor`.
        latents: Optional pre-made latents; when given, no noise is sampled.

    Returns:
        The latents multiplied by `self.scheduler.init_noise_sigma`.

    Raises:
        ValueError: If `generator` is a list whose length differs from `batch_size`.
    """
    downscale = self.vae_scale_factor
    shape = (batch_size, num_channels_latents, num_frames, height // downscale, width // downscale)

    if isinstance(generator, list):
        if len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

    if latents is not None:
        initial = latents.cast(dtype=dtype)
    else:
        initial = randn_tensor(shape, generator=generator, dtype=dtype)

    # scale the initial noise by the standard deviation required by the scheduler
    return initial * self.scheduler.init_noise_sigma
452
+
453
@paddle.no_grad()
def __call__(
    self,
    prompt: Optional[Union[str, List[str]]] = None,
    num_frames: Optional[int] = 16,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_videos_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
    latents: Optional[paddle.Tensor] = None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    ip_adapter_image: Optional[PipelineImageInput] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
    callback_steps: Optional[int] = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    clip_skip: Optional[int] = None,
):
    r"""
    The call function to the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide video generation. If not defined, you need to pass `prompt_embeds`.
        num_frames (`int`, *optional*, defaults to 16):
            The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
            amounts to 2 seconds of video.
        height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The height in pixels of the generated video.
        width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The width in pixels of the generated video.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
            expense of slower inference.
        guidance_scale (`float`, *optional*, defaults to 7.5):
            A higher guidance scale value encourages the model to generate images closely linked to the text
            `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide what to not include in video generation. If not defined, you need to
            pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale <= 1`).
        num_videos_per_prompt (`int`, *optional*, defaults to 1):
            Accepted for interface compatibility only: the pipeline currently always generates one video per
            prompt and ignores the value passed here.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
            to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
        generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
            A [`paddle.Generator`] to make generation deterministic.
        latents (`paddle.Tensor`, *optional*):
            Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
            `(batch_size, num_channel, num_frames, height, width)`.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
            provided, text embeddings are generated from the `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
            not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
        ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated video. `"latent"` returns the denoised latents without decoding,
            `"pd"` returns the decoded video as a `paddle.Tensor`, and any other value (e.g. `"pil"` or `"np"`)
            is forwarded to the image processor for every video of the batch.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return an [`AnimateDiffPipelineOutput`] instead of a plain tuple.
        callback (`Callable`, *optional*):
            A function that calls every `callback_steps` steps during inference. The function is called with the
            following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function is called. If `None`, the callback is called at
            every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
            [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.
    Examples:

    Returns:
        [`AnimateDiffPipelineOutput`] or `tuple`:
            If `return_dict` is `True`, an [`AnimateDiffPipelineOutput`] is returned, otherwise a `tuple` is
            returned where the first element is the generated video (or the latents for
            `output_type="latent"`).
    """
    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor

    # NOTE(review): the caller-supplied value is deliberately discarded here, so the
    # pipeline always produces one video per prompt. Kept as-is; confirm before lifting.
    num_videos_per_prompt = 1

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
    )
    prompt_embeds, negative_prompt_embeds = self.encode_prompt(
        prompt,
        num_videos_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
        clip_skip=clip_skip,
    )
    # For classifier free guidance, we need to do two forward passes.
    # Here we concatenate the unconditional and text embeddings into a single batch
    # to avoid doing two forward passes
    if do_classifier_free_guidance:
        prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])

    if ip_adapter_image is not None:
        image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, num_videos_per_prompt)
        if do_classifier_free_guidance:
            image_embeds = paddle.concat([negative_image_embeds, image_embeds])

    # 4. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps)
    timesteps = self.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    latents = self.prepare_latents(
        batch_size * num_videos_per_prompt,
        num_channels_latents,
        num_frames,
        height,
        width,
        prompt_embeds.dtype,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7. Add image embeds for IP-Adapter
    added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

    # `check_inputs` accepts `callback_steps=None`; treat it as "call back at every step"
    # instead of failing on `i % None` once a callback is supplied.
    callback_interval = 1 if callback_steps is None else callback_steps

    # 8. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
                added_cond_kwargs=added_cond_kwargs,
            ).sample

            # perform guidance: the first half of the batch is unconditional, the second is text-conditioned
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_interval == 0:
                    callback(i, t, latents)

    # 9. Post-processing. Every output type goes through the same `return_dict` handling,
    # so `output_type="latent"` with `return_dict=False` yields a tuple as documented.
    if output_type == "latent":
        video = latents
    else:
        video_tensor = self.decode_latents(latents)
        if output_type == "pd":
            video = video_tensor
        else:
            video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)

    if not return_dict:
        return (video,)

    return AnimateDiffPipelineOutput(frames=video)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from ...utils import (
18
+ PPDIFFUSERS_SLOW_IMPORT,
19
+ OptionalDependencyNotAvailable,
20
+ _LazyModule,
21
+ is_paddle_available,
22
+ is_paddlenlp_available,
23
+ is_paddlenlp_version,
24
+ )
25
+
26
# Registries filled by the availability check below:
#   _dummy_objects    - placeholder objects exposed when the dependencies are missing
#   _import_structure - submodule name -> public names, handed to `_LazyModule`
_dummy_objects = {}
_import_structure = {}

# The pipeline needs both paddle and paddlenlp (>= 2.5.2). When they are not available,
# the dummy `AudioLDMPipeline` is registered in place of the real one.
try:
    if not (is_paddlenlp_available() and is_paddle_available() and is_paddlenlp_version(">=", "2.5.2")):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils.dummy_paddle_and_paddlenlp_objects import AudioLDMPipeline

    _dummy_objects.update({"AudioLDMPipeline": AudioLDMPipeline})
else:
    _import_structure["pipeline_audioldm"] = ["AudioLDMPipeline"]


if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
    # Eager path (type checkers, or slow import requested): import the real or the
    # dummy class directly, using the same availability check as above.
    try:
        if not (is_paddlenlp_available() and is_paddle_available() and is_paddlenlp_version(">=", "2.5.2")):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_paddle_and_paddlenlp_objects import AudioLDMPipeline

    else:
        from .pipeline_audioldm import AudioLDMPipeline
else:
    import sys

    # Lazy path: replace this module in `sys.modules` with a `_LazyModule` built from
    # `_import_structure` (presumably it defers the submodule import until first
    # attribute access - confirm against `_LazyModule`).
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    # Attach the dummy placeholders to the replacement module so the names still resolve.
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py ADDED
@@ -0,0 +1,553 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import paddle
20
+ import paddle.nn.functional as F
21
+
22
+ from ppdiffusers.transformers import (
23
+ ClapTextModelWithProjection,
24
+ RobertaTokenizer,
25
+ SpeechT5HifiGan,
26
+ )
27
+
28
+ from ...models import AutoencoderKL, UNet2DConditionModel
29
+ from ...schedulers import KarrasDiffusionSchedulers
30
+ from ...utils import logging, replace_example_docstring
31
+ from ...utils.paddle_utils import randn_tensor
32
+ from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
33
+
34
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
+
36
+ EXAMPLE_DOC_STRING = """
37
+ Examples:
38
+ ```py
39
+ >>> from ppdiffusers import AudioLDMPipeline
40
+ >>> import paddle
41
+ >>> import scipy
42
+
43
+ >>> repo_id = "cvssp/audioldm-s-full-v2"
44
+ >>> pipe = AudioLDMPipeline.from_pretrained(repo_id, paddle_dtype=paddle.float16)
45
+
46
+ >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
47
+ >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0]
48
+
49
+ >>> # save the audio sample as a .wav file
50
+ >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
51
+ ```
52
+ """
53
+
54
+
55
+ class AudioLDMPipeline(DiffusionPipeline):
56
+ r"""
57
+ Pipeline for text-to-audio generation using AudioLDM.
58
+
59
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
60
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
61
+
62
+ Args:
63
+ vae ([`AutoencoderKL`]):
64
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
65
+ text_encoder ([`~transformers.ClapTextModelWithProjection`]):
66
+ Frozen text-encoder (`ClapTextModelWithProjection`, specifically the
67
+ [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant).
68
+ tokenizer ([`PreTrainedTokenizer`]):
69
+ A [`~transformers.RobertaTokenizer`] to tokenize text.
70
+ unet ([`UNet2DConditionModel`]):
71
+ A `UNet2DConditionModel` to denoise the encoded audio latents.
72
+ scheduler ([`SchedulerMixin`]):
73
+ A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
74
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
75
+ vocoder ([`~transformers.SpeechT5HifiGan`]):
76
+ Vocoder of class `SpeechT5HifiGan`.
77
+ """
78
+
79
+ model_cpu_offload_seq = "text_encoder->unet->vae"
80
+
81
+ def __init__(
82
+ self,
83
+ vae: AutoencoderKL,
84
+ text_encoder: ClapTextModelWithProjection,
85
+ tokenizer: RobertaTokenizer,
86
+ unet: UNet2DConditionModel,
87
+ scheduler: KarrasDiffusionSchedulers,
88
+ vocoder: SpeechT5HifiGan,
89
+ ):
90
+ super().__init__()
91
+
92
+ self.register_modules(
93
+ vae=vae,
94
+ text_encoder=text_encoder,
95
+ tokenizer=tokenizer,
96
+ unet=unet,
97
+ scheduler=scheduler,
98
+ vocoder=vocoder,
99
+ )
100
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
101
+
102
+ def _encode_prompt(
103
+ self,
104
+ prompt,
105
+ num_waveforms_per_prompt,
106
+ do_classifier_free_guidance,
107
+ negative_prompt=None,
108
+ prompt_embeds: Optional[paddle.Tensor] = None,
109
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
110
+ ):
111
+ r"""
112
+ Encodes the prompt into text encoder hidden states.
113
+
114
+ Args:
115
+ prompt (`str` or `List[str]`, *optional*):
116
+ prompt to be encoded
117
+ num_waveforms_per_prompt (`int`):
118
+ number of waveforms that should be generated per prompt
119
+ do_classifier_free_guidance (`bool`):
120
+ whether to use classifier free guidance or not
121
+ negative_prompt (`str` or `List[str]`, *optional*):
122
+ The prompt or prompts not to guide the audio generation. If not defined, one has to pass
123
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
124
+ less than `1`).
125
+ prompt_embeds (`paddle.Tensor`, *optional*):
126
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
127
+ provided, text embeddings will be generated from `prompt` input argument.
128
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
129
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
130
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
131
+ argument.
132
+ """
133
+ if self.text_encoder.text_model.embeddings.token_type_ids.dtype not in [
134
+ paddle.int16,
135
+ paddle.int32,
136
+ paddle.int64,
137
+ ]:
138
+ self.text_encoder.text_model.embeddings.token_type_ids = (
139
+ self.text_encoder.text_model.embeddings.token_type_ids.cast("int64")
140
+ )
141
+
142
+ if prompt is not None and isinstance(prompt, str):
143
+ batch_size = 1
144
+ elif prompt is not None and isinstance(prompt, list):
145
+ batch_size = len(prompt)
146
+ else:
147
+ batch_size = prompt_embeds.shape[0]
148
+
149
+ if prompt_embeds is None:
150
+ text_inputs = self.tokenizer(
151
+ prompt,
152
+ padding="max_length",
153
+ max_length=self.tokenizer.model_max_length,
154
+ return_attention_mask=True,
155
+ truncation=True,
156
+ return_tensors="pd",
157
+ )
158
+ text_input_ids = text_inputs.input_ids
159
+ attention_mask = text_inputs.attention_mask
160
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
161
+
162
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
163
+ text_input_ids, untruncated_ids
164
+ ):
165
+ removed_text = self.tokenizer.batch_decode(
166
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
167
+ )
168
+ logger.warning(
169
+ "The following part of your input was truncated because CLAP can only handle sequences up to"
170
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
171
+ )
172
+
173
+ prompt_embeds = self.text_encoder(
174
+ text_input_ids,
175
+ attention_mask=attention_mask,
176
+ )
177
+ prompt_embeds = prompt_embeds.text_embeds
178
+ # additional L_2 normalization over each hidden-state
179
+ prompt_embeds = F.normalize(prompt_embeds, axis=-1)
180
+
181
+ prompt_embeds = prompt_embeds.cast(dtype=self.unet.dtype)
182
+
183
+ (
184
+ bs_embed,
185
+ seq_len,
186
+ ) = prompt_embeds.shape
187
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
188
+ prompt_embeds = prompt_embeds.tile([1, num_waveforms_per_prompt])
189
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_waveforms_per_prompt, seq_len])
190
+
191
+ # get unconditional embeddings for classifier free guidance
192
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
193
+ uncond_tokens: List[str]
194
+ if negative_prompt is None:
195
+ uncond_tokens = [""] * batch_size
196
+ elif type(prompt) is not type(negative_prompt):
197
+ raise TypeError(
198
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
199
+ f" {type(prompt)}."
200
+ )
201
+ elif isinstance(negative_prompt, str):
202
+ uncond_tokens = [negative_prompt]
203
+ elif batch_size != len(negative_prompt):
204
+ raise ValueError(
205
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
206
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
207
+ " the batch size of `prompt`."
208
+ )
209
+ else:
210
+ uncond_tokens = negative_prompt
211
+
212
+ max_length = prompt_embeds.shape[1]
213
+ uncond_input = self.tokenizer(
214
+ uncond_tokens,
215
+ padding="max_length",
216
+ max_length=max_length,
217
+ truncation=True,
218
+ return_tensors="pd",
219
+ return_attention_mask=True,
220
+ )
221
+
222
+ uncond_input_ids = uncond_input.input_ids
223
+ attention_mask = uncond_input.attention_mask
224
+
225
+ negative_prompt_embeds = self.text_encoder(
226
+ uncond_input_ids,
227
+ attention_mask=attention_mask,
228
+ )
229
+ negative_prompt_embeds = negative_prompt_embeds.text_embeds
230
+ # additional L_2 normalization over each hidden-state
231
+ negative_prompt_embeds = F.normalize(negative_prompt_embeds, axis=-1)
232
+
233
+ if do_classifier_free_guidance:
234
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
235
+ seq_len = negative_prompt_embeds.shape[1]
236
+
237
+ negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.unet.dtype)
238
+
239
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_waveforms_per_prompt])
240
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_waveforms_per_prompt, seq_len])
241
+
242
+ # For classifier free guidance, we need to do two forward passes.
243
+ # Here we concatenate the unconditional and text embeddings into a single batch
244
+ # to avoid doing two forward passes
245
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
246
+
247
+ return prompt_embeds
248
+
249
+ def decode_latents(self, latents):
250
+ latents = 1 / self.vae.config.scaling_factor * latents
251
+ mel_spectrogram = self.vae.decode(latents).sample
252
+ return mel_spectrogram
253
+
254
+ def mel_spectrogram_to_waveform(self, mel_spectrogram):
255
+ if mel_spectrogram.dim() == 4:
256
+ mel_spectrogram = mel_spectrogram.squeeze(1)
257
+
258
+ waveform = self.vocoder(mel_spectrogram)
259
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
260
+ waveform = waveform.cast("float32").cpu()
261
+ return waveform
262
+
263
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
264
+ def prepare_extra_step_kwargs(self, generator, eta):
265
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
266
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
267
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
268
+ # and should be between [0, 1]
269
+
270
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
271
+ extra_step_kwargs = {}
272
+ if accepts_eta:
273
+ extra_step_kwargs["eta"] = eta
274
+
275
+ # check if the scheduler accepts generator
276
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
277
+ if accepts_generator:
278
+ extra_step_kwargs["generator"] = generator
279
+ return extra_step_kwargs
280
+
281
+ def check_inputs(
282
+ self,
283
+ prompt,
284
+ audio_length_in_s,
285
+ vocoder_upsample_factor,
286
+ callback_steps,
287
+ negative_prompt=None,
288
+ prompt_embeds=None,
289
+ negative_prompt_embeds=None,
290
+ ):
291
+ min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor
292
+ if audio_length_in_s < min_audio_length_in_s:
293
+ raise ValueError(
294
+ f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but "
295
+ f"is {audio_length_in_s}."
296
+ )
297
+
298
+ if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0:
299
+ raise ValueError(
300
+ f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the "
301
+ f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of "
302
+ f"{self.vae_scale_factor}."
303
+ )
304
+
305
+ if (callback_steps is None) or (
306
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
307
+ ):
308
+ raise ValueError(
309
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
310
+ f" {type(callback_steps)}."
311
+ )
312
+
313
+ if prompt is not None and prompt_embeds is not None:
314
+ raise ValueError(
315
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
316
+ " only forward one of the two."
317
+ )
318
+ elif prompt is None and prompt_embeds is None:
319
+ raise ValueError(
320
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
321
+ )
322
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
323
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
324
+
325
+ if negative_prompt is not None and negative_prompt_embeds is not None:
326
+ raise ValueError(
327
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
328
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
329
+ )
330
+
331
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
332
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
333
+ raise ValueError(
334
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
335
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
336
+ f" {negative_prompt_embeds.shape}."
337
+ )
338
+
339
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents with width->self.vocoder.config.model_in_dim
340
+ def prepare_latents(self, batch_size, num_channels_latents, height, dtype, generator, latents=None):
341
+ shape = (
342
+ batch_size,
343
+ num_channels_latents,
344
+ height // self.vae_scale_factor,
345
+ self.vocoder.config.model_in_dim // self.vae_scale_factor,
346
+ )
347
+ if isinstance(generator, list) and len(generator) != batch_size:
348
+ raise ValueError(
349
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
350
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
351
+ )
352
+
353
+ if latents is None:
354
+ latents = randn_tensor(shape, generator=generator, dtype=dtype)
355
+ else:
356
+ latents = latents.cast(dtype)
357
+
358
+ # scale the initial noise by the standard deviation required by the scheduler
359
+ latents = latents * self.scheduler.init_noise_sigma
360
+ return latents
361
+
362
+ @paddle.no_grad()
363
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
364
+ def __call__(
365
+ self,
366
+ prompt: Union[str, List[str]] = None,
367
+ audio_length_in_s: Optional[float] = None,
368
+ num_inference_steps: int = 10,
369
+ guidance_scale: float = 2.5,
370
+ negative_prompt: Optional[Union[str, List[str]]] = None,
371
+ num_waveforms_per_prompt: Optional[int] = 1,
372
+ eta: float = 0.0,
373
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
374
+ latents: Optional[paddle.Tensor] = None,
375
+ prompt_embeds: Optional[paddle.Tensor] = None,
376
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
377
+ return_dict: bool = True,
378
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
379
+ callback_steps: Optional[int] = 1,
380
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
381
+ output_type: Optional[str] = "np",
382
+ ):
383
+ r"""
384
+ The call function to the pipeline for generation.
385
+
386
+ Args:
387
+ prompt (`str` or `List[str]`, *optional*):
388
+ The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
389
+ audio_length_in_s (`int`, *optional*, defaults to 5.12):
390
+ The length of the generated audio sample in seconds.
391
+ num_inference_steps (`int`, *optional*, defaults to 10):
392
+ The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
393
+ expense of slower inference.
394
+ guidance_scale (`float`, *optional*, defaults to 2.5):
395
+ A higher guidance scale value encourages the model to generate audio that is closely linked to the text
396
+ `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
397
+ negative_prompt (`str` or `List[str]`, *optional*):
398
+ The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
399
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
400
+ num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
401
+ The number of waveforms to generate per prompt.
402
+ eta (`float`, *optional*, defaults to 0.0):
403
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
404
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
405
+ generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
406
+ A [`paddle.Generator`] to make generation deterministic.
407
+ latents (`paddle.Tensor`, *optional*):
408
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
409
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
410
+ tensor is generated by sampling using the supplied random `generator`.
411
+ prompt_embeds (`paddle.Tensor`, *optional*):
412
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
413
+ provided, text embeddings are generated from the `prompt` input argument.
414
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
415
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
416
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
417
+ return_dict (`bool`, *optional*, defaults to `True`):
418
+ Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
419
+ callback (`Callable`, *optional*):
420
+ A function that calls every `callback_steps` steps during inference. The function is called with the
421
+ following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
422
+ callback_steps (`int`, *optional*, defaults to 1):
423
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
424
+ every step.
425
+ cross_attention_kwargs (`dict`, *optional*):
426
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
427
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
428
+ output_type (`str`, *optional*, defaults to `"np"`):
429
+ The output format of the generated image. Choose between `"np"` to return a NumPy `np.ndarray` or
430
+ `"pd"` to return a Paddle `paddle.Tensor` object.
431
+
432
+ Examples:
433
+
434
+ Returns:
435
+ [`~pipelines.AudioPipelineOutput`] or `tuple`:
436
+ If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
437
+ returned where the first element is a list with the generated audio.
438
+ """
439
+ # 0. Convert audio input length from seconds to spectrogram height
440
+ vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate
441
+
442
+ if audio_length_in_s is None:
443
+ audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor
444
+
445
+ height = int(audio_length_in_s / vocoder_upsample_factor)
446
+
447
+ original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate)
448
+ if height % self.vae_scale_factor != 0:
449
+ height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor
450
+ logger.info(
451
+ f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} "
452
+ f"so that it can be handled by the model. It will be cut to {audio_length_in_s} after the "
453
+ f"denoising process."
454
+ )
455
+
456
+ # 1. Check inputs. Raise error if not correct
457
+ self.check_inputs(
458
+ prompt,
459
+ audio_length_in_s,
460
+ vocoder_upsample_factor,
461
+ callback_steps,
462
+ negative_prompt,
463
+ prompt_embeds,
464
+ negative_prompt_embeds,
465
+ )
466
+
467
+ # 2. Define call parameters
468
+ if prompt is not None and isinstance(prompt, str):
469
+ batch_size = 1
470
+ elif prompt is not None and isinstance(prompt, list):
471
+ batch_size = len(prompt)
472
+ else:
473
+ batch_size = prompt_embeds.shape[0]
474
+
475
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
476
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
477
+ # corresponds to doing no classifier free guidance.
478
+ do_classifier_free_guidance = guidance_scale > 1.0
479
+
480
+ # 3. Encode input prompt
481
+ prompt_embeds = self._encode_prompt(
482
+ prompt,
483
+ num_waveforms_per_prompt,
484
+ do_classifier_free_guidance,
485
+ negative_prompt,
486
+ prompt_embeds=prompt_embeds,
487
+ negative_prompt_embeds=negative_prompt_embeds,
488
+ )
489
+
490
+ # 4. Prepare timesteps
491
+ self.scheduler.set_timesteps(num_inference_steps)
492
+ timesteps = self.scheduler.timesteps
493
+
494
+ # 5. Prepare latent variables
495
+ num_channels_latents = self.unet.config.in_channels
496
+ latents = self.prepare_latents(
497
+ batch_size * num_waveforms_per_prompt,
498
+ num_channels_latents,
499
+ height,
500
+ prompt_embeds.dtype,
501
+ generator,
502
+ latents,
503
+ )
504
+
505
+ # 6. Prepare extra step kwargs
506
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
507
+
508
+ # 7. Denoising loop
509
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
510
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
511
+ for i, t in enumerate(timesteps):
512
+ # expand the latents if we are doing classifier free guidance
513
+ latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
514
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
515
+
516
+ # predict the noise residual
517
+ noise_pred = self.unet(
518
+ latent_model_input,
519
+ t,
520
+ encoder_hidden_states=None,
521
+ class_labels=prompt_embeds,
522
+ cross_attention_kwargs=cross_attention_kwargs,
523
+ ).sample
524
+
525
+ # perform guidance
526
+ if do_classifier_free_guidance:
527
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
528
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
529
+
530
+ # compute the previous noisy sample x_t -> x_t-1
531
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
532
+
533
+ # call the callback, if provided
534
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
535
+ progress_bar.update()
536
+ if callback is not None and i % callback_steps == 0:
537
+ step_idx = i // getattr(self.scheduler, "order", 1)
538
+ callback(step_idx, t, latents)
539
+
540
+ # 8. Post-processing
541
+ mel_spectrogram = self.decode_latents(latents)
542
+
543
+ audio = self.mel_spectrogram_to_waveform(mel_spectrogram)
544
+
545
+ audio = audio[:, :original_waveform_length]
546
+
547
+ if output_type == "np":
548
+ audio = audio.numpy()
549
+
550
+ if not return_dict:
551
+ return (audio,)
552
+
553
+ return AudioPipelineOutput(audios=audio)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Salesforce.com, inc.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.#
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import List, Optional, Union
15
+
16
+ import paddle
17
+ import PIL.Image
18
+
19
+ from ppdiffusers.transformers import CLIPTokenizer
20
+
21
+ from ...models import AutoencoderKL, UNet2DConditionModel
22
+ from ...schedulers import PNDMScheduler
23
+ from ...utils import logging, replace_example_docstring
24
+ from ...utils.paddle_utils import randn_tensor
25
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
26
+ from .blip_image_processing import BlipImageProcessor
27
+ from .modeling_blip2 import Blip2QFormerModel
28
+ from .modeling_ctx_clip import ContextCLIPTextModel
29
+
30
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
31
+
32
+ EXAMPLE_DOC_STRING = """
33
+ Examples:
34
+ ```py
35
+ >>> from ppdiffusers.pipelines import BlipDiffusionPipeline
36
+ >>> from ppdiffusers.utils import load_image
37
+ >>> import paddle
38
+
39
+ >>> blip_diffusion_pipe = BlipDiffusionPipeline.from_pretrained(
40
+ ... "Salesforce/blipdiffusion", paddle_dtype=paddle.float16
41
+ ... )
42
+
43
+
44
+ >>> cond_subject = "dog"
45
+ >>> tgt_subject = "dog"
46
+ >>> text_prompt_input = "swimming underwater"
47
+
48
+ >>> cond_image = load_image(
49
+ ... "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/dog.jpg"
50
+ ... )
51
+ >>> guidance_scale = 7.5
52
+ >>> num_inference_steps = 25
53
+ >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate"
54
+
55
+
56
+ >>> output = blip_diffusion_pipe(
57
+ ... text_prompt_input,
58
+ ... cond_image,
59
+ ... cond_subject,
60
+ ... tgt_subject,
61
+ ... guidance_scale=guidance_scale,
62
+ ... num_inference_steps=num_inference_steps,
63
+ ... neg_prompt=negative_prompt,
64
+ ... height=512,
65
+ ... width=512,
66
+ ... ).images
67
+ >>> output[0].save("image.png")
68
+ ```
69
+ """
70
+
71
+
72
class BlipDiffusionPipeline(DiffusionPipeline):
    """
    Pipeline for Zero-Shot Subject Driven Generation using Blip Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        tokenizer ([`CLIPTokenizer`]):
            Tokenizer for the text encoder
        text_encoder ([`ContextCLIPTextModel`]):
            Text encoder to encode the text prompt
        vae ([`AutoencoderKL`]):
            VAE model to map the latents to the image
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        scheduler ([`PNDMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        qformer ([`Blip2QFormerModel`]):
            QFormer model to get multi-modal embeddings from the text and image.
        image_processor ([`BlipImageProcessor`]):
            Image Processor to preprocess and postprocess the image.
        ctx_begin_pos (int, `optional`, defaults to 2):
            Position of the context token in the text encoder.
        mean (`List[float]`, `optional`):
            Passed to the image processor as `image_mean` when preprocessing the reference image.
        std (`List[float]`, `optional`):
            Passed to the image processor as `image_std` when preprocessing the reference image.
    """

    model_cpu_offload_seq = "qformer->text_encoder->unet->vae"

    def __init__(
        self,
        tokenizer: CLIPTokenizer,
        text_encoder: ContextCLIPTextModel,
        vae: AutoencoderKL,
        unet: UNet2DConditionModel,
        scheduler: PNDMScheduler,
        qformer: Blip2QFormerModel,
        image_processor: BlipImageProcessor,
        ctx_begin_pos: int = 2,
        mean: List[float] = None,
        std: List[float] = None,
    ):
        super().__init__()

        self.register_modules(
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            vae=vae,
            unet=unet,
            scheduler=scheduler,
            qformer=qformer,
            image_processor=image_processor,
        )
        self.register_to_config(ctx_begin_pos=ctx_begin_pos, mean=mean, std=std)

    def get_query_embeddings(self, input_image, src_subject):
        """Run the QFormer on the reference image and the source subject text and return its output."""
        return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False)

    # from the original Blip Diffusion code, specifies the target subject and augments the prompt by repeating it
    def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20):
        """
        Prefix every prompt with its target subject and repeat it to amplify it.

        Each result is `"a {tgt_subject} {prompt}"` joined with `", "` `int(prompt_strength * prompt_reps)` times.
        `prompts` and `tgt_subjects` are paired with `zip`, so extra items of the longer one are ignored.
        """
        rv = []
        for prompt, tgt_subject in zip(prompts, tgt_subjects):
            prompt = f"a {tgt_subject} {prompt.strip()}"
            # a trick to amplify the prompt
            rv.append(", ".join([prompt] * int(prompt_strength * prompt_reps)))

        return rv

    # Copied from ppdiffusers.pipelines.consistency_models.pipeline_consistency_models.ConsistencyModelPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels, height, width, dtype, generator, latents=None):
        shape = (batch_size, num_channels, height, width)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, dtype=dtype)
        else:
            latents = latents.cast(dtype=dtype)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def encode_prompt(self, query_embeds, prompt):
        """
        Encode `prompt` with the text encoder, using `query_embeds` as context embeddings.

        The prompt is padded/truncated to the text encoder's `max_position_embeddings` minus the number of
        QFormer query tokens. Returns the first element of the text encoder output.
        """
        # embeddings for prompt, with query_embeds as context
        max_len = self.text_encoder.text_model.config.max_position_embeddings
        max_len -= self.qformer.config.num_query_tokens

        tokenized_prompt = self.tokenizer(
            prompt,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pd",
        )

        # One context start position per sample in the batch.
        batch_size = query_embeds.shape[0]
        ctx_begin_pos = [self.config.ctx_begin_pos] * batch_size

        text_embeddings = self.text_encoder(
            input_ids=tokenized_prompt.input_ids,
            ctx_embeddings=query_embeds,
            ctx_begin_pos=ctx_begin_pos,
        )[0]

        return text_embeddings

    @paddle.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: List[str],
        reference_image: PIL.Image.Image,
        source_subject_category: List[str],
        target_subject_category: List[str],
        latents: Optional[paddle.Tensor] = None,
        guidance_scale: float = 7.5,
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
        neg_prompt: Optional[str] = "",
        prompt_strength: float = 1.0,
        prompt_reps: int = 20,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ):
        """
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`List[str]`):
                The prompt or prompts to guide the image generation. A single `str` is also accepted.
            reference_image (`PIL.Image.Image`):
                The reference image to condition the generation on.
            source_subject_category (`List[str]`):
                The source subject category. A single `str` is also accepted.
            target_subject_category (`List[str]`):
                The target subject category. A single `str` is also accepted.
            latents (`paddle.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by random sampling.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            height (`int`, *optional*, defaults to 512):
                The height of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
                One or a list of [paddle generator(s)] to make generation deterministic.
            neg_prompt (`str`, *optional*, defaults to ""):
                The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is not greater than `1`). It is truncated to the text encoder's maximum length.
            prompt_strength (`float`, *optional*, defaults to 1.0):
                The strength of the prompt. Specifies the number of times the prompt is repeated along with prompt_reps
                to amplify the prompt.
            prompt_reps (`int`, *optional*, defaults to 20):
                The number of times the prompt is repeated along with prompt_strength to amplify the prompt.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pd"` (`paddle.Tensor`).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        """

        reference_image = self.image_processor.preprocess(
            reference_image, image_mean=self.config.mean, image_std=self.config.std, return_tensors="pd"
        )["pixel_values"]

        # Normalize single strings to one-element lists so the rest of the method can assume batches.
        if isinstance(prompt, str):
            prompt = [prompt]
        if isinstance(source_subject_category, str):
            source_subject_category = [source_subject_category]
        if isinstance(target_subject_category, str):
            target_subject_category = [target_subject_category]

        batch_size = len(prompt)

        prompt = self._build_prompt(
            prompts=prompt,
            tgt_subjects=target_subject_category,
            prompt_strength=prompt_strength,
            prompt_reps=prompt_reps,
        )
        query_embeds = self.get_query_embeddings(reference_image, source_subject_category)
        text_embeddings = self.encode_prompt(query_embeds, prompt)

        # Computed once: it does not change during denoising.
        do_classifier_free_guidance = guidance_scale > 1.0
        if do_classifier_free_guidance:
            max_length = self.text_encoder.text_model.config.max_position_embeddings

            # Truncate like the positive prompt in `encode_prompt`, so an over-long negative prompt cannot
            # yield more tokens than `max_length`.
            uncond_input = self.tokenizer(
                [neg_prompt] * batch_size,
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pd",
            )
            uncond_embeddings = self.text_encoder(
                input_ids=uncond_input.input_ids,
                ctx_embeddings=None,
            )[0]
            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            text_embeddings = paddle.concat([uncond_embeddings, text_embeddings])

        scale_down_factor = 2 ** (len(self.unet.config.block_out_channels) - 1)
        latents = self.prepare_latents(
            batch_size=batch_size,
            num_channels=self.unet.config.in_channels,
            height=height // scale_down_factor,
            width=width // scale_down_factor,
            generator=generator,
            latents=latents,
            dtype=self.unet.dtype,
        )
        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps)

        for t in self.progress_bar(self.scheduler.timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents

            noise_pred = self.unet(
                latent_model_input,
                timestep=t,
                encoder_hidden_states=text_embeddings,
                down_block_additional_residuals=None,
                mid_block_additional_residual=None,
            )["sample"]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            latents = self.scheduler.step(
                noise_pred,
                t,
                latents,
            )["prev_sample"]

        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        image = self.image_processor.postprocess(image, output_type=output_type)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/__init__.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import TYPE_CHECKING
17
+
18
+ from ...utils import (
19
+ PPDIFFUSERS_SLOW_IMPORT,
20
+ OptionalDependencyNotAvailable,
21
+ _LazyModule,
22
+ get_objects_from_module,
23
+ is_fastdeploy_available,
24
+ is_paddle_available,
25
+ is_paddlenlp_available,
26
+ )
27
+
28
+ _dummy_objects = {}
29
+ _import_structure = {}
30
+
31
+ try:
32
+ if not (is_paddlenlp_available() and is_paddle_available()):
33
+ raise OptionalDependencyNotAvailable()
34
+ except OptionalDependencyNotAvailable:
35
+ from ...utils import dummy_paddle_and_paddlenlp_objects # noqa F403
36
+
37
+ _dummy_objects.update(get_objects_from_module(dummy_paddle_and_paddlenlp_objects))
38
+ else:
39
+ _import_structure["multicontrolnet"] = ["MultiControlNetModel"]
40
+ _import_structure["pipeline_controlnet"] = ["StableDiffusionControlNetPipeline"]
41
+ _import_structure["pipeline_controlnet_blip_diffusion"] = ["BlipDiffusionControlNetPipeline"]
42
+ _import_structure["pipeline_controlnet_img2img"] = ["StableDiffusionControlNetImg2ImgPipeline"]
43
+ _import_structure["pipeline_controlnet_inpaint"] = ["StableDiffusionControlNetInpaintPipeline"]
44
+ _import_structure["pipeline_controlnet_inpaint_sd_xl"] = ["StableDiffusionXLControlNetInpaintPipeline"]
45
+ _import_structure["pipeline_controlnet_sd_xl"] = ["StableDiffusionXLControlNetPipeline"]
46
+ _import_structure["pipeline_controlnet_sd_xl_img2img"] = ["StableDiffusionXLControlNetImg2ImgPipeline"]
47
+ try:
48
+ if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()):
49
+ raise OptionalDependencyNotAvailable()
50
+ except OptionalDependencyNotAvailable:
51
+ from ...utils import dummy_fastdeploy_objects # noqa F403
52
+
53
+ _dummy_objects.update(get_objects_from_module(dummy_fastdeploy_objects))
54
+ else:
55
+ _import_structure["pipeline_fastdeploy_stable_diffusion_controlnet"] = [
56
+ "FastDeployStableDiffusionControlNetPipeline"
57
+ ]
58
+
59
+ _import_structure["pipeline_paddleinfer_stable_diffusion_controlnet"] = [
60
+ "PaddleInferStableDiffusionControlNetPipeline",
61
+ ]
62
+
63
+
64
+ if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
65
+ try:
66
+ if not (is_paddlenlp_available() and is_paddle_available()):
67
+ raise OptionalDependencyNotAvailable()
68
+
69
+ except OptionalDependencyNotAvailable:
70
+ from ...utils.dummy_paddle_and_paddlenlp_objects import *
71
+ else:
72
+ from .multicontrolnet import MultiControlNetModel
73
+ from .pipeline_controlnet import StableDiffusionControlNetPipeline
74
+ from .pipeline_controlnet_blip_diffusion import BlipDiffusionControlNetPipeline
75
+ from .pipeline_controlnet_img2img import (
76
+ StableDiffusionControlNetImg2ImgPipeline,
77
+ )
78
+ from .pipeline_controlnet_inpaint import (
79
+ StableDiffusionControlNetInpaintPipeline,
80
+ )
81
+ from .pipeline_controlnet_inpaint_sd_xl import (
82
+ StableDiffusionXLControlNetInpaintPipeline,
83
+ )
84
+ from .pipeline_controlnet_sd_xl import StableDiffusionXLControlNetPipeline
85
+ from .pipeline_controlnet_sd_xl_img2img import (
86
+ StableDiffusionXLControlNetImg2ImgPipeline,
87
+ )
88
+
89
+ try:
90
+ if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()):
91
+ raise OptionalDependencyNotAvailable()
92
+ except OptionalDependencyNotAvailable:
93
+ from ...utils.dummy_fastdeploy_objects import * # noqa F403
94
+ else:
95
+ from .pipeline_fastdeploy_stable_diffusion_controlnet import (
96
+ FastDeployStableDiffusionControlNetPipeline,
97
+ )
98
+
99
+ from .pipeline_paddleinfer_stable_diffusion_controlnet import (
100
+ PaddleInferStableDiffusionControlNetPipeline,
101
+ )
102
+
103
+ else:
104
+ import sys
105
+
106
+ sys.modules[__name__] = _LazyModule(
107
+ __name__,
108
+ globals()["__file__"],
109
+ _import_structure,
110
+ module_spec=__spec__,
111
+ )
112
+ for name, value in _dummy_objects.items():
113
+ setattr(sys.modules[__name__], name, value)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/multicontrolnet.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import paddle
19
+ from paddle import nn
20
+
21
+ from ...models.controlnet import ControlNetModel, ControlNetOutput
22
+ from ...models.modeling_utils import ModelMixin
23
+ from ...utils import logging
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
class MultiControlNetModel(ModelMixin):
    r"""
    Multiple `ControlNetModel` wrapper class for Multi-ControlNet

    This module is a wrapper for multiple instances of the `ControlNetModel`. The `forward()` API is designed to be
    compatible with `ControlNetModel`.

    Args:
        controlnets (`List[ControlNetModel]`):
            Provides additional conditioning to the unet during the denoising process. You must set multiple
            `ControlNetModel` as a list.
    """

    def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]):
        super().__init__()
        # Held in a LayerList so the wrapped nets are tracked as sub-layers of this module.
        self.nets = nn.LayerList(controlnets)

    def forward(
        self,
        sample: paddle.Tensor,
        timestep: Union[paddle.Tensor, float, int],
        encoder_hidden_states: paddle.Tensor,
        controlnet_cond: List[paddle.Tensor],
        conditioning_scale: List[float],
        class_labels: Optional[paddle.Tensor] = None,
        timestep_cond: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        added_cond_kwargs: Optional[Dict[str, paddle.Tensor]] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guess_mode: bool = False,
        return_dict: bool = True,
    ) -> Union[ControlNetOutput, Tuple]:
        """
        Run every wrapped ControlNet on the same `sample` and sum their residuals.

        `controlnet_cond`, `conditioning_scale` and `self.nets` are zipped together, so the i-th conditioning
        input and the i-th scale are given to the i-th ControlNet; iteration stops at the shortest of the three.
        All remaining arguments are forwarded unchanged to each ControlNet.

        Returns:
            A 2-tuple `(down_block_res_samples, mid_block_res_sample)` holding the element-wise sum of the
            per-net outputs.
        """
        # NOTE(review): each net's result is unpacked as a 2-tuple, which presumably requires callers to pass
        # `return_dict=False` -- confirm against `ControlNetModel.forward`.
        for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
            down_samples, mid_sample = controlnet(
                sample=sample,
                timestep=timestep,
                encoder_hidden_states=encoder_hidden_states,
                controlnet_cond=image,
                conditioning_scale=scale,
                class_labels=class_labels,
                timestep_cond=timestep_cond,
                attention_mask=attention_mask,
                added_cond_kwargs=added_cond_kwargs,
                cross_attention_kwargs=cross_attention_kwargs,
                guess_mode=guess_mode,
                return_dict=return_dict,
            )

            # merge samples
            if i == 0:
                down_block_res_samples, mid_block_res_sample = down_samples, mid_sample
            else:
                down_block_res_samples = [
                    samples_prev + samples_curr
                    for samples_prev, samples_curr in zip(down_block_res_samples, down_samples)
                ]
                mid_block_res_sample += mid_sample

        return down_block_res_samples, mid_block_res_sample

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        is_main_process: bool = True,
        save_function: Optional[Callable] = None,
        safe_serialization: bool = True,
        variant: Optional[str] = None,
    ):
        """
        Save every wrapped ControlNet to its own directory, so that the set can be re-loaded using the
        `[`~pipelines.controlnet.MultiControlNetModel.from_pretrained`]` class method.

        The first ControlNet is written to `save_directory`; the k-th one (k >= 1) is written to
        `f"{save_directory}_{k}"`. This is exactly the sequence of directories that `from_pretrained` scans.

        Arguments:
            save_directory (`str` or `os.PathLike`):
                Directory to which to save. Will be created if it doesn't exist.
            is_main_process (`bool`, *optional*, defaults to `True`):
                Whether the process calling this is the main process or not. Useful when in distributed training like
                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
                the main process to avoid race conditions.
            save_function (`Callable`):
                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
                need to replace `paddle.save` by another method. Can be configured with the environment variable
                `PPDIFFUSERS_SAVE_MODE`.
            safe_serialization (`bool`, *optional*, defaults to `True`):
                Forwarded to each ControlNet's `save_pretrained`; selects `safetensors` serialization over the
                pickle-based one.
            variant (`str`, *optional*):
                Forwarded to each ControlNet's `save_pretrained` to select the weight-file variant name.
        """
        # Normalise once so that `os.PathLike` inputs can be suffixed with plain string concatenation.
        base_dir = os.fspath(save_directory)
        for idx, controlnet in enumerate(self.nets):
            # The suffix is always derived from the *base* directory. Appending to the previously suffixed
            # path would yield `dir_1_2`, `dir_1_2_3`, ... which `from_pretrained` never looks for.
            suffix = "" if idx == 0 else f"_{idx}"
            controlnet.save_pretrained(
                base_dir + suffix,
                is_main_process=is_main_process,
                save_function=save_function,
                safe_serialization=safe_serialization,
                variant=variant,
            )

    @classmethod
    def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs):
        r"""
        Instantiate a pretrained MultiControlNet model from multiple pre-trained controlnet models.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
        the model, you should first set it back in training mode with `model.train()`.

        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
        task.

        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
        weights are discarded.

        Parameters:
            pretrained_model_path (`os.PathLike`):
                A path to a *directory* containing model weights saved using
                [`~ppdiffusers.pipelines.controlnet.MultiControlNetModel.save_pretrained`], e.g.,
                `./my_model_directory/controlnet`.
            paddle_dtype (`str` or `paddle.dtype`, *optional*):
                Override the default `paddle.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
                will be automatically derived from the model's weights.
            output_loading_info(`bool`, *optional*, defaults to `False`):
                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
            max_memory (`Dict`, *optional*):
                A dictionary device identifier to maximum memory. Will default to the maximum memory available for each
                GPU and the available CPU RAM if unset.
            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if paddle version >= 2.5.2 else `False`):
                Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
                model. This is only supported when paddle version >= 2.5.2. If you are using an older version of paddle,
                setting this argument to `True` will raise an error.
            variant (`str`, *optional*):
                If specified load weights from `variant` filename, *e.g.* model_state.<variant>.pdparams.
            use_safetensors (`bool`, *optional*, defaults to `None`):
                If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the
                `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from
                `safetensors` weights. If set to `False`, loading will *not* use `safetensors`.

        Raises:
            ValueError: If `pretrained_model_path` is not an existing directory, i.e. no ControlNet was found.
        """
        # Normalise once so that `os.PathLike` inputs can be suffixed with plain string concatenation.
        base_path = os.fspath(pretrained_model_path)
        controlnets = []

        # load controlnet and append to list until no controlnet directory exists anymore
        # first controlnet has to be saved under `./mydirectory/controlnet` to be compliant with `DiffusionPipeline.from_pretrained`
        # second, third, ... controlnets have to be saved under `./mydirectory/controlnet_1`, `./mydirectory/controlnet_2`, ...
        idx = 0
        model_path_to_load = base_path
        while os.path.isdir(model_path_to_load):
            controlnets.append(ControlNetModel.from_pretrained(model_path_to_load, **kwargs))

            idx += 1
            model_path_to_load = base_path + f"_{idx}"

        if not controlnets:
            # The first ControlNet lives at `base_path` itself (there is no `_0` suffix).
            raise ValueError(
                f"No ControlNets found under {os.path.dirname(base_path)}. Expected at least {base_path}."
            )

        logger.info(f"{len(controlnets)} controlnets loaded from {pretrained_model_path}.")

        return cls(controlnets)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet.py ADDED
@@ -0,0 +1,1159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import inspect
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import paddle
21
+ import PIL.Image
22
+
23
+ from ppdiffusers.transformers import (
24
+ CLIPImageProcessor,
25
+ CLIPTextModel,
26
+ CLIPTokenizer,
27
+ CLIPVisionModelWithProjection,
28
+ )
29
+
30
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
31
+ from ...loaders import (
32
+ FromSingleFileMixin,
33
+ IPAdapterMixin,
34
+ LoraLoaderMixin,
35
+ TextualInversionLoaderMixin,
36
+ )
37
+ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
38
+ from ...models.lora import adjust_lora_scale_text_encoder
39
+ from ...schedulers import KarrasDiffusionSchedulers
40
+ from ...utils import USE_PEFT_BACKEND, deprecate, logging, replace_example_docstring
41
+ from ...utils.paddle_utils import randn_tensor
42
+ from ..pipeline_utils import DiffusionPipeline
43
+ from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
44
+ from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
45
+ from .multicontrolnet import MultiControlNetModel
46
+
47
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
48
+
49
+
50
+ EXAMPLE_DOC_STRING = """
51
+ Examples:
52
+ ```py
53
+ >>> # !pip install opencv-python transformers accelerate
54
+ >>> from ppdiffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
55
+ >>> from ppdiffusers.utils import load_image
56
+ >>> import numpy as np
57
+ >>> import paddle
58
+
59
+ >>> import cv2
60
+ >>> from PIL import Image
61
+
62
+ >>> # download an image
63
+ >>> image = load_image(
64
+ ... "https://hf-mirror.com/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
65
+ ... )
66
+ >>> image = np.array(image)
67
+
68
+ >>> # get canny image
69
+ >>> image = cv2.Canny(image, 100, 200)
70
+ >>> image = image[:, :, None]
71
+ >>> image = np.concatenate([image, image, image], axis=2)
72
+ >>> canny_image = Image.fromarray(image)
73
+
74
+ >>> # load control net and stable diffusion v1-5
75
+ >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", paddle_dtype=paddle.float16)
76
+ >>> pipe = StableDiffusionControlNetPipeline.from_pretrained(
77
+ ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, paddle_dtype=paddle.float16
78
+ ... )
79
+
80
+ >>> # speed up diffusion process with faster scheduler and memory optimization
81
+ >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
82
+ >>> # remove following line if xformers is not installed
83
+ >>> pipe.enable_xformers_memory_efficient_attention()
84
+
85
+
86
+ >>> # generate image
87
+ >>> generator = paddle.Generator().manual_seed(0)
88
+ >>> image = pipe(
89
+ ... "futuristic-looking woman", num_inference_steps=20, generator=generator, image=canny_image
90
+ ... ).images[0]
91
+ ```
92
+ """
93
+
94
+
95
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
96
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    timesteps: Optional[List[int]] = None,
    **kwargs,
):
    """
    Configure the scheduler's timestep schedule via `set_timesteps` and read it back.

    Either a step count or an explicit list of timesteps drives the schedule. Extra keyword arguments are
    passed straight through to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler whose schedule is configured and returned.
        num_inference_steps (`int`):
            Number of diffusion steps to generate a schedule for. Used only when `timesteps` is `None`.
        timesteps (`List[int]`, *optional*):
            Custom timesteps with arbitrary spacing. When given, they take precedence and the returned step
            count is the length of the schedule the scheduler produced from them.

    Returns:
        `Tuple[paddle.Tensor, int]`: The scheduler's `timesteps` attribute after the call, and the number of
        inference steps.

    Raises:
        ValueError: If custom `timesteps` are given but `scheduler.set_timesteps` has no `timesteps` parameter.
    """
    # Default path: let the scheduler derive its own spacing from the step count.
    if timesteps is None:
        scheduler.set_timesteps(num_inference_steps, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom path: only schedulers whose `set_timesteps` exposes a `timesteps` parameter qualify.
    accepted_params = inspect.signature(scheduler.set_timesteps).parameters
    if "timesteps" not in accepted_params:
        raise ValueError(
            f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
            f" timestep schedules. Please check whether you are using the correct scheduler."
        )

    scheduler.set_timesteps(timesteps=timesteps, **kwargs)
    schedule = scheduler.timesteps
    return schedule, len(schedule)
135
+
136
+
137
+ class StableDiffusionControlNetPipeline(
138
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
139
+ ):
140
+ r"""
141
+ Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
142
+
143
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
144
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
145
+
146
+ The pipeline also inherits the following loading methods:
147
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
148
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
149
+
150
+ Args:
151
+ vae ([`AutoencoderKL`]):
152
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
153
+ text_encoder ([`~transformers.CLIPTextModel`]):
154
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
155
+ tokenizer ([`~transformers.CLIPTokenizer`]):
156
+ A `CLIPTokenizer` to tokenize text.
157
+ unet ([`UNet2DConditionModel`]):
158
+ A `UNet2DConditionModel` to denoise the encoded image latents.
159
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
160
+ Provides additional conditioning to the `unet` during the denoising process. If you set multiple
161
+ ControlNets as a list, the outputs from each ControlNet are added together to create one combined
162
+ additional conditioning.
163
+ scheduler ([`SchedulerMixin`]):
164
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
165
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
166
+ safety_checker ([`StableDiffusionSafetyChecker`]):
167
+ Classification module that estimates whether generated images could be considered offensive or harmful.
168
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
169
+ about a model's potential harms.
170
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
171
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
172
+ """
173
+
174
+ model_cpu_offload_seq = "text_encoder->unet->vae"
175
+ _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
176
+ _exclude_from_cpu_offload = ["safety_checker"]
177
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
178
+
179
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
    scheduler: KarrasDiffusionSchedulers,
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPImageProcessor,
    image_encoder: CLIPVisionModelWithProjection = None,
    requires_safety_checker: bool = True,
):
    """
    Register the pipeline components and build the image processors.

    A `list`/`tuple` of ControlNets is wrapped in a single `MultiControlNetModel` before registration.

    Raises:
        ValueError: If a `safety_checker` is supplied without a `feature_extractor`.
    """
    super().__init__()

    # Disabling the checker is allowed, but warn when the config says one is required.
    if safety_checker is None and requires_safety_checker:
        logger.warning(
            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
        )

    if safety_checker is not None and feature_extractor is None:
        # The first fragment must be an f-string, otherwise `{self.__class__}` is emitted verbatim.
        raise ValueError(
            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
        )

    # Normalise a plain sequence of ControlNets into the multi-net wrapper.
    if isinstance(controlnet, (list, tuple)):
        controlnet = MultiControlNetModel(controlnet)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        controlnet=controlnet,
        scheduler=scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
        image_encoder=image_encoder,
    )
    # Scale factor is 2 ** (number of VAE block_out_channels entries - 1).
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
    # The control-image processor is configured identically except that normalisation is switched off.
    self.control_image_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
    )
    self.register_to_config(requires_safety_checker=requires_safety_checker)
230
+
231
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
232
def _encode_prompt(
    self,
    prompt,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    lora_scale: Optional[float] = None,
    **kwargs,
):
    """
    Deprecated shim around `encode_prompt()` that returns one concatenated tensor.

    Emits a deprecation notice, forwards every argument to `encode_prompt()`, then concatenates element 1 of
    the returned tuple followed by element 0 to reproduce the legacy single-tensor output.
    """
    message = (
        "`_encode_prompt()` is deprecated and it will be removed in a future version."
        " Use `encode_prompt()` instead."
        " Also, be aware that the output format changed from a concatenated tensor to a tuple."
    )
    deprecate("_encode_prompt()", "1.0.0", message, standard_warn=False)

    encoded = self.encode_prompt(
        prompt=prompt,
        num_images_per_prompt=num_images_per_prompt,
        do_classifier_free_guidance=do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=lora_scale,
        **kwargs,
    )

    # Legacy layout for backwards compatibility: tuple element 1 first, then element 0.
    # NOTE(review): presumably (negative, positive) embeddings -- confirm against `encode_prompt`'s return.
    leading_part = encoded[1]
    trailing_part = encoded[0]
    return paddle.concat([leading_part, trailing_part])
261
+
262
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
263
+ def encode_prompt(
264
+ self,
265
+ prompt,
266
+ num_images_per_prompt,
267
+ do_classifier_free_guidance,
268
+ negative_prompt=None,
269
+ prompt_embeds: Optional[paddle.Tensor] = None,
270
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
271
+ lora_scale: Optional[float] = None,
272
+ clip_skip: Optional[int] = None,
273
+ ):
274
+ r"""
275
+ Encodes the prompt into text encoder hidden states.
276
+
277
+ Args:
278
+ prompt (`str` or `List[str]`, *optional*):
279
+ prompt to be encoded
280
+ num_images_per_prompt (`int`):
281
+ number of images that should be generated per prompt
282
+ do_classifier_free_guidance (`bool`):
283
+ whether to use classifier free guidance or not
284
+ negative_prompt (`str` or `List[str]`, *optional*):
285
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
286
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
287
+ less than `1`).
288
+ prompt_embeds (`paddle.Tensor`, *optional*):
289
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
290
+ provided, text embeddings will be generated from `prompt` input argument.
291
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
292
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
293
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
294
+ argument.
295
+ lora_scale (`float`, *optional*):
296
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
297
+ clip_skip (`int`, *optional*):
298
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
299
+ the output of the pre-final layer will be used for computing the prompt embeddings.
300
+ """
301
+ # set lora scale so that monkey patched LoRA
302
+ # function of text encoder can correctly access it
303
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
304
+ self._lora_scale = lora_scale
305
+
306
+ # dynamically adjust the LoRA scale
307
+ if not USE_PEFT_BACKEND:
308
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
309
+
310
+ if prompt is not None and isinstance(prompt, str):
311
+ batch_size = 1
312
+ elif prompt is not None and isinstance(prompt, list):
313
+ batch_size = len(prompt)
314
+ else:
315
+ batch_size = prompt_embeds.shape[0]
316
+
317
+ if prompt_embeds is None:
318
+ # textual inversion: process multi-vector tokens if necessary
319
+ if isinstance(self, TextualInversionLoaderMixin):
320
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
321
+
322
+ text_inputs = self.tokenizer(
323
+ prompt,
324
+ padding="max_length",
325
+ max_length=self.tokenizer.model_max_length,
326
+ truncation=True,
327
+ return_tensors="pd",
328
+ )
329
+ text_input_ids = text_inputs.input_ids
330
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
331
+
332
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
333
+ text_input_ids, untruncated_ids
334
+ ):
335
+ removed_text = self.tokenizer.batch_decode(
336
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
337
+ )
338
+ logger.warning(
339
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
340
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
341
+ )
342
+
343
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
344
+ attention_mask = text_inputs.attention_mask
345
+ else:
346
+ attention_mask = None
347
+
348
+ if clip_skip is None:
349
+ prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
350
+ prompt_embeds = prompt_embeds[0]
351
+ else:
352
+ prompt_embeds = self.text_encoder(
353
+ text_input_ids, attention_mask=attention_mask, output_hidden_states=True
354
+ )
355
+ # Access the `hidden_states` first, that contains a tuple of
356
+ # all the hidden states from the encoder layers. Then index into
357
+ # the tuple to access the hidden states from the desired layer.
358
+ prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
359
+ # We also need to apply the final LayerNorm here to not mess with the
360
+ # representations. The `last_hidden_states` that we typically use for
361
+ # obtaining the final prompt representations passes through the LayerNorm
362
+ # layer.
363
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
364
+
365
+ if self.text_encoder is not None:
366
+ prompt_embeds_dtype = self.text_encoder.dtype
367
+ elif self.unet is not None:
368
+ prompt_embeds_dtype = self.unet.dtype
369
+ else:
370
+ prompt_embeds_dtype = prompt_embeds.dtype
371
+
372
+ prompt_embeds = prompt_embeds.cast(dtype=prompt_embeds_dtype)
373
+
374
+ bs_embed, seq_len, _ = prompt_embeds.shape
375
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
376
+ prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
377
+ prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
378
+
379
+ # get unconditional embeddings for classifier free guidance
380
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
381
+ uncond_tokens: List[str]
382
+ if negative_prompt is None:
383
+ uncond_tokens = [""] * batch_size
384
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
385
+ raise TypeError(
386
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
387
+ f" {type(prompt)}."
388
+ )
389
+ elif isinstance(negative_prompt, str):
390
+ uncond_tokens = [negative_prompt]
391
+ elif batch_size != len(negative_prompt):
392
+ raise ValueError(
393
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
394
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
395
+ " the batch size of `prompt`."
396
+ )
397
+ else:
398
+ uncond_tokens = negative_prompt
399
+
400
+ # textual inversion: process multi-vector tokens if necessary
401
+ if isinstance(self, TextualInversionLoaderMixin):
402
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
403
+
404
+ max_length = prompt_embeds.shape[1]
405
+ uncond_input = self.tokenizer(
406
+ uncond_tokens,
407
+ padding="max_length",
408
+ max_length=max_length,
409
+ truncation=True,
410
+ return_tensors="pd",
411
+ )
412
+
413
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
414
+ attention_mask = uncond_input.attention_mask
415
+ else:
416
+ attention_mask = None
417
+
418
+ negative_prompt_embeds = self.text_encoder(
419
+ uncond_input.input_ids,
420
+ attention_mask=attention_mask,
421
+ )
422
+ negative_prompt_embeds = negative_prompt_embeds[0]
423
+
424
+ if do_classifier_free_guidance:
425
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
426
+ seq_len = negative_prompt_embeds.shape[1]
427
+
428
+ negative_prompt_embeds = negative_prompt_embeds.cast(dtype=prompt_embeds_dtype)
429
+
430
+ negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
431
+ negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
432
+
433
+ return prompt_embeds, negative_prompt_embeds
434
+
435
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
436
def encode_image(self, image, num_images_per_prompt):
    """Encode an image with `self.image_encoder` and build matching all-zero embeddings.

    Args:
        image: A `paddle.Tensor`, or any input accepted by `self.feature_extractor`
            (non-tensor inputs are converted to pixel values first).
        num_images_per_prompt: Number of times each embedding row is repeated along axis 0.

    Returns:
        A tuple `(image_embeds, uncond_image_embeds)`; the second element is a zero tensor
        with the same shape and dtype as the first.
    """
    # Use the dtype of the encoder's first parameter so the input matches the weights.
    encoder_dtype = next(self.image_encoder.named_parameters())[1].dtype

    if isinstance(image, paddle.Tensor):
        pixel_values = image
    else:
        pixel_values = self.feature_extractor(image, return_tensors="pd").pixel_values

    pixel_values = pixel_values.cast(dtype=encoder_dtype)
    cond_embeds = self.image_encoder(pixel_values).image_embeds
    cond_embeds = cond_embeds.repeat_interleave(num_images_per_prompt, axis=0)

    return cond_embeds, paddle.zeros_like(cond_embeds)
448
+
449
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
450
def run_safety_checker(self, image, dtype):
    """Run the optional safety checker over generated images.

    Args:
        image: Generated images, either a paddle tensor or an array accepted by
            `self.image_processor.numpy_to_pil`.
        dtype: dtype the feature-extractor pixel values are cast to before the check.

    Returns:
        A tuple `(image, has_nsfw_concept)`. When no safety checker is configured the
        input image is returned unchanged together with `None`.
    """
    if self.safety_checker is None:
        return image, None

    # The feature extractor is fed PIL images, so convert from whichever form we were given.
    if paddle.is_tensor(x=image):
        pil_images = self.image_processor.postprocess(image, output_type="pil")
    else:
        pil_images = self.image_processor.numpy_to_pil(image)

    checker_inputs = self.feature_extractor(pil_images, return_tensors="pd")
    checked_image, nsfw_flags = self.safety_checker(
        images=image, clip_input=checker_inputs.pixel_values.cast(dtype)
    )
    return checked_image, nsfw_flags
463
+
464
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
465
def decode_latents(self, latents):
    """Decode latents with the VAE into a float32 numpy array (deprecated).

    Emits a deprecation notice, rescales the latents by the inverse of
    `self.vae.config.scaling_factor`, decodes them, maps the result from
    `image / 2 + 0.5` into the clipped range [0, 1], and moves axis 1 to the last position.

    Args:
        latents: Latent tensor to decode.

    Returns:
        The decoded images as a numpy array.
    """
    deprecate(
        "decode_latents",
        "1.0.0",
        "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead",
        standard_warn=False,
    )

    unscaled = 1 / self.vae.config.scaling_factor * latents
    decoded = self.vae.decode(unscaled, return_dict=False)[0]
    decoded = (decoded / 2 + 0.5).clip(0, 1)
    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
    return decoded.cast("float32").transpose([0, 2, 3, 1]).cpu().numpy()
475
+
476
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
477
def prepare_extra_step_kwargs(self, generator, eta):
    """Collect the optional keyword arguments that `self.scheduler.step` accepts.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are
    only forwarded when the scheduler's `step` declares a parameter of that name.
    eta (η) corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502) and
    should be between [0, 1].

    Args:
        generator: Random generator(s) to forward to the scheduler, if supported.
        eta: DDIM eta value to forward to the scheduler, if supported.

    Returns:
        A dict containing `"eta"` and/or `"generator"` for the supported parameters.
    """
    step_params = inspect.signature(self.scheduler.step).parameters
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_params}
493
+
494
def check_inputs(
    self,
    prompt,
    image,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    controlnet_conditioning_scale=1.0,
    control_guidance_start=0.0,
    control_guidance_end=1.0,
    callback_on_step_end_tensor_inputs=None,
):
    """Validate the arguments of a pipeline call and raise on the first inconsistency found.

    Args:
        prompt: `str`, `list` or `None`; mutually exclusive with `prompt_embeds`.
        image: Conditioning image(s). Must be a `list` with one entry per ControlNet when
            `self.controlnet` is a `MultiControlNetModel`; each image is validated with
            `self.check_image`.
        callback_steps: `None` or a positive `int`.
        negative_prompt: Mutually exclusive with `negative_prompt_embeds`.
        prompt_embeds: Pre-computed prompt embeddings; mutually exclusive with `prompt`.
        negative_prompt_embeds: Must have the same shape as `prompt_embeds` when both are given.
        controlnet_conditioning_scale: A `float` for a single ControlNet. For multiple
            ControlNets a flat `list` must have one entry per ControlNet.
        control_guidance_start: A number or a list/tuple of numbers, each `>= 0` and
            strictly smaller than the matching end value.
        control_guidance_end: A number or a list/tuple of numbers, each `<= 1.0`.
        callback_on_step_end_tensor_inputs: Names that must all be listed in
            `self._callback_tensor_inputs`.

    Raises:
        ValueError: If a value is out of range or arguments are inconsistent with each other.
        TypeError: If `image` or `controlnet_conditioning_scale` has an unsupported type.
        AssertionError: If `self.controlnet` is neither a `ControlNetModel` nor a
            `MultiControlNetModel`.
    """
    if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if callback_on_step_end_tensor_inputs is not None and not all(
        k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
    ):
        raise ValueError(
            f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )

    # `prompt` needs more sophisticated handling when there are multiple
    # conditionings.
    if isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(prompt, list):
            logger.warning(
                f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
                " prompts. The conditionings will be fixed across the prompts."
            )

    # Check `image`
    if isinstance(self.controlnet, ControlNetModel):
        self.check_image(image, prompt, prompt_embeds)
    elif isinstance(self.controlnet, MultiControlNetModel):
        if not isinstance(image, list):
            raise TypeError("For multiple controlnets: `image` must be type `list`")

        # When `image` is a nested list:
        # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
        elif any(isinstance(i, list) for i in image):
            raise ValueError("A single batch of multiple conditionings is not supported at the moment.")
        elif len(image) != len(self.controlnet.nets):
            raise ValueError(
                f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
            )

        for image_ in image:
            self.check_image(image_, prompt, prompt_embeds)
    else:
        # Raised explicitly (instead of `assert False`) so the check survives `python -O`.
        raise AssertionError(f"Unsupported controlnet type: {type(self.controlnet)}")

    # Check `controlnet_conditioning_scale`
    if isinstance(self.controlnet, ControlNetModel):
        if not isinstance(controlnet_conditioning_scale, float):
            raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
    elif isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(controlnet_conditioning_scale, list):
            if any(isinstance(i, list) for i in controlnet_conditioning_scale):
                raise ValueError("A single batch of multiple conditionings is not supported at the moment.")
            # The length check must live inside the list branch; as a sibling `elif` of the
            # `isinstance(..., list)` test it could never be reached.
            if len(controlnet_conditioning_scale) != len(self.controlnet.nets):
                raise ValueError(
                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
                    " the same length as the number of controlnets"
                )
    else:
        raise AssertionError(f"Unsupported controlnet type: {type(self.controlnet)}")

    if not isinstance(control_guidance_start, (tuple, list)):
        control_guidance_start = [control_guidance_start]

    if not isinstance(control_guidance_end, (tuple, list)):
        control_guidance_end = [control_guidance_end]

    if len(control_guidance_start) != len(control_guidance_end):
        raise ValueError(
            f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
        )

    if isinstance(self.controlnet, MultiControlNetModel):
        if len(control_guidance_start) != len(self.controlnet.nets):
            raise ValueError(
                f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
            )

    for start, end in zip(control_guidance_start, control_guidance_end):
        if start >= end:
            raise ValueError(
                f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
            )
        if start < 0.0:
            raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
        if end > 1.0:
            raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
620
+
621
def check_image(self, image, prompt, prompt_embeds):
    """Validate the type of a conditioning image and its batch size against the prompt.

    Args:
        image: A PIL image, numpy array, paddle tensor, or a non-empty list whose first
            element is one of those types (only the first element is inspected).
        prompt: `str`, `list` or `None`; used to derive the prompt batch size.
        prompt_embeds: Optional embeddings; `shape[0]` is used as the prompt batch size
            when `prompt` is neither a `str` nor a `list`.

    Raises:
        TypeError: If `image` is not one of the supported types (including an empty list).
        ValueError: If the image batch size is neither 1 nor equal to the prompt batch size.
    """
    supported_types = (PIL.Image.Image, paddle.Tensor, np.ndarray)
    image_is_pil = isinstance(image, PIL.Image.Image)
    image_is_single = isinstance(image, supported_types)
    # `len(image) > 0` keeps an empty list from raising IndexError on `image[0]`;
    # it is reported as an unsupported input instead.
    image_is_supported_list = (
        isinstance(image, list) and len(image) > 0 and isinstance(image[0], supported_types)
    )

    if not image_is_single and not image_is_supported_list:
        raise TypeError(
            f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}"
        )

    if image_is_pil:
        image_batch_size = 1
    else:
        image_batch_size = len(image)

    if prompt is not None and isinstance(prompt, str):
        prompt_batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        prompt_batch_size = len(prompt)
    elif prompt_embeds is not None:
        prompt_batch_size = prompt_embeds.shape[0]

    if image_batch_size != 1 and image_batch_size != prompt_batch_size:
        raise ValueError(
            f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
        )
657
+
658
def prepare_image(
    self,
    image,
    width,
    height,
    batch_size,
    num_images_per_prompt,
    dtype,
    do_classifier_free_guidance=False,
    guess_mode=False,
):
    """Preprocess a conditioning image and expand it along axis 0.

    The image is run through `self.control_image_processor.preprocess`, cast to float32,
    repeated along axis 0 and finally cast to `dtype`.

    Args:
        image: Conditioning image input for the control image processor.
        width: Target width forwarded to the preprocessor.
        height: Target height forwarded to the preprocessor.
        batch_size: Repeat count used when the preprocessed image has a batch size of 1.
        num_images_per_prompt: Repeat count used otherwise.
        dtype: dtype of the returned tensor.
        do_classifier_free_guidance: When true (and `guess_mode` is false) the result is
            concatenated with itself.
        guess_mode: Disables the duplication described above.

    Returns:
        The prepared conditioning tensor.
    """
    control = self.control_image_processor.preprocess(image, height=height, width=width)
    control = control.cast(dtype=paddle.float32)

    # A single image is broadcast to the whole batch; otherwise the image batch size is
    # the same as the prompt batch size and only needs repeating per generated image.
    repeat_by = batch_size if control.shape[0] == 1 else num_images_per_prompt
    control = control.repeat_interleave(repeat_by, axis=0)
    control = control.cast(dtype=dtype)

    if not do_classifier_free_guidance or guess_mode:
        return control
    return paddle.concat([control] * 2)
686
+
687
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
688
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
    """Create (or reuse) the initial latents and scale them for the scheduler.

    Args:
        batch_size: Effective batch size of the latents.
        num_channels_latents: Number of latent channels.
        height: Image height; divided by `self.vae_scale_factor` for the latent shape.
        width: Image width; divided by `self.vae_scale_factor` for the latent shape.
        dtype: dtype of the sampled latents, or dtype the supplied latents are cast to.
        generator: A generator or a list of generators; a list must have `batch_size` entries.
        latents: Optional pre-generated latents used instead of sampling new ones.

    Returns:
        The latents multiplied by `self.scheduler.init_noise_sigma`.

    Raises:
        ValueError: If `generator` is a list whose length differs from `batch_size`.
    """
    latent_shape = (
        batch_size,
        num_channels_latents,
        height // self.vae_scale_factor,
        width // self.vae_scale_factor,
    )

    generator_count_mismatch = isinstance(generator, list) and len(generator) != batch_size
    if generator_count_mismatch:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is not None:
        initial = latents.cast(dtype)
    else:
        initial = randn_tensor(latent_shape, generator=generator, dtype=dtype)

    # scale the initial noise by the standard deviation required by the scheduler
    return initial * self.scheduler.init_noise_sigma
704
+
705
+ # Copied from ppdiffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
706
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=paddle.float32):
    """
    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

    Build sinusoidal embeddings (sine half followed by cosine half) for the values in `w`.

    Args:
        w (`paddle.Tensor`):
            1-D tensor of values to embed; each value is multiplied by 1000 before embedding
        embedding_dim (`int`, *optional*, defaults to 512):
            dimension of the embeddings to generate
        dtype:
            data type of the generated embeddings

    Returns:
        `paddle.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`
    """
    assert len(w.shape) == 1
    w = w * 1000.0

    half_dim = embedding_dim // 2
    emb = paddle.log(paddle.to_tensor(10000.0)) / (half_dim - 1)
    emb = paddle.exp(paddle.arange(half_dim, dtype=dtype) * -emb)
    emb = w.cast(dtype=dtype)[:, None] * emb[None, :]
    emb = paddle.concat([paddle.sin(emb), paddle.cos(emb)], axis=1)
    if embedding_dim % 2 == 1:
        # Odd dimension: append one zero column. `paddle.concat` expects the tensors as a
        # single list argument, and the padding must share `emb`'s dtype.
        zero_column = paddle.zeros([emb.shape[0], 1], dtype=emb.dtype)
        emb = paddle.concat([emb, zero_column], axis=-1)
    assert emb.shape == [w.shape[0], embedding_dim]
    return emb
733
+
734
@property
def guidance_scale(self):
    """Read-only view of the stored `_guidance_scale` attribute."""
    scale = self._guidance_scale
    return scale
737
+
738
@property
def clip_skip(self):
    """Read-only view of the stored `_clip_skip` attribute."""
    skip = self._clip_skip
    return skip
741
+
742
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
743
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
744
+ # corresponds to doing no classifier free guidance.
745
@property
def do_classifier_free_guidance(self):
    """Whether classifier-free guidance applies.

    True when the stored guidance scale is greater than 1 and the UNet config has no
    `time_cond_proj_dim` set.
    """
    scale_above_one = self._guidance_scale > 1
    return scale_above_one and self.unet.config.time_cond_proj_dim is None
748
+
749
@property
def cross_attention_kwargs(self):
    """Read-only view of the stored `_cross_attention_kwargs` attribute."""
    attention_kwargs = self._cross_attention_kwargs
    return attention_kwargs
752
+
753
@property
def num_timesteps(self):
    """Read-only view of the stored `_num_timesteps` attribute."""
    count = self._num_timesteps
    return count
756
+
757
+ @paddle.no_grad()
758
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
759
+ def __call__(
760
+ self,
761
+ prompt: Union[str, List[str]] = None,
762
+ image: PipelineImageInput = None,
763
+ height: Optional[int] = None,
764
+ width: Optional[int] = None,
765
+ num_inference_steps: int = 50,
766
+ timesteps: List[int] = None,
767
+ guidance_scale: float = 7.5,
768
+ negative_prompt: Optional[Union[str, List[str]]] = None,
769
+ num_images_per_prompt: Optional[int] = 1,
770
+ eta: float = 0.0,
771
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
772
+ latents: Optional[paddle.Tensor] = None,
773
+ prompt_embeds: Optional[paddle.Tensor] = None,
774
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
775
+ ip_adapter_image: Optional[PipelineImageInput] = None,
776
+ output_type: Optional[str] = "pil",
777
+ return_dict: bool = True,
778
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
779
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
780
+ guess_mode: bool = False,
781
+ control_guidance_start: Union[float, List[float]] = 0.0,
782
+ control_guidance_end: Union[float, List[float]] = 1.0,
783
+ clip_skip: Optional[int] = None,
784
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
785
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
786
+ **kwargs,
787
+ ):
788
+ r"""
789
+ The call function to the pipeline for generation.
790
+
791
+ Args:
792
+ prompt (`str` or `List[str]`, *optional*):
793
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
794
+ image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
795
+ `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
796
+ The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
797
+ specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
798
+ accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
799
+ and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
800
+ `init`, images must be passed as a list such that each element of the list can be correctly batched for
801
+ input to a single ControlNet.
802
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
803
+ The height in pixels of the generated image.
804
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
805
+ The width in pixels of the generated image.
806
+ num_inference_steps (`int`, *optional*, defaults to 50):
807
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
808
+ expense of slower inference.
809
+ timesteps (`List[int]`, *optional*):
810
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
811
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
812
+ passed will be used. Must be in descending order.
813
+ guidance_scale (`float`, *optional*, defaults to 7.5):
814
+ A higher guidance scale value encourages the model to generate images closely linked to the text
815
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
816
+ negative_prompt (`str` or `List[str]`, *optional*):
817
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
818
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
819
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
820
+ The number of images to generate per prompt.
821
+ eta (`float`, *optional*, defaults to 0.0):
822
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
823
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
824
+ generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
825
+ A [`paddle.Generator`] to make generation deterministic.
826
+
827
+ latents (`paddle.Tensor`, *optional*):
828
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
829
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
830
+ tensor is generated by sampling using the supplied random `generator`.
831
+ prompt_embeds (`paddle.Tensor`, *optional*):
832
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
833
+ provided, text embeddings are generated from the `prompt` input argument.
834
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
835
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
836
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
837
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
838
+ output_type (`str`, *optional*, defaults to `"pil"`):
839
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
840
+ return_dict (`bool`, *optional*, defaults to `True`):
841
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
842
+ plain tuple.
843
+ callback (`Callable`, *optional*):
844
+ A function that calls every `callback_steps` steps during inference. The function is called with the
845
+ following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
846
+ callback_steps (`int`, *optional*, defaults to 1):
847
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
848
+ every step.
849
+ cross_attention_kwargs (`dict`, *optional*):
850
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
851
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
852
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
853
+ The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
854
+ to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
855
+ the corresponding scale as a list.
856
+ guess_mode (`bool`, *optional*, defaults to `False`):
857
+ The ControlNet encoder tries to recognize the content of the input image even if you remove all
858
+ prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
859
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
860
+ The percentage of total steps at which the ControlNet starts applying.
861
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
862
+ The percentage of total steps at which the ControlNet stops applying.
863
+ clip_skip (`int`, *optional*):
864
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
865
+ the output of the pre-final layer will be used for computing the prompt embeddings.
866
+ callback_on_step_end (`Callable`, *optional*):
867
+ A function that calls at the end of each denoising steps during the inference. The function is called
868
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
869
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
870
+ `callback_on_step_end_tensor_inputs`.
871
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
872
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
873
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
874
+ `._callback_tensor_inputs` attribute of your pipeine class.
875
+
876
+ Examples:
877
+
878
+ Returns:
879
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
880
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
881
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
882
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
883
+ "not-safe-for-work" (nsfw) content.
884
+ """
885
+
886
+ callback = kwargs.pop("callback", None)
887
+ callback_steps = kwargs.pop("callback_steps", None)
888
+
889
+ if callback is not None:
890
+ deprecate(
891
+ "callback",
892
+ "1.0.0",
893
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
894
+ )
895
+ if callback_steps is not None:
896
+ deprecate(
897
+ "callback_steps",
898
+ "1.0.0",
899
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
900
+ )
901
+
902
+ controlnet = self.controlnet
903
+
904
+ # align format for control guidance
905
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
906
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
907
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
908
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
909
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
910
+ mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
911
+ control_guidance_start, control_guidance_end = (
912
+ mult * [control_guidance_start],
913
+ mult * [control_guidance_end],
914
+ )
915
+
916
+ # 1. Check inputs. Raise error if not correct
917
+ self.check_inputs(
918
+ prompt,
919
+ image,
920
+ callback_steps,
921
+ negative_prompt,
922
+ prompt_embeds,
923
+ negative_prompt_embeds,
924
+ controlnet_conditioning_scale,
925
+ control_guidance_start,
926
+ control_guidance_end,
927
+ callback_on_step_end_tensor_inputs,
928
+ )
929
+
930
+ self._guidance_scale = guidance_scale
931
+ self._clip_skip = clip_skip
932
+ self._cross_attention_kwargs = cross_attention_kwargs
933
+
934
+ # 2. Define call parameters
935
+ if prompt is not None and isinstance(prompt, str):
936
+ batch_size = 1
937
+ elif prompt is not None and isinstance(prompt, list):
938
+ batch_size = len(prompt)
939
+ else:
940
+ batch_size = prompt_embeds.shape[0]
941
+
942
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
943
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
944
+
945
+ global_pool_conditions = (
946
+ controlnet.config.global_pool_conditions
947
+ if isinstance(controlnet, ControlNetModel)
948
+ else controlnet.nets[0].config.global_pool_conditions
949
+ )
950
+ guess_mode = guess_mode or global_pool_conditions
951
+
952
+ # 3. Encode input prompt
953
+ text_encoder_lora_scale = (
954
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
955
+ )
956
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
957
+ prompt,
958
+ num_images_per_prompt,
959
+ self.do_classifier_free_guidance,
960
+ negative_prompt,
961
+ prompt_embeds=prompt_embeds,
962
+ negative_prompt_embeds=negative_prompt_embeds,
963
+ lora_scale=text_encoder_lora_scale,
964
+ clip_skip=self.clip_skip,
965
+ )
966
+ # For classifier free guidance, we need to do two forward passes.
967
+ # Here we concatenate the unconditional and text embeddings into a single batch
968
+ # to avoid doing two forward passes
969
+ if self.do_classifier_free_guidance:
970
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
971
+
972
+ if ip_adapter_image is not None:
973
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, num_images_per_prompt)
974
+ if self.do_classifier_free_guidance:
975
+ image_embeds = paddle.concat([negative_image_embeds, image_embeds])
976
+
977
+ # 4. Prepare image
978
+ if isinstance(controlnet, ControlNetModel):
979
+ image = self.prepare_image(
980
+ image=image,
981
+ width=width,
982
+ height=height,
983
+ batch_size=batch_size * num_images_per_prompt,
984
+ num_images_per_prompt=num_images_per_prompt,
985
+ dtype=controlnet.dtype,
986
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
987
+ guess_mode=guess_mode,
988
+ )
989
+ height, width = image.shape[-2:]
990
+ elif isinstance(controlnet, MultiControlNetModel):
991
+ images = []
992
+
993
+ for image_ in image:
994
+ image_ = self.prepare_image(
995
+ image=image_,
996
+ width=width,
997
+ height=height,
998
+ batch_size=batch_size * num_images_per_prompt,
999
+ num_images_per_prompt=num_images_per_prompt,
1000
+ dtype=controlnet.dtype,
1001
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
1002
+ guess_mode=guess_mode,
1003
+ )
1004
+
1005
+ images.append(image_)
1006
+
1007
+ image = images
1008
+ height, width = image[0].shape[-2:]
1009
+ else:
1010
+ assert False
1011
+
1012
+ # 5. Prepare timesteps
1013
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, timesteps)
1014
+ self._num_timesteps = len(timesteps)
1015
+
1016
+ # 6. Prepare latent variables
1017
+ num_channels_latents = self.unet.config.in_channels
1018
+ latents = self.prepare_latents(
1019
+ batch_size * num_images_per_prompt,
1020
+ num_channels_latents,
1021
+ height,
1022
+ width,
1023
+ prompt_embeds.dtype,
1024
+ generator,
1025
+ latents,
1026
+ )
1027
+
1028
+ # 6.5 Optionally get Guidance Scale Embedding
1029
+ timestep_cond = None
1030
+ if self.unet.config.time_cond_proj_dim is not None:
1031
+ guidance_scale_tensor = paddle.to_tensor([self.guidance_scale - 1]).tile(
1032
+ [
1033
+ batch_size * num_images_per_prompt,
1034
+ ]
1035
+ )
1036
+ timestep_cond = self.get_guidance_scale_embedding(
1037
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1038
+ ).cast(dtype=latents.dtype)
1039
+
1040
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1041
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1042
+
1043
+ # 7.1 Add image embeds for IP-Adapter
1044
+ added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
1045
+
1046
+ # 7.2 Create tensor stating which controlnets to keep
1047
+ controlnet_keep = []
1048
+ for i in range(len(timesteps)):
1049
+ keeps = [
1050
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
1051
+ for s, e in zip(control_guidance_start, control_guidance_end)
1052
+ ]
1053
+ controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
1054
+
1055
+ # 8. Denoising loop
1056
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1057
+
1058
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1059
+ for i, t in enumerate(timesteps):
1060
+
1061
+ # expand the latents if we are doing classifier free guidance
1062
+ latent_model_input = paddle.concat([latents] * 2) if self.do_classifier_free_guidance else latents
1063
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1064
+
1065
+ # controlnet(s) inference
1066
+ if guess_mode and self.do_classifier_free_guidance:
1067
+ # Infer ControlNet only for the conditional batch.
1068
+ control_model_input = latents
1069
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
1070
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
1071
+ else:
1072
+ control_model_input = latent_model_input
1073
+ controlnet_prompt_embeds = prompt_embeds
1074
+
1075
+ if isinstance(controlnet_keep[i], list):
1076
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
1077
+ else:
1078
+ controlnet_cond_scale = controlnet_conditioning_scale
1079
+ if isinstance(controlnet_cond_scale, list):
1080
+ controlnet_cond_scale = controlnet_cond_scale[0]
1081
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
1082
+
1083
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
1084
+ control_model_input,
1085
+ t,
1086
+ encoder_hidden_states=controlnet_prompt_embeds,
1087
+ controlnet_cond=image,
1088
+ conditioning_scale=cond_scale,
1089
+ guess_mode=guess_mode,
1090
+ return_dict=False,
1091
+ )
1092
+
1093
+ if guess_mode and self.do_classifier_free_guidance:
1094
+ # Inferred ControlNet only for the conditional batch.
1095
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
1096
+ # add 0 to the unconditional batch to keep it unchanged.
1097
+ down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples]
1098
+ mid_block_res_sample = paddle.concat(
1099
+ [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]
1100
+ )
1101
+
1102
+ # predict the noise residual
1103
+ noise_pred = self.unet(
1104
+ latent_model_input,
1105
+ t,
1106
+ encoder_hidden_states=prompt_embeds,
1107
+ timestep_cond=timestep_cond,
1108
+ cross_attention_kwargs=self.cross_attention_kwargs,
1109
+ down_block_additional_residuals=down_block_res_samples,
1110
+ mid_block_additional_residual=mid_block_res_sample,
1111
+ added_cond_kwargs=added_cond_kwargs,
1112
+ return_dict=False,
1113
+ )[0]
1114
+
1115
+ # perform guidance
1116
+ if self.do_classifier_free_guidance:
1117
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1118
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
1119
+
1120
+ # compute the previous noisy sample x_t -> x_t-1
1121
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1122
+
1123
+ if callback_on_step_end is not None:
1124
+ callback_kwargs = {}
1125
+ for k in callback_on_step_end_tensor_inputs:
1126
+ callback_kwargs[k] = locals()[k]
1127
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1128
+
1129
+ latents = callback_outputs.pop("latents", latents)
1130
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1131
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1132
+
1133
+ # call the callback, if provided
1134
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1135
+ progress_bar.update()
1136
+ if callback is not None and i % callback_steps == 0:
1137
+ step_idx = i // getattr(self.scheduler, "order", 1)
1138
+ callback(step_idx, t, latents)
1139
+
1140
+ if not output_type == "latent":
1141
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
1142
+ 0
1143
+ ]
1144
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
1145
+ else:
1146
+ image = latents
1147
+ has_nsfw_concept = None
1148
+
1149
+ if has_nsfw_concept is None:
1150
+ do_denormalize = [True] * image.shape[0]
1151
+ else:
1152
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1153
+
1154
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
1155
+
1156
+ if not return_dict:
1157
+ return (image, has_nsfw_concept)
1158
+
1159
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Salesforce.com, inc.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ from typing import List, Optional, Union
16
+
17
+ import paddle
18
+ import PIL.Image
19
+
20
+ from ppdiffusers.transformers import CLIPTokenizer
21
+
22
+ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
23
+ from ...schedulers import PNDMScheduler
24
+ from ...utils import logging, replace_example_docstring
25
+ from ...utils.paddle_utils import randn_tensor
26
+ from ..blip_diffusion.blip_image_processing import BlipImageProcessor
27
+ from ..blip_diffusion.modeling_blip2 import Blip2QFormerModel
28
+ from ..blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel
29
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
30
+
31
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
32
+
33
+ EXAMPLE_DOC_STRING = """
34
+ Examples:
35
+ ```py
36
+ >>> from ppdiffusers.pipelines import BlipDiffusionControlNetPipeline
37
+ >>> from ppdiffusers.utils import load_image
38
+ >>> from controlnet_aux import CannyDetector
39
+ >>> import paddle
40
+
41
+ >>> blip_diffusion_pipe = BlipDiffusionControlNetPipeline.from_pretrained(
42
+ ... "Salesforce/blipdiffusion-controlnet", paddle_dtype=paddle.float16
43
+ ... )
44
+
45
+ >>> style_subject = "flower"
46
+ >>> tgt_subject = "teapot"
47
+ >>> text_prompt = "on a marble table"
48
+
49
+ >>> cldm_cond_image = load_image(
50
+ ... "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/kettle.jpg"
51
+ ... ).resize((512, 512))
52
+ >>> canny = CannyDetector()
53
+ >>> cldm_cond_image = canny(cldm_cond_image, 30, 70, output_type="pil")
54
+ >>> style_image = load_image(
55
+ ... "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/flower.jpg"
56
+ ... )
57
+ >>> guidance_scale = 7.5
58
+ >>> num_inference_steps = 50
59
+ >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate"
60
+
61
+
62
+ >>> output = blip_diffusion_pipe(
63
+ ... text_prompt,
64
+ ... style_image,
65
+ ... cldm_cond_image,
66
+ ... style_subject,
67
+ ... tgt_subject,
68
+ ... guidance_scale=guidance_scale,
69
+ ... num_inference_steps=num_inference_steps,
70
+ ... neg_prompt=negative_prompt,
71
+ ... height=512,
72
+ ... width=512,
73
+ ... ).images
74
+ >>> output[0].save("image.png")
75
+ ```
76
+ """
77
+
78
+
79
class BlipDiffusionControlNetPipeline(DiffusionPipeline):
    """
    Pipeline for Canny Edge based Controlled subject-driven generation using Blip Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        tokenizer ([`CLIPTokenizer`]):
            Tokenizer for the text encoder
        text_encoder ([`ContextCLIPTextModel`]):
            Text encoder to encode the text prompt
        vae ([`AutoencoderKL`]):
            VAE model to map the latents to the image
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        scheduler ([`PNDMScheduler`]):
             A scheduler to be used in combination with `unet` to generate image latents.
        qformer ([`Blip2QFormerModel`]):
            QFormer model to get multi-modal embeddings from the text and image.
        controlnet ([`ControlNetModel`]):
            ControlNet model to get the conditioning image embedding.
        image_processor ([`BlipImageProcessor`]):
            Image Processor to preprocess and postprocess the image.
        ctx_begin_pos (int, `optional`, defaults to 2):
            Position of the context token in the text encoder.
        mean (`List[float]`, *optional*):
            Stored in the pipeline config and forwarded as `image_mean` when the reference image is preprocessed.
        std (`List[float]`, *optional*):
            Stored in the pipeline config and forwarded as `image_std` when the reference image is preprocessed.
    """

    model_cpu_offload_seq = "qformer->text_encoder->unet->vae"

    def __init__(
        self,
        tokenizer: CLIPTokenizer,
        text_encoder: ContextCLIPTextModel,
        vae: AutoencoderKL,
        unet: UNet2DConditionModel,
        scheduler: PNDMScheduler,
        qformer: Blip2QFormerModel,
        controlnet: ControlNetModel,
        image_processor: BlipImageProcessor,
        ctx_begin_pos: int = 2,
        mean: Optional[List[float]] = None,
        std: Optional[List[float]] = None,
    ):
        super().__init__()

        self.register_modules(
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            vae=vae,
            unet=unet,
            scheduler=scheduler,
            qformer=qformer,
            controlnet=controlnet,
            image_processor=image_processor,
        )
        self.register_to_config(ctx_begin_pos=ctx_begin_pos, mean=mean, std=std)

    def get_query_embeddings(self, input_image, src_subject):
        """Run the Q-Former on the reference image and source-subject text and return its (non-dict) output."""
        return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False)

    # from the original Blip Diffusion code, specifies the target subject and augments the prompt by repeating it
    def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20):
        """
        Prefix every prompt with its target subject and repeat the result to amplify it.

        Each output entry is `"a {tgt_subject} {prompt}"` repeated `int(prompt_strength * prompt_reps)` times and
        joined with `", "`. Prompts and subjects are paired positionally with `zip`, so the output has as many
        entries as the shorter of the two inputs.

        Returns:
            `List[str]`: the amplified prompts.
        """
        rv = []
        for prompt, tgt_subject in zip(prompts, tgt_subjects):
            prompt = f"a {tgt_subject} {prompt.strip()}"
            # a trick to amplify the prompt
            rv.append(", ".join([prompt] * int(prompt_strength * prompt_reps)))

        return rv

    # Copied from ppdiffusers.pipelines.consistency_models.pipeline_consistency_models.ConsistencyModelPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels, height, width, dtype, generator, latents=None):
        """
        Return the initial latents of shape `(batch_size, num_channels, height, width)`.

        Fresh noise is drawn with `randn_tensor` when `latents` is `None`; otherwise the supplied tensor is cast to
        `dtype`. Either way the result is multiplied by the scheduler's `init_noise_sigma`.

        Raises:
            ValueError: if `generator` is a list whose length differs from `batch_size`.
        """
        shape = (batch_size, num_channels, height, width)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, dtype=dtype)
        else:
            latents = latents.cast(dtype=dtype)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def encode_prompt(self, query_embeds, prompt):
        """
        Encode `prompt` with the text encoder, passing `query_embeds` as context embeddings.

        The prompt is padded/truncated to the text encoder's `max_position_embeddings` minus the Q-Former's
        `num_query_tokens` (presumably to leave room for the injected query tokens -- confirm against
        `ContextCLIPTextModel`).

        Returns:
            The first element of the text encoder output.
        """
        # embeddings for prompt, with query_embeds as context
        max_len = self.text_encoder.text_model.config.max_position_embeddings
        max_len -= self.qformer.config.num_query_tokens

        tokenized_prompt = self.tokenizer(
            prompt,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pd",
        )

        batch_size = query_embeds.shape[0]
        # one context start position per batch element
        ctx_begin_pos = [self.config.ctx_begin_pos] * batch_size

        text_embeddings = self.text_encoder(
            input_ids=tokenized_prompt.input_ids,
            ctx_embeddings=query_embeds,
            ctx_begin_pos=ctx_begin_pos,
        )[0]

        return text_embeddings

    # Adapted from ppdiffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
    def prepare_control_image(
        self,
        image,
        width,
        height,
        batch_size,
        num_images_per_prompt,
        dtype,
        do_classifier_free_guidance=False,
    ):
        """
        Preprocess the ControlNet conditioning image and expand it to the working batch.

        The image is resized to `width` x `height` and rescaled (no center crop, no normalization). A single image
        is repeated `batch_size` times; otherwise each image is repeated `num_images_per_prompt` times. The result
        is cast to `dtype` and, under classifier-free guidance, concatenated with itself.
        """
        image = self.image_processor.preprocess(
            image,
            size={"width": width, "height": height},
            do_rescale=True,
            do_center_crop=False,
            do_normalize=False,
            return_tensors="pd",
        )["pixel_values"]
        image_batch_size = image.shape[0]

        if image_batch_size == 1:
            repeat_by = batch_size
        else:
            # image batch size is the same as prompt batch size
            repeat_by = num_images_per_prompt

        image = image.repeat_interleave(repeat_by, axis=0)

        image = image.cast(dtype=dtype)

        if do_classifier_free_guidance:
            # the same conditioning image serves both the unconditional and the conditional half
            image = paddle.concat([image] * 2)

        return image

    @paddle.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: List[str],
        reference_image: PIL.Image.Image,
        condtioning_image: PIL.Image.Image,
        source_subject_category: List[str],
        target_subject_category: List[str],
        latents: Optional[paddle.Tensor] = None,
        guidance_scale: float = 7.5,
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
        neg_prompt: Optional[str] = "",
        prompt_strength: float = 1.0,
        prompt_reps: int = 20,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ):
        """
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`List[str]`):
                The prompt or prompts to guide the image generation. A single `str` is wrapped into a list.
            reference_image (`PIL.Image.Image`):
                The reference image to condition the generation on.
            condtioning_image (`PIL.Image.Image`):
                The conditioning canny edge image to condition the generation on.
            source_subject_category (`List[str]`):
                The source subject category. A single `str` is wrapped into a list.
            target_subject_category (`List[str]`):
                The target subject category. A single `str` is wrapped into a list.
            latents (`paddle.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by random sampling.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            height (`int`, *optional*, defaults to 512):
                The height of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
                One or a list of [paddle generator(s)] to make generation deterministic.
            neg_prompt (`str`, *optional*, defaults to ""):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is not greater than `1`).
            prompt_strength (`float`, *optional*, defaults to 1.0):
                The strength of the prompt. Specifies the number of times the prompt is repeated along with prompt_reps
                to amplify the prompt.
            prompt_reps (`int`, *optional*, defaults to 20):
                The number of times the prompt is repeated along with prompt_strength to amplify the prompt.
            output_type (`str`, *optional*, defaults to `"pil"`):
                Output format, forwarded to the image processor's `postprocess`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return an [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        """

        reference_image = self.image_processor.preprocess(
            reference_image, image_mean=self.config.mean, image_std=self.config.std, return_tensors="pd"
        )["pixel_values"]

        if isinstance(prompt, str):
            prompt = [prompt]
        if isinstance(source_subject_category, str):
            source_subject_category = [source_subject_category]
        if isinstance(target_subject_category, str):
            target_subject_category = [target_subject_category]

        batch_size = len(prompt)

        prompt = self._build_prompt(
            prompts=prompt,
            tgt_subjects=target_subject_category,
            prompt_strength=prompt_strength,
            prompt_reps=prompt_reps,
        )
        query_embeds = self.get_query_embeddings(reference_image, source_subject_category)
        text_embeddings = self.encode_prompt(query_embeds, prompt)
        # 3. unconditional embedding
        # Computed once: the guidance decision does not change during denoising.
        do_classifier_free_guidance = guidance_scale > 1.0
        if do_classifier_free_guidance:
            max_length = self.text_encoder.text_model.config.max_position_embeddings

            uncond_input = self.tokenizer(
                [neg_prompt] * batch_size,
                padding="max_length",
                # Truncate like `encode_prompt` does, so an over-long negative prompt cannot
                # produce more tokens than the text encoder has positions for.
                truncation=True,
                max_length=max_length,
                return_tensors="pd",
            )
            uncond_embeddings = self.text_encoder(
                input_ids=uncond_input.input_ids,
                ctx_embeddings=None,
            )[0]
            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            text_embeddings = paddle.concat([uncond_embeddings, text_embeddings])
        # latent resolution = image resolution divided by this factor
        scale_down_factor = 2 ** (len(self.unet.config.block_out_channels) - 1)
        latents = self.prepare_latents(
            batch_size=batch_size,
            num_channels=self.unet.config.in_channels,
            height=height // scale_down_factor,
            width=width // scale_down_factor,
            generator=generator,
            latents=latents,
            dtype=self.unet.dtype,
        )
        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps)

        cond_image = self.prepare_control_image(
            image=condtioning_image,
            width=width,
            height=height,
            batch_size=batch_size,
            num_images_per_prompt=1,
            dtype=self.controlnet.dtype,
            do_classifier_free_guidance=do_classifier_free_guidance,
        )

        for t in self.progress_bar(self.scheduler.timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
            down_block_res_samples, mid_block_res_sample = self.controlnet(
                latent_model_input,
                t,
                encoder_hidden_states=text_embeddings,
                controlnet_cond=cond_image,
                return_dict=False,
            )

            noise_pred = self.unet(
                latent_model_input,
                timestep=t,
                encoder_hidden_states=text_embeddings,
                down_block_additional_residuals=down_block_res_samples,
                mid_block_additional_residual=mid_block_res_sample,
            )["sample"]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            latents = self.scheduler.step(
                noise_pred,
                t,
                latents,
            )["prev_sample"]
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        image = self.image_processor.postprocess(image, output_type=output_type)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_img2img.py ADDED
@@ -0,0 +1,1116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import paddle
20
+ import PIL.Image
21
+
22
+ from ppdiffusers.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
23
+
24
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
25
+ from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
26
+ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
27
+ from ...models.lora import adjust_lora_scale_text_encoder
28
+ from ...schedulers import KarrasDiffusionSchedulers
29
+ from ...utils import USE_PEFT_BACKEND, deprecate, logging, replace_example_docstring
30
+ from ...utils.paddle_utils import randn_tensor
31
+ from ..pipeline_utils import DiffusionPipeline
32
+ from ..stable_diffusion import StableDiffusionPipelineOutput
33
+ from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
34
+ from .multicontrolnet import MultiControlNetModel
35
+
36
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
37
+
38
+
39
+ EXAMPLE_DOC_STRING = """
40
+ Examples:
41
+ ```py
42
+ >>> # !pip install opencv-python paddlenlp ppdiffusers
43
+ >>> from ppdiffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
44
+ >>> from ppdiffusers.utils import load_image
45
+ >>> import numpy as np
46
+ >>> import paddle
47
+
48
+ >>> import cv2
49
+ >>> from PIL import Image
50
+
51
+ >>> # download an image
52
+ >>> image = load_image(
53
+ ... "https://hf-mirror.com/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
54
+ ... )
55
+ >>> np_image = np.array(image)
56
+
57
+ >>> # get canny image
58
+ >>> np_image = cv2.Canny(np_image, 100, 200)
59
+ >>> np_image = np_image[:, :, None]
60
+ >>> np_image = np.concatenate([np_image, np_image, np_image], axis=2)
61
+ >>> canny_image = Image.fromarray(np_image)
62
+
63
+ >>> # load control net and stable diffusion v1-5
64
+ >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", paddle_dtype=paddle.float16)
65
+ >>> pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
66
+ ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, paddle_dtype=paddle.float16
67
+ ... )
68
+
69
+ >>> # speed up diffusion process with faster scheduler and memory optimization
70
+ >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
71
+
72
+ >>> # generate image
73
+ >>> generator = paddle.Generator().manual_seed(0)
74
+ >>> image = pipe(
75
+ ... "futuristic-looking woman",
76
+ ... num_inference_steps=20,
77
+ ... generator=generator,
78
+ ... image=image,
79
+ ... control_image=canny_image,
80
+ ... ).images[0]
81
+ ```
82
+ """
83
+
84
+
85
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
86
def retrieve_latents(
    encoder_output: paddle.Tensor, generator: Optional[paddle.Generator] = None, sample_mode: str = "sample"
):
    """
    Extract latents from an encoder output object.

    When the output exposes `latent_dist`, `sample_mode="sample"` draws from it with `generator` and
    `sample_mode="argmax"` returns its mode. In every other case the `latents` attribute is returned if present.

    Raises:
        AttributeError: if none of the above applies.
    """
    if hasattr(encoder_output, "latent_dist"):
        if sample_mode == "sample":
            return encoder_output.latent_dist.sample(generator)
        if sample_mode == "argmax":
            return encoder_output.latent_dist.mode()

    # Either there is no distribution or the sample mode is unrecognised: fall back to raw latents.
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")
97
+
98
+
99
def prepare_image(image):
    """
    Convert an image input into a float32 `paddle.Tensor` batch.

    A `paddle.Tensor` is only given a leading batch axis (when it has 3 dimensions) and cast to float32; its
    values are not rescaled. A PIL image, a NumPy array, or a list of either is stacked along a new first axis,
    transposed with axes `(0, 3, 1, 2)` and mapped through `x / 127.5 - 1.0`.
    """
    if isinstance(image, paddle.Tensor):
        # Batch single image
        batched = image.unsqueeze(0) if image.ndim == 3 else image
        return batched.cast(dtype=paddle.float32)

    # preprocess image: normalise a single item to a one-element list first
    if isinstance(image, (PIL.Image.Image, np.ndarray)):
        image = [image]

    if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
        frames = [np.array(item.convert("RGB"))[None, :] for item in image]
        image = np.concatenate(frames, axis=0)
    elif isinstance(image, list) and isinstance(image[0], np.ndarray):
        frames = [item[None, :] for item in image]
        image = np.concatenate(frames, axis=0)

    # assumes the stacked array is channels-last (N, H, W, C) -- TODO confirm with callers
    channels_first = image.transpose(0, 3, 1, 2)
    return paddle.to_tensor(channels_first, dtype="float32") / 127.5 - 1.0
121
+
122
+
123
+ class StableDiffusionControlNetImg2ImgPipeline(
124
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
125
+ ):
126
+ r"""
127
+ Pipeline for image-to-image generation using Stable Diffusion with ControlNet guidance.
128
+
129
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
130
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
131
+
132
+ The pipeline also inherits the following loading methods:
133
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
134
+
135
+ Args:
136
+ vae ([`AutoencoderKL`]):
137
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
138
+ text_encoder ([`~transformers.CLIPTextModel`]):
139
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
140
+ tokenizer ([`~transformers.CLIPTokenizer`]):
141
+ A `CLIPTokenizer` to tokenize text.
142
+ unet ([`UNet2DConditionModel`]):
143
+ A `UNet2DConditionModel` to denoise the encoded image latents.
144
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
145
+ Provides additional conditioning to the `unet` during the denoising process. If you set multiple
146
+ ControlNets as a list, the outputs from each ControlNet are added together to create one combined
147
+ additional conditioning.
148
+ scheduler ([`SchedulerMixin`]):
149
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
150
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
151
+ safety_checker ([`StableDiffusionSafetyChecker`]):
152
+ Classification module that estimates whether generated images could be considered offensive or harmful.
153
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
154
+ about a model's potential harms.
155
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
156
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
157
+ """
158
+
159
+ model_cpu_offload_seq = "text_encoder->unet->vae"
160
+ _optional_components = ["safety_checker", "feature_extractor"]
161
+ _exclude_from_cpu_offload = ["safety_checker"]
162
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
163
+
164
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
    scheduler: KarrasDiffusionSchedulers,
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPImageProcessor,
    requires_safety_checker: bool = True,
):
    """
    Register the pipeline components and build the image processors.

    Args:
        vae: Autoencoder whose `config.block_out_channels` determines `vae_scale_factor`.
        text_encoder: Text encoder registered as `self.text_encoder`.
        tokenizer: Tokenizer registered as `self.tokenizer`.
        unet: Denoising network registered as `self.unet`.
        controlnet: A single ControlNet, a `MultiControlNetModel`, or a list/tuple of ControlNets. A list or
            tuple is wrapped in a `MultiControlNetModel` before registration.
        scheduler: Scheduler registered as `self.scheduler`.
        safety_checker: Optional safety checker; may be `None`.
        feature_extractor: Feature extractor; required whenever `safety_checker` is not `None`.
        requires_safety_checker: Stored in the pipeline config. When `True` and `safety_checker` is `None`, a
            warning is logged.

    Raises:
        ValueError: If a `safety_checker` is given without a `feature_extractor`.
    """
    super().__init__()

    if safety_checker is None and requires_safety_checker:
        logger.warning(
            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
        )

    if safety_checker is not None and feature_extractor is None:
        # The message must be an f-string, otherwise `{self.__class__}` is printed literally.
        raise ValueError(
            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
        )

    # Several ControlNets are driven through a single wrapper module.
    if isinstance(controlnet, (list, tuple)):
        controlnet = MultiControlNetModel(controlnet)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        controlnet=controlnet,
        scheduler=scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
    )
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    # The init image is processed with the processor's default normalisation setting; the control image
    # processor is created with `do_normalize=False`.
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
    self.control_image_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
    )
    self.register_to_config(requires_safety_checker=requires_safety_checker)
213
+
214
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
215
def _encode_prompt(
    self,
    prompt,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    lora_scale: Optional[float] = None,
    **kwargs,
):
    """
    Deprecated wrapper around `encode_prompt()`.

    Emits a deprecation notice, forwards every argument to `encode_prompt()` and returns the legacy format:
    one tensor made by concatenating the second element of the returned tuple followed by the first.
    """
    deprecate(
        "_encode_prompt()",
        "1.0.0",
        "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple.",
        standard_warn=False,
    )

    encoded = self.encode_prompt(
        prompt=prompt,
        num_images_per_prompt=num_images_per_prompt,
        do_classifier_free_guidance=do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=lora_scale,
        **kwargs,
    )

    # Legacy layout kept for backwards compatibility: tuple element 1 first, then tuple element 0.
    legacy_order = [encoded[1], encoded[0]]
    return paddle.concat(legacy_order)
244
+
245
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
246
def encode_prompt(
    self,
    prompt,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    lora_scale: Optional[float] = None,
    clip_skip: Optional[int] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        lora_scale (`float`, *optional*):
            A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.

    Returns:
        A tuple `(prompt_embeds, negative_prompt_embeds)`. `negative_prompt_embeds` is returned exactly as it
        was passed in (possibly `None`) when `do_classifier_free_guidance` is false.

    Raises:
        TypeError: If `prompt` is given and `negative_prompt` is of a different type.
        ValueError: If `negative_prompt` is a list whose length differs from the prompt batch size.
    """
    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, LoraLoaderMixin):
        self._lora_scale = lora_scale

        # dynamically adjust the LoRA scale
        if not USE_PEFT_BACKEND:
            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)

    # Batch size comes from `prompt` when it is given, otherwise from the first axis of `prompt_embeds`.
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pd",
        )
        text_input_ids = text_inputs.input_ids
        # Second tokenisation without truncation, used only to detect and report truncated text.
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids

        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
            text_input_ids, untruncated_ids
        ):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        # An attention mask is only forwarded when the text encoder config asks for it.
        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = text_inputs.attention_mask
        else:
            attention_mask = None

        if clip_skip is None:
            prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
            prompt_embeds = prompt_embeds[0]
        else:
            prompt_embeds = self.text_encoder(
                text_input_ids, attention_mask=attention_mask, output_hidden_states=True
            )
            # Access the `hidden_states` first, that contains a tuple of
            # all the hidden states from the encoder layers. Then index into
            # the tuple to access the hidden states from the desired layer.
            prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
            # We also need to apply the final LayerNorm here to not mess with the
            # representations. The `last_hidden_states` that we typically use for
            # obtaining the final prompt representations passes through the LayerNorm
            # layer.
            prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)

    # Target dtype: the text encoder's if present, else the UNet's, else the embeddings' own dtype.
    if self.text_encoder is not None:
        prompt_embeds_dtype = self.text_encoder.dtype
    elif self.unet is not None:
        prompt_embeds_dtype = self.unet.dtype
    else:
        prompt_embeds_dtype = prompt_embeds.dtype

    prompt_embeds = prompt_embeds.cast(dtype=prompt_embeds_dtype)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
    prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            # No negative prompt: use one empty string per batch element.
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

        # Pad/truncate the unconditional input to the sequence length of the positive embeddings.
        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pd",
        )

        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = uncond_input.attention_mask
        else:
            attention_mask = None

        negative_prompt_embeds = self.text_encoder(
            uncond_input.input_ids,
            attention_mask=attention_mask,
        )
        negative_prompt_embeds = negative_prompt_embeds[0]

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.cast(dtype=prompt_embeds_dtype)

        negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
        negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])

    return prompt_embeds, negative_prompt_embeds
417
+
418
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
419
def run_safety_checker(self, image, dtype):
    """
    Run the optional safety checker over `image`.

    Returns `(image, None)` unchanged when no safety checker is registered. Otherwise `image` is converted to
    PIL, passed through `self.feature_extractor`, and the resulting `pixel_values` (cast to `dtype`) are handed
    to `self.safety_checker` together with `image`; its two outputs are returned.
    """
    if self.safety_checker is None:
        return image, None

    # The feature extractor is fed PIL images; tensors and other inputs take different conversion paths.
    if paddle.is_tensor(image):
        pil_images = self.image_processor.postprocess(image, output_type="pil")
    else:
        pil_images = self.image_processor.numpy_to_pil(image)

    checker_features = self.feature_extractor(pil_images, return_tensors="pd")
    clip_input = checker_features.pixel_values.cast(dtype=dtype)
    image, nsfw_flags = self.safety_checker(images=image, clip_input=clip_input)
    return image, nsfw_flags
432
+
433
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
434
def decode_latents(self, latents):
    """
    Deprecated: decode `latents` with the VAE and return the result as a NumPy array.

    The latents are divided by `self.vae.config.scaling_factor`, decoded, mapped through `x / 2 + 0.5`,
    clipped to `[0, 1]`, cast to float32 and transposed with axes `[0, 2, 3, 1]`.
    """
    deprecate(
        "decode_latents",
        "1.0.0",
        "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead",
        standard_warn=False,
    )

    unscaled = 1 / self.vae.config.scaling_factor * latents
    decoded = self.vae.decode(unscaled, return_dict=False)[0]
    decoded = (decoded / 2 + 0.5).clip(0, 1)
    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
    decoded = decoded.cast(dtype=paddle.float32)
    channels_last = decoded.transpose([0, 2, 3, 1])
    return channels_last.cpu().numpy()
444
+
445
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
446
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Build the optional keyword arguments accepted by `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are only included when the
    signature declares a parameter of that name. `eta` corresponds to η in the DDIM paper
    (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

    Returns:
        A dict containing `"eta"` and/or `"generator"` (in that order), or an empty dict.
    """
    accepted = inspect.signature(self.scheduler.step).parameters
    optional_args = (("eta", eta), ("generator", generator))
    return {name: value for name, value in optional_args if name in accepted}
462
+
463
def check_inputs(
    self,
    prompt,
    image,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    controlnet_conditioning_scale=1.0,
    control_guidance_start=0.0,
    control_guidance_end=1.0,
    callback_on_step_end_tensor_inputs=None,
):
    """
    Validate the user-facing arguments of the pipeline call.

    Args:
        prompt: `str`, `list` or `None`; mutually exclusive with `prompt_embeds`.
        image: ControlNet conditioning image(s), validated through `check_image`. Must be a flat list with
            one entry per ControlNet when `self.controlnet` is a `MultiControlNetModel`.
        callback_steps: `None` or a positive `int`.
        negative_prompt: Mutually exclusive with `negative_prompt_embeds`.
        prompt_embeds: Pre-computed prompt embeddings.
        negative_prompt_embeds: Pre-computed negative embeddings; must match the shape of `prompt_embeds`
            when both are given.
        controlnet_conditioning_scale: A `float` for a single ControlNet; for multiple ControlNets a list
            must be flat and have one entry per ControlNet.
        control_guidance_start: A number or a list/tuple of numbers. A scalar is treated as a one-element list.
        control_guidance_end: A number or a list/tuple of numbers. A scalar is treated as a one-element list.
        callback_on_step_end_tensor_inputs: Names that must all be in `self._callback_tensor_inputs`.

    Raises:
        ValueError: On any inconsistent or out-of-range argument.
        TypeError: On wrongly typed `image` or `controlnet_conditioning_scale`.
    """
    if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if callback_on_step_end_tensor_inputs is not None and not all(
        k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
    ):
        raise ValueError(
            f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )

    # `prompt` needs more sophisticated handling when there are multiple
    # conditionings.
    if isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(prompt, list):
            logger.warning(
                f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
                " prompts. The conditionings will be fixed across the prompts."
            )

    # Check `image`
    if isinstance(self.controlnet, ControlNetModel):
        self.check_image(image, prompt, prompt_embeds)
    elif isinstance(self.controlnet, MultiControlNetModel):
        if not isinstance(image, list):
            raise TypeError("For multiple controlnets: `image` must be type `list`")

        # When `image` is a nested list:
        # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
        elif any(isinstance(i, list) for i in image):
            raise ValueError("A single batch of multiple conditionings are supported at the moment.")
        elif len(image) != len(self.controlnet.nets):
            raise ValueError(
                f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
            )

        for image_ in image:
            self.check_image(image_, prompt, prompt_embeds)
    else:
        assert False

    # Check `controlnet_conditioning_scale`
    if isinstance(self.controlnet, ControlNetModel):
        if not isinstance(controlnet_conditioning_scale, float):
            raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
    elif isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(controlnet_conditioning_scale, list):
            if any(isinstance(i, list) for i in controlnet_conditioning_scale):
                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
            # The length check must run for every flat list. Placed in an `elif` that repeats the
            # `isinstance(..., list)` test of the branch above, it could never be reached.
            if len(controlnet_conditioning_scale) != len(self.controlnet.nets):
                raise ValueError(
                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
                    " the same length as the number of controlnets"
                )
    else:
        assert False

    # Accept scalars (including this method's own defaults) by treating them as one-element lists,
    # so the `len()` / `zip()` checks below do not fail with a TypeError.
    if not isinstance(control_guidance_start, (tuple, list)):
        control_guidance_start = [control_guidance_start]
    if not isinstance(control_guidance_end, (tuple, list)):
        control_guidance_end = [control_guidance_end]

    if len(control_guidance_start) != len(control_guidance_end):
        raise ValueError(
            f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
        )

    if isinstance(self.controlnet, MultiControlNetModel):
        if len(control_guidance_start) != len(self.controlnet.nets):
            raise ValueError(
                f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
            )

    for start, end in zip(control_guidance_start, control_guidance_end):
        if start >= end:
            raise ValueError(
                f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
            )
        if start < 0.0:
            raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
        if end > 1.0:
            raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
583
+
584
+ # Copied from ppdiffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
585
def check_image(self, image, prompt, prompt_embeds):
    """
    Validate the type of a conditioning `image` and its batch size against the prompt batch size.

    Args:
        image: A PIL image, NumPy array, paddle tensor, or a non-empty list of one of those.
        prompt: `str`, `list` or `None`; used to derive the prompt batch size.
        prompt_embeds: Used for the prompt batch size (its first axis) when `prompt` is `None`.

    Raises:
        TypeError: If `image` is none of the accepted types (this includes an empty list).
        ValueError: If the image batch size is neither 1 nor equal to the prompt batch size.
    """
    image_is_pil = isinstance(image, PIL.Image.Image)
    image_is_tensor = isinstance(image, paddle.Tensor)
    image_is_np = isinstance(image, np.ndarray)

    # Only peek at the first element of a non-empty list, so that `[]` is rejected by the TypeError
    # below instead of failing with an IndexError.
    first_item = image[0] if isinstance(image, list) and len(image) > 0 else None
    image_is_pil_list = isinstance(first_item, PIL.Image.Image)
    image_is_tensor_list = isinstance(first_item, paddle.Tensor)
    image_is_np_list = isinstance(first_item, np.ndarray)

    if (
        not image_is_pil
        and not image_is_tensor
        and not image_is_np
        and not image_is_pil_list
        and not image_is_tensor_list
        and not image_is_np_list
    ):
        raise TypeError(
            f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}"
        )

    if image_is_pil:
        image_batch_size = 1
    else:
        image_batch_size = len(image)

    # Stays `None` when neither `prompt` nor `prompt_embeds` is available; the comparison is then skipped
    # rather than failing on an unbound local.
    prompt_batch_size = None
    if prompt is not None and isinstance(prompt, str):
        prompt_batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        prompt_batch_size = len(prompt)
    elif prompt_embeds is not None:
        prompt_batch_size = prompt_embeds.shape[0]

    if prompt_batch_size is not None and image_batch_size != 1 and image_batch_size != prompt_batch_size:
        raise ValueError(
            f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
        )
621
+
622
+ # Copied from ppdiffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
623
def prepare_control_image(
    self,
    image,
    width,
    height,
    batch_size,
    num_images_per_prompt,
    dtype,
    do_classifier_free_guidance=False,
    guess_mode=False,
):
    """
    Preprocess the ControlNet conditioning image and expand it to the effective batch.

    The image goes through `self.control_image_processor.preprocess` at the requested `height`/`width`, is
    cast to float32, repeated along axis 0, and cast to `dtype`. A batch of one is repeated `batch_size`
    times; otherwise each entry is repeated `num_images_per_prompt` times. With classifier-free guidance
    and `guess_mode` disabled, the result is concatenated with itself.
    """
    control = self.control_image_processor.preprocess(image, height=height, width=width)
    control = control.cast(dtype=paddle.float32)

    # A single image serves the whole batch; otherwise the image batch already matches the prompt batch.
    repeats = batch_size if control.shape[0] == 1 else num_images_per_prompt
    control = control.repeat_interleave(repeats, axis=0)
    control = control.cast(dtype=dtype)

    if guess_mode or not do_classifier_free_guidance:
        return control
    return paddle.concat([control] * 2)
651
+
652
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
653
def get_timesteps(self, num_inference_steps, strength):
    """
    Select the tail of the scheduler's timesteps that corresponds to `strength`.

    Returns:
        A tuple `(timesteps, steps)` where `timesteps` is `self.scheduler.timesteps` sliced from
        `t_start * self.scheduler.order` onwards and `steps` is `num_inference_steps - t_start`, with
        `t_start = max(num_inference_steps - min(int(num_inference_steps * strength), num_inference_steps), 0)`.
    """
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    first_index = max(num_inference_steps - steps_to_run, 0)

    scheduler = self.scheduler
    remaining = scheduler.timesteps[first_index * scheduler.order :]
    return remaining, num_inference_steps - first_index
661
+
662
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents
663
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None):
    """
    Turn the init image into noised latents for the first denoising step.

    `image` is cast to `dtype`. If its second axis has size 4 it is used as latents as-is; otherwise it is
    encoded with the VAE (once per generator when `generator` is a list) and multiplied by
    `self.vae.config.scaling_factor`. The latents are duplicated when the effective batch size
    (`batch_size * num_images_per_prompt`) is a larger multiple of the latent batch, then noise from
    `randn_tensor` is added through `self.scheduler.add_noise` at `timestep`.

    Raises:
        ValueError: If `image` has an unsupported type, the generator list length differs from the effective
            batch size, or the latents cannot be evenly duplicated to the effective batch size.
    """
    if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)):
        raise ValueError(
            f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
        )

    image = image.cast(dtype=dtype)
    batch_size = batch_size * num_images_per_prompt

    if image.shape[1] == 4:
        # Input is used as latents directly: no encoding and no scaling.
        init_latents = image
    else:
        has_generator_list = isinstance(generator, list)
        if has_generator_list and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if has_generator_list:
            # One encode call per sample so each sample uses its own generator.
            per_sample = [
                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
                for i in range(batch_size)
            ]
            init_latents = paddle.concat(per_sample, axis=0)
        else:
            init_latents = retrieve_latents(self.vae.encode(image), generator=generator)

        init_latents = self.vae.config.scaling_factor * init_latents

    if batch_size > init_latents.shape[0]:
        if batch_size % init_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
            )
        # expand init_latents for batch_size
        deprecation_message = (
            f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
            " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
            " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
            " your script to pass as many initial images as text prompts to suppress this warning."
        )
        deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
        copies = batch_size // init_latents.shape[0]
        init_latents = paddle.concat([init_latents] * copies, axis=0)
    else:
        init_latents = paddle.concat([init_latents], axis=0)

    noise = randn_tensor(init_latents.shape, generator=generator, dtype=dtype)

    # get latents
    return self.scheduler.add_noise(init_latents, noise, timestep)
720
+
721
@property
def guidance_scale(self):
    """Return the value stored in `self._guidance_scale`."""
    # NOTE(review): `_guidance_scale` is not assigned in the visible part of the class; presumably set by
    # `__call__` - confirm.
    return self._guidance_scale
724
+
725
@property
def clip_skip(self):
    """Return the value stored in `self._clip_skip`."""
    # NOTE(review): `_clip_skip` is not assigned in the visible part of the class; presumably set by
    # `__call__` - confirm.
    return self._clip_skip
728
+
729
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
730
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
731
+ # corresponds to doing no classifier free guidance.
732
@property
def do_classifier_free_guidance(self):
    """Return `True` when `self._guidance_scale` is greater than 1."""
    return self._guidance_scale > 1
735
+
736
@property
def cross_attention_kwargs(self):
    """Return the value stored in `self._cross_attention_kwargs`."""
    # NOTE(review): `_cross_attention_kwargs` is not assigned in the visible part of the class; presumably
    # set by `__call__` - confirm.
    return self._cross_attention_kwargs
739
+
740
@property
def num_timesteps(self):
    """Return the value stored in `self._num_timesteps`."""
    # NOTE(review): `_num_timesteps` is not assigned in the visible part of the class; presumably set by
    # `__call__` - confirm.
    return self._num_timesteps
743
+
744
+ @paddle.no_grad()
745
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
746
+ def __call__(
747
+ self,
748
+ prompt: Union[str, List[str]] = None,
749
+ image: PipelineImageInput = None,
750
+ control_image: PipelineImageInput = None,
751
+ height: Optional[int] = None,
752
+ width: Optional[int] = None,
753
+ strength: float = 0.8,
754
+ num_inference_steps: int = 50,
755
+ guidance_scale: float = 7.5,
756
+ negative_prompt: Optional[Union[str, List[str]]] = None,
757
+ num_images_per_prompt: Optional[int] = 1,
758
+ eta: float = 0.0,
759
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
760
+ latents: Optional[paddle.Tensor] = None,
761
+ prompt_embeds: Optional[paddle.Tensor] = None,
762
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
763
+ output_type: Optional[str] = "pil",
764
+ return_dict: bool = True,
765
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
766
+ controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
767
+ guess_mode: bool = False,
768
+ control_guidance_start: Union[float, List[float]] = 0.0,
769
+ control_guidance_end: Union[float, List[float]] = 1.0,
770
+ clip_skip: Optional[int] = None,
771
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
772
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
773
+ **kwargs,
774
+ ):
775
+ r"""
776
+ The call function to the pipeline for generation.
777
+
778
+ Args:
779
+ prompt (`str` or `List[str]`, *optional*):
780
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
781
+ image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
782
+ `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
783
+ The initial image to be used as the starting point for the image generation process. Can also accept
784
+ image latents as `image`, and if passing latents directly they are not encoded again.
785
+ control_image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
786
+ `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
787
+ The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
788
+ specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
789
+ accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
790
+ and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
791
+ `init`, images must be passed as a list such that each element of the list can be correctly batched for
792
+ input to a single ControlNet.
793
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
794
+ The height in pixels of the generated image.
795
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
796
+ The width in pixels of the generated image.
797
+ num_inference_steps (`int`, *optional*, defaults to 50):
798
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
799
+ expense of slower inference.
800
+ guidance_scale (`float`, *optional*, defaults to 7.5):
801
+ A higher guidance scale value encourages the model to generate images closely linked to the text
802
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
803
+ negative_prompt (`str` or `List[str]`, *optional*):
804
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
805
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
806
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
807
+ The number of images to generate per prompt.
808
+ eta (`float`, *optional*, defaults to 0.0):
809
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
810
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
811
+ generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
812
+ A [`paddle.Generator`] to make generation deterministic.
813
+
814
+ latents (`paddle.Tensor`, *optional*):
815
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
816
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
817
+ tensor is generated by sampling using the supplied random `generator`.
818
+ prompt_embeds (`paddle.Tensor`, *optional*):
819
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
820
+ provided, text embeddings are generated from the `prompt` input argument.
821
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
822
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
823
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
824
+ output_type (`str`, *optional*, defaults to `"pil"`):
825
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
826
+ return_dict (`bool`, *optional*, defaults to `True`):
827
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
828
+ plain tuple.
829
+ cross_attention_kwargs (`dict`, *optional*):
830
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
831
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
832
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
833
+ The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
834
+ to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
835
+ the corresponding scale as a list.
836
+ guess_mode (`bool`, *optional*, defaults to `False`):
837
+ The ControlNet encoder tries to recognize the content of the input image even if you remove all
838
+ prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
839
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
840
+ The percentage of total steps at which the ControlNet starts applying.
841
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
842
+ The percentage of total steps at which the ControlNet stops applying.
843
+ clip_skip (`int`, *optional*):
844
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
845
+ the output of the pre-final layer will be used for computing the prompt embeddings.
846
+ callback_on_step_end (`Callable`, *optional*):
847
+ A function that calls at the end of each denoising steps during the inference. The function is called
848
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
849
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
850
+ `callback_on_step_end_tensor_inputs`.
851
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
852
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
853
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
854
+ `._callback_tensor_inputs` attribute of your pipeine class.
855
+
856
+ Examples:
857
+
858
+ Returns:
859
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
860
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
861
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
862
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
863
+ "not-safe-for-work" (nsfw) content.
864
+ """
865
+
866
+ callback = kwargs.pop("callback", None)
867
+ callback_steps = kwargs.pop("callback_steps", None)
868
+
869
+ if callback is not None:
870
+ deprecate(
871
+ "callback",
872
+ "1.0.0",
873
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
874
+ )
875
+ if callback_steps is not None:
876
+ deprecate(
877
+ "callback_steps",
878
+ "1.0.0",
879
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
880
+ )
881
+
882
+ controlnet = self.controlnet
883
+
884
+ # align format for control guidance
885
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
886
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
887
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
888
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
889
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
890
+ mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
891
+ control_guidance_start, control_guidance_end = (
892
+ mult * [control_guidance_start],
893
+ mult * [control_guidance_end],
894
+ )
895
+
896
+ # 1. Check inputs. Raise error if not correct
897
+ self.check_inputs(
898
+ prompt,
899
+ control_image,
900
+ callback_steps,
901
+ negative_prompt,
902
+ prompt_embeds,
903
+ negative_prompt_embeds,
904
+ controlnet_conditioning_scale,
905
+ control_guidance_start,
906
+ control_guidance_end,
907
+ callback_on_step_end_tensor_inputs,
908
+ )
909
+
910
+ self._guidance_scale = guidance_scale
911
+ self._clip_skip = clip_skip
912
+ self._cross_attention_kwargs = cross_attention_kwargs
913
+
914
+ # 2. Define call parameters
915
+ if prompt is not None and isinstance(prompt, str):
916
+ batch_size = 1
917
+ elif prompt is not None and isinstance(prompt, list):
918
+ batch_size = len(prompt)
919
+ else:
920
+ batch_size = prompt_embeds.shape[0]
921
+
922
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
923
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
924
+
925
+ global_pool_conditions = (
926
+ controlnet.config.global_pool_conditions
927
+ if isinstance(controlnet, ControlNetModel)
928
+ else controlnet.nets[0].config.global_pool_conditions
929
+ )
930
+ guess_mode = guess_mode or global_pool_conditions
931
+
932
+ # 3. Encode input prompt
933
+ text_encoder_lora_scale = (
934
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
935
+ )
936
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
937
+ prompt,
938
+ num_images_per_prompt,
939
+ self.do_classifier_free_guidance,
940
+ negative_prompt,
941
+ prompt_embeds=prompt_embeds,
942
+ negative_prompt_embeds=negative_prompt_embeds,
943
+ lora_scale=text_encoder_lora_scale,
944
+ clip_skip=self.clip_skip,
945
+ )
946
+ # For classifier free guidance, we need to do two forward passes.
947
+ # Here we concatenate the unconditional and text embeddings into a single batch
948
+ # to avoid doing two forward passes
949
+ if self.do_classifier_free_guidance:
950
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
951
+
952
+ # 4. Prepare image
953
+ image = self.image_processor.preprocess(image, height=height, width=width).cast(dtype=paddle.float32)
954
+
955
+ # 5. Prepare controlnet_conditioning_image
956
+ if isinstance(controlnet, ControlNetModel):
957
+ control_image = self.prepare_control_image(
958
+ image=control_image,
959
+ width=width,
960
+ height=height,
961
+ batch_size=batch_size * num_images_per_prompt,
962
+ num_images_per_prompt=num_images_per_prompt,
963
+ dtype=controlnet.dtype,
964
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
965
+ guess_mode=guess_mode,
966
+ )
967
+ elif isinstance(controlnet, MultiControlNetModel):
968
+ control_images = []
969
+
970
+ for control_image_ in control_image:
971
+ control_image_ = self.prepare_control_image(
972
+ image=control_image_,
973
+ width=width,
974
+ height=height,
975
+ batch_size=batch_size * num_images_per_prompt,
976
+ num_images_per_prompt=num_images_per_prompt,
977
+ dtype=controlnet.dtype,
978
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
979
+ guess_mode=guess_mode,
980
+ )
981
+
982
+ control_images.append(control_image_)
983
+
984
+ control_image = control_images
985
+ else:
986
+ assert False
987
+
988
+ # 5. Prepare timesteps
989
+ self.scheduler.set_timesteps(num_inference_steps)
990
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
991
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
992
+ self._num_timesteps = len(timesteps)
993
+
994
+ # 6. Prepare latent variables
995
+ latents = self.prepare_latents(
996
+ image,
997
+ latent_timestep,
998
+ batch_size,
999
+ num_images_per_prompt,
1000
+ prompt_embeds.dtype,
1001
+ generator,
1002
+ )
1003
+
1004
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1005
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1006
+
1007
+ # 7.1 Create tensor stating which controlnets to keep
1008
+ controlnet_keep = []
1009
+ for i in range(len(timesteps)):
1010
+ keeps = [
1011
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
1012
+ for s, e in zip(control_guidance_start, control_guidance_end)
1013
+ ]
1014
+ controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
1015
+
1016
+ # 8. Denoising loop
1017
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1018
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1019
+ for i, t in enumerate(timesteps):
1020
+ # expand the latents if we are doing classifier free guidance
1021
+ latent_model_input = paddle.concat([latents] * 2) if self.do_classifier_free_guidance else latents
1022
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1023
+
1024
+ # controlnet(s) inference
1025
+ if guess_mode and self.do_classifier_free_guidance:
1026
+ # Infer ControlNet only for the conditional batch.
1027
+ control_model_input = latents
1028
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
1029
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
1030
+ else:
1031
+ control_model_input = latent_model_input
1032
+ controlnet_prompt_embeds = prompt_embeds
1033
+
1034
+ if isinstance(controlnet_keep[i], list):
1035
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
1036
+ else:
1037
+ controlnet_cond_scale = controlnet_conditioning_scale
1038
+ if isinstance(controlnet_cond_scale, list):
1039
+ controlnet_cond_scale = controlnet_cond_scale[0]
1040
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
1041
+
1042
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
1043
+ control_model_input,
1044
+ t,
1045
+ encoder_hidden_states=controlnet_prompt_embeds,
1046
+ controlnet_cond=control_image,
1047
+ conditioning_scale=cond_scale,
1048
+ guess_mode=guess_mode,
1049
+ return_dict=False,
1050
+ )
1051
+
1052
+ if guess_mode and self.do_classifier_free_guidance:
1053
+ # Infered ControlNet only for the conditional batch.
1054
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
1055
+ # add 0 to the unconditional batch to keep it unchanged.
1056
+ down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples]
1057
+ mid_block_res_sample = paddle.concat(
1058
+ [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]
1059
+ )
1060
+
1061
+ # predict the noise residual
1062
+ noise_pred = self.unet(
1063
+ latent_model_input,
1064
+ t,
1065
+ encoder_hidden_states=prompt_embeds,
1066
+ cross_attention_kwargs=self.cross_attention_kwargs,
1067
+ down_block_additional_residuals=down_block_res_samples,
1068
+ mid_block_additional_residual=mid_block_res_sample,
1069
+ return_dict=False,
1070
+ )[0]
1071
+
1072
+ # perform guidance
1073
+ if self.do_classifier_free_guidance:
1074
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1075
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
1076
+
1077
+ # compute the previous noisy sample x_t -> x_t-1
1078
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1079
+
1080
+ if callback_on_step_end is not None:
1081
+ callback_kwargs = {}
1082
+ for k in callback_on_step_end_tensor_inputs:
1083
+ callback_kwargs[k] = locals()[k]
1084
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1085
+
1086
+ latents = callback_outputs.pop("latents", latents)
1087
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1088
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1089
+
1090
+ # call the callback, if provided
1091
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1092
+ progress_bar.update()
1093
+ if callback is not None and i % callback_steps == 0:
1094
+ step_idx = i // getattr(self.scheduler, "order", 1)
1095
+ callback(step_idx, t, latents)
1096
+
1097
+ if not output_type == "latent":
1098
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
1099
+ 0
1100
+ ]
1101
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
1102
+ else:
1103
+ image = latents
1104
+ has_nsfw_concept = None
1105
+
1106
+ if has_nsfw_concept is None:
1107
+ do_denormalize = [True] * image.shape[0]
1108
+ else:
1109
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1110
+
1111
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
1112
+
1113
+ if not return_dict:
1114
+ return (image, has_nsfw_concept)
1115
+
1116
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py ADDED
@@ -0,0 +1,1428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # This model implementation is heavily inspired by https://github.com/haofanwang/ControlNet-for-Diffusers/
16
+
17
+ import inspect
18
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
19
+
20
+ import numpy as np
21
+ import paddle
22
+ import PIL.Image
23
+
24
+ from ppdiffusers.transformers import (
25
+ CLIPImageProcessor,
26
+ CLIPTextModel,
27
+ CLIPTokenizer,
28
+ CLIPVisionModelWithProjection,
29
+ )
30
+
31
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
32
+ from ...loaders import (
33
+ FromSingleFileMixin,
34
+ IPAdapterMixin,
35
+ LoraLoaderMixin,
36
+ TextualInversionLoaderMixin,
37
+ )
38
+ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
39
+ from ...models.lora import adjust_lora_scale_text_encoder
40
+ from ...schedulers import KarrasDiffusionSchedulers
41
+ from ...utils import USE_PEFT_BACKEND, deprecate, logging, replace_example_docstring
42
+ from ...utils.paddle_utils import randn_tensor
43
+ from ..pipeline_utils import DiffusionPipeline
44
+ from ..stable_diffusion import StableDiffusionPipelineOutput
45
+ from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
46
+ from .multicontrolnet import MultiControlNetModel
47
+
48
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
49
+
50
+
51
# Usage example for the ControlNet inpainting pipeline.
# NOTE(review): presumably spliced into the `__call__` docstring through
# `replace_example_docstring` -- confirm at the decorator site.
# The snippet imports `cv2` and `PIL.Image` itself because `make_canny_condition`
# uses both; without those imports the example fails with NameError when pasted.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> # !pip install paddlenlp ppdiffusers opencv-python
        >>> from ppdiffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, DDIMScheduler
        >>> from ppdiffusers.utils import load_image
        >>> import cv2
        >>> import numpy as np
        >>> import paddle
        >>> from PIL import Image

        >>> init_image = load_image(
        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png"
        ... )
        >>> init_image = init_image.resize((512, 512))

        >>> generator = paddle.Generator().manual_seed(1)

        >>> mask_image = load_image(
        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png"
        ... )
        >>> mask_image = mask_image.resize((512, 512))


        >>> def make_canny_condition(image):
        ...     image = np.array(image)
        ...     image = cv2.Canny(image, 100, 200)
        ...     image = image[:, :, None]
        ...     image = np.concatenate([image, image, image], axis=2)
        ...     image = Image.fromarray(image)
        ...     return image


        >>> control_image = make_canny_condition(init_image)

        >>> controlnet = ControlNetModel.from_pretrained(
        ...     "lllyasviel/control_v11p_sd15_inpaint", paddle_dtype=paddle.float16
        ... )
        >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
        ...     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, paddle_dtype=paddle.float16
        ... )

        >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        >>> pipe.enable_model_cpu_offload()

        >>> # generate image
        >>> image = pipe(
        ...     "a handsome man with ray-ban sunglasses",
        ...     num_inference_steps=20,
        ...     generator=generator,
        ...     eta=1.0,
        ...     image=init_image,
        ...     mask_image=mask_image,
        ...     control_image=control_image,
        ... ).images[0]
        ```
"""
106
+
107
+
108
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
109
def retrieve_latents(
    encoder_output: paddle.Tensor, generator: Optional[paddle.Generator] = None, sample_mode: str = "sample"
):
    """Extract latents from an encoder output object.

    Lookup order:
      1. ``encoder_output.latent_dist.sample(generator)`` when the object has a
         ``latent_dist`` attribute and ``sample_mode == "sample"``.
      2. ``encoder_output.latent_dist.mode()`` when it has ``latent_dist`` and
         ``sample_mode == "argmax"``.
      3. ``encoder_output.latents`` when that attribute exists.

    Args:
        encoder_output: Object exposing ``latent_dist`` and/or ``latents``.
        generator: Forwarded to ``latent_dist.sample``; unused otherwise.
        sample_mode: ``"sample"`` or ``"argmax"``; any other value skips the
            ``latent_dist`` paths and falls through to ``latents``.

    Raises:
        AttributeError: If none of the lookups above applies.
    """
    has_distribution = hasattr(encoder_output, "latent_dist")

    if has_distribution and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)

    if has_distribution and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()

    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")
120
+
121
+
122
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.prepare_mask_and_masked_image
123
def prepare_mask_and_masked_image(image, mask, height, width, return_image=False):
    """
    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
    converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
    ``image`` and ``1`` for the ``mask``.

    The ``image`` will be converted to ``paddle.float32`` and normalized to be in ``[-1, 1]`` (tensor inputs are only
    range-checked, not rescaled). The ``mask`` will be binarized (values ``>= 0.5`` become ``1``, the rest ``0``). PIL
    masks are converted to ``float32``; tensor and ``np.array`` masks keep their own dtype.

    A tensor ``mask`` passed by the caller is never modified in place; the binarized mask is a new tensor.

    Args:
        image (Union[np.array, PIL.Image, paddle.Tensor]): The image to inpaint.
            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
            ``paddle.Tensor`` or a ``batch x channels x height x width`` ``paddle.Tensor``.
        mask (Union[np.array, PIL.Image, paddle.Tensor]): The mask to apply to the image, i.e. regions to inpaint.
            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
            ``paddle.Tensor`` or a ``batch x 1 x height x width`` ``paddle.Tensor``.
        height (int): Target height; only used to resize ``PIL.Image`` inputs.
        width (int): Target width; only used to resize ``PIL.Image`` inputs.
        return_image (bool): When ``True`` the preprocessed image is returned as a third element.


    Raises:
        ValueError: ``image`` or ``mask`` is ``None``. ValueError: ``paddle.Tensor`` images should be in the
            ``[-1, 1]`` range. ValueError: ``paddle.Tensor`` mask should be in the ``[0, 1]`` range.
        AssertionError: tensor ``mask`` and ``image`` do not end up 4-dimensional with the same batch size and
            spatial dimensions.
        TypeError: ``mask`` is a ``paddle.Tensor`` but ``image`` is not
            (or the other way around).

    Returns:
        tuple[paddle.Tensor]: The pair (mask, masked_image) as ``paddle.Tensor`` with 4
            dimensions: ``batch x channels x height x width``. When ``return_image`` is ``True`` the tuple is
            (mask, masked_image, image).
    """
    deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead"
    deprecate(
        "prepare_mask_and_masked_image",
        "0.30.0",
        deprecation_message,
    )
    if image is None:
        raise ValueError("`image` input cannot be undefined.")

    if mask is None:
        raise ValueError("`mask_image` input cannot be undefined.")

    if isinstance(image, paddle.Tensor):
        if not isinstance(mask, paddle.Tensor):
            raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)}) is not")

        # Batch single image
        if image.ndim == 3:
            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
            image = image.unsqueeze(0)

        # Batch and add channel dim for single mask
        if mask.ndim == 2:
            mask = mask.unsqueeze(0).unsqueeze(0)

        # Batch single mask or add channel dim
        if mask.ndim == 3:
            # Single batched mask, no channel dim or single mask not batched but channel dim
            if mask.shape[0] == 1:
                mask = mask.unsqueeze(0)

            # Batched masks no channel dim
            else:
                mask = mask.unsqueeze(1)

        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"

        # Check image is in [-1, 1]
        if image.min() < -1 or image.max() > 1:
            raise ValueError("Image should be in [-1, 1] range")

        # Check mask is in [0, 1]
        if mask.min() < 0 or mask.max() > 1:
            raise ValueError("Mask should be in [0, 1] range")

        # Binarize mask into a NEW tensor: an already 4-D mask is still the caller's
        # object at this point, so item assignment would silently overwrite their data.
        mask = (mask >= 0.5).cast(mask.dtype)

        # Image as float32
        image = image.cast(dtype=paddle.float32)
    elif isinstance(mask, paddle.Tensor):
        raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)}) is not")
    else:
        # preprocess image
        if isinstance(image, (PIL.Image.Image, np.ndarray)):
            image = [image]
        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
            # resize all images w.r.t. passed height and width
            image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
            image = [np.array(i.convert("RGB"))[None, :] for i in image]
            image = np.concatenate(image, axis=0)
        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
            image = np.concatenate([i[None, :] for i in image], axis=0)

        # NHWC -> NCHW, then map [0, 255] to [-1, 1]
        image = image.transpose(0, 3, 1, 2)
        image = paddle.to_tensor(image, dtype="float32") / 127.5 - 1.0

        # preprocess mask
        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
            mask = [mask]

        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
            mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
            mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
            mask = mask.astype(np.float32) / 255.0
        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)

        # In-place is safe here: `np.concatenate` above produced a fresh array.
        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1
        mask = paddle.to_tensor(mask)

    # Zero out the region to inpaint (mask == 1) and keep the rest of the image.
    masked_image = image * (mask < 0.5).cast(image.dtype)

    # n.b. ensure backwards compatibility as old function does not return image
    if return_image:
        return mask, masked_image, image

    return mask, masked_image
243
+
244
+
245
+ class StableDiffusionControlNetInpaintPipeline(
246
+ DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
247
+ ):
248
+ r"""
249
+ Pipeline for image inpainting using Stable Diffusion with ControlNet guidance.
250
+
251
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
252
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
253
+
254
+ The pipeline also inherits the following loading methods:
255
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
256
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
257
+
258
+ <Tip>
259
+
260
+ This pipeline can be used with checkpoints that have been specifically fine-tuned for inpainting
261
+ ([runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting)) as well as
262
+ default text-to-image Stable Diffusion checkpoints
263
+ ([runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)). Default text-to-image
264
+ Stable Diffusion checkpoints might be preferable for ControlNets that have been fine-tuned on those, such as
265
+ [lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint).
266
+
267
+ </Tip>
268
+
269
+ Args:
270
+ vae ([`AutoencoderKL`]):
271
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
272
+ text_encoder ([`~transformers.CLIPTextModel`]):
273
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
274
+ tokenizer ([`~transformers.CLIPTokenizer`]):
275
+ A `CLIPTokenizer` to tokenize text.
276
+ unet ([`UNet2DConditionModel`]):
277
+ A `UNet2DConditionModel` to denoise the encoded image latents.
278
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
279
+ Provides additional conditioning to the `unet` during the denoising process. If you set multiple
280
+ ControlNets as a list, the outputs from each ControlNet are added together to create one combined
281
+ additional conditioning.
282
+ scheduler ([`SchedulerMixin`]):
283
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
284
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
285
+ safety_checker ([`StableDiffusionSafetyChecker`]):
286
+ Classification module that estimates whether generated images could be considered offensive or harmful.
287
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
288
+ about a model's potential harms.
289
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
290
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
291
+ """
292
+
293
+ model_cpu_offload_seq = "text_encoder->unet->vae"
294
+ _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
295
+ _exclude_from_cpu_offload = ["safety_checker"]
296
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
297
+
298
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
    scheduler: KarrasDiffusionSchedulers,
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPImageProcessor,
    image_encoder: CLIPVisionModelWithProjection = None,
    requires_safety_checker: bool = True,
):
    """Register the pipeline components and build the image/mask/control pre-processors.

    Args:
        vae, text_encoder, tokenizer, unet, scheduler: Registered unchanged via `register_modules`.
        controlnet: A single ControlNet, a `MultiControlNetModel`, or a list/tuple of ControlNets;
            a list/tuple is wrapped in `MultiControlNetModel` before registration.
        safety_checker: May be `None`; a warning is logged when it is `None` while
            `requires_safety_checker` is `True`.
        feature_extractor: Must not be `None` when `safety_checker` is provided.
        image_encoder: Optional component, registered as-is.
        requires_safety_checker: Stored in the pipeline config.

    Raises:
        ValueError: If `safety_checker` is given without a `feature_extractor`.
    """
    super().__init__()

    if safety_checker is None and requires_safety_checker:
        logger.warning(
            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
        )

    if safety_checker is not None and feature_extractor is None:
        # The message must be an f-string, otherwise `{self.__class__}` is printed literally.
        raise ValueError(
            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
        )

    # Several ControlNets are driven through a single MultiControlNetModel wrapper.
    if isinstance(controlnet, (list, tuple)):
        controlnet = MultiControlNetModel(controlnet)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        controlnet=controlnet,
        scheduler=scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
        image_encoder=image_encoder,
    )
    # Scale factor derived from the VAE config: 2 ** (number of block_out_channels - 1).
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    # Masks: converted to grayscale and binarized, without normalization.
    self.mask_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
    )
    # Control images: converted to RGB, without normalization.
    self.control_image_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
    )
    self.register_to_config(requires_safety_checker=requires_safety_checker)
352
+
353
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
354
def _encode_prompt(
    self,
    prompt,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    lora_scale: Optional[float] = None,
    **kwargs,
):
    """
    Deprecated wrapper around `encode_prompt`.

    Emits a deprecation notice, forwards every argument to `encode_prompt`, and returns the two
    resulting embeddings concatenated into one tensor, with element 1 of the tuple placed before
    element 0.
    """
    deprecate(
        "_encode_prompt()",
        "1.0.0",
        "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple.",
        standard_warn=False,
    )

    embeds_pair = self.encode_prompt(
        prompt=prompt,
        num_images_per_prompt=num_images_per_prompt,
        do_classifier_free_guidance=do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=lora_scale,
        **kwargs,
    )

    # Legacy layout: second tuple element first, then the first element.
    return paddle.concat([embeds_pair[1], embeds_pair[0]])
383
+
384
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
385
def encode_prompt(
    self,
    prompt,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    lora_scale: Optional[float] = None,
    clip_skip: Optional[int] = None,
):
    r"""
    Turn a text prompt (and optionally a negative prompt) into text-encoder embeddings.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            Prompt to encode. When it is neither a string nor a list, the batch size is read from
            `prompt_embeds`.
        num_images_per_prompt (`int`):
            Number of images generated per prompt; embeddings are repeated accordingly.
        do_classifier_free_guidance (`bool`):
            Whether negative (unconditional) embeddings must be produced.
        negative_prompt (`str` or `List[str]`, *optional*):
            Prompt(s) not to guide the generation. Only consulted when guidance is enabled and
            `negative_prompt_embeds` is not given; an empty string per batch item is used when it is `None`.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-computed text embeddings. When given, `prompt` is not tokenized or encoded.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-computed negative embeddings. When given, `negative_prompt` is not encoded.
        lora_scale (`float`, *optional*):
            LoRA scale recorded on the pipeline (and applied to the text encoder when the PEFT backend
            is not in use) if the pipeline is a `LoraLoaderMixin`.
        clip_skip (`int`, *optional*):
            When set, the hidden state `clip_skip + 1` layers from the end is used and passed through the
            text model's final layer norm instead of taking the encoder's first output.

    Returns:
        Tuple `(prompt_embeds, negative_prompt_embeds)`. The second element is returned unchanged
        (possibly `None`) when guidance is disabled.
    """
    # Record the LoRA scale so the patched text-encoder LoRA layers can read it.
    if lora_scale is not None and isinstance(self, LoraLoaderMixin):
        self._lora_scale = lora_scale

        # Without the PEFT backend the scale has to be pushed into the text encoder directly.
        if not USE_PEFT_BACKEND:
            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)

    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        # Textual inversion: expand multi-vector tokens when the pipeline supports it.
        if isinstance(self, TextualInversionLoaderMixin):
            prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pd",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids

        # Tell the user which part of the prompt did not fit into the tokenizer window.
        prompt_was_truncated = untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
            text_input_ids, untruncated_ids
        )
        if prompt_was_truncated:
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        uses_attention_mask = getattr(self.text_encoder.config, "use_attention_mask", False)
        attention_mask = text_inputs.attention_mask if uses_attention_mask else None

        if clip_skip is None:
            prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0]
        else:
            encoder_outputs = self.text_encoder(
                text_input_ids, attention_mask=attention_mask, output_hidden_states=True
            )
            # The last output holds the per-layer hidden states; pick the requested layer counted
            # from the end.
            skipped_hidden_state = encoder_outputs[-1][-(clip_skip + 1)]
            # Apply the final layer norm as well, since the default output path goes through it.
            prompt_embeds = self.text_encoder.text_model.final_layer_norm(skipped_hidden_state)

    if self.text_encoder is not None:
        prompt_embeds_dtype = self.text_encoder.dtype
    elif self.unet is not None:
        prompt_embeds_dtype = self.unet.dtype
    else:
        prompt_embeds_dtype = prompt_embeds.dtype

    prompt_embeds = prompt_embeds.cast(dtype=prompt_embeds_dtype)

    # Repeat the embeddings once per requested image for every prompt.
    bs_embed, seq_len, _ = prompt_embeds.shape
    prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
    prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])

    # Build the unconditional embeddings needed for classifier-free guidance.
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        if negative_prompt is None:
            negative_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            negative_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            negative_tokens = negative_prompt

        # Textual inversion: expand multi-vector tokens when the pipeline supports it.
        if isinstance(self, TextualInversionLoaderMixin):
            negative_tokens = self.maybe_convert_prompt(negative_tokens, self.tokenizer)

        # Pad the negative prompt to the same sequence length as the positive embeddings.
        uncond_input = self.tokenizer(
            negative_tokens,
            padding="max_length",
            max_length=prompt_embeds.shape[1],
            truncation=True,
            return_tensors="pd",
        )

        uses_attention_mask = getattr(self.text_encoder.config, "use_attention_mask", False)
        attention_mask = uncond_input.attention_mask if uses_attention_mask else None

        negative_prompt_embeds = self.text_encoder(
            uncond_input.input_ids,
            attention_mask=attention_mask,
        )[0]

    if do_classifier_free_guidance:
        # Repeat the unconditional embeddings once per requested image for every prompt.
        negative_seq_len = negative_prompt_embeds.shape[1]
        negative_prompt_embeds = negative_prompt_embeds.cast(dtype=prompt_embeds_dtype)
        negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
        negative_prompt_embeds = negative_prompt_embeds.reshape(
            [batch_size * num_images_per_prompt, negative_seq_len, -1]
        )

    return prompt_embeds, negative_prompt_embeds
556
+
557
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
558
def encode_image(self, image, num_images_per_prompt):
    """
    Encode an image with `self.image_encoder`.

    Inputs that are not `paddle.Tensor` are first converted with `self.feature_extractor`. Returns
    `(image_embeds, uncond_image_embeds)`, where the embeddings are repeated `num_images_per_prompt`
    times along axis 0 and the unconditional embeddings are zeros of the same shape.
    """
    # Use the dtype of the encoder's first parameter for the input.
    encoder_dtype = next(self.image_encoder.named_parameters())[1].dtype

    if not isinstance(image, paddle.Tensor):
        image = self.feature_extractor(image, return_tensors="pd").pixel_values

    pixel_values = image.cast(dtype=encoder_dtype)
    embeds = self.image_encoder(pixel_values).image_embeds
    embeds = embeds.repeat_interleave(num_images_per_prompt, axis=0)

    return embeds, paddle.zeros_like(embeds)
570
+
571
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
572
def run_safety_checker(self, image, dtype):
    """
    Run the optional safety checker on `image`.

    Returns `(image, has_nsfw_concept)`. When no safety checker is registered the image is returned
    untouched and `has_nsfw_concept` is `None`.
    """
    if self.safety_checker is None:
        return image, None

    # The feature extractor is fed PIL images, whatever form `image` comes in.
    if paddle.is_tensor(image):
        pil_images = self.image_processor.postprocess(image, output_type="pil")
    else:
        pil_images = self.image_processor.numpy_to_pil(image)

    checker_features = self.feature_extractor(pil_images, return_tensors="pd")
    clip_input = checker_features.pixel_values.cast(dtype=dtype)
    image, has_nsfw_concept = self.safety_checker(images=image, clip_input=clip_input)
    return image, has_nsfw_concept
585
+
586
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
587
def decode_latents(self, latents):
    """
    Deprecated: decode latents with the VAE into a NumPy image batch.

    The latents are divided by the VAE scaling factor, decoded, mapped through `x / 2 + 0.5`, clipped
    to `[0, 1]`, cast to float32 and transposed with `[0, 2, 3, 1]` before conversion to NumPy.
    """
    deprecate(
        "decode_latents",
        "1.0.0",
        "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead",
        standard_warn=False,
    )

    unscaled_latents = 1 / self.vae.config.scaling_factor * latents
    decoded = self.vae.decode(unscaled_latents, return_dict=False)[0]
    decoded = (decoded / 2 + 0.5).clip(0, 1)
    # Always cast to float32: the overhead is negligible and it is compatible with bfloat16.
    return decoded.cast(dtype=paddle.float32).transpose([0, 2, 3, 1]).cpu().numpy()
597
+
598
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
599
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Collect the optional keyword arguments accepted by `self.scheduler.step`.

    Schedulers do not share one signature, so `eta` and `generator` are only included when the
    scheduler's `step` declares a parameter of that name. `eta` (η) corresponds to η in the DDIM
    paper (https://arxiv.org/abs/2010.02502) and should be between [0, 1].
    """
    accepted_params = set(inspect.signature(self.scheduler.step).parameters.keys())
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in accepted_params}
615
+
616
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
617
def get_timesteps(self, num_inference_steps, strength):
    """
    Select the tail of the scheduler's timesteps that corresponds to `strength`.

    Returns `(timesteps, steps)`, where `timesteps` is `self.scheduler.timesteps` sliced from
    `t_start * self.scheduler.order` onward and `steps` is `num_inference_steps - t_start`.
    """
    # Number of steps to run, capped at the requested total.
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    first_step = max(num_inference_steps - steps_to_run, 0)

    remaining_timesteps = self.scheduler.timesteps[first_step * self.scheduler.order :]
    return remaining_timesteps, num_inference_steps - first_step
625
+
626
def check_inputs(
    self,
    prompt,
    image,
    height,
    width,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    controlnet_conditioning_scale=1.0,
    control_guidance_start=0.0,
    control_guidance_end=1.0,
    callback_on_step_end_tensor_inputs=None,
):
    """
    Validate the user-facing arguments of the pipeline call.

    Args:
        prompt: `str`, `list` or `None`; exactly one of `prompt` / `prompt_embeds` must be given.
        image: Conditioning image(s), validated through `check_image`. Must be a list with one entry
            per ControlNet when the pipeline holds a `MultiControlNetModel`.
        height, width: Must each be divisible by 8 when not `None`.
        callback_steps: Positive integer when not `None`.
        negative_prompt, negative_prompt_embeds: Mutually exclusive.
        prompt_embeds: Must match the shape of `negative_prompt_embeds` when both are given.
        controlnet_conditioning_scale: `float` for a single ControlNet; for multiple ControlNets a
            flat list must have one entry per ControlNet.
        control_guidance_start, control_guidance_end: Scalars or lists/tuples. Scalars are
            broadcast to lists here, so the scalar defaults of this method are valid input.
        callback_on_step_end_tensor_inputs: Names that must all be in `self._callback_tensor_inputs`.

    Raises:
        ValueError: For out-of-range or inconsistent values.
        TypeError: For arguments of an unsupported type.
    """
    if height is not None and height % 8 != 0 or width is not None and width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if callback_on_step_end_tensor_inputs is not None and not all(
        k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
    ):
        raise ValueError(
            f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )

    # `prompt` needs more sophisticated handling when there are multiple
    # conditionings.
    if isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(prompt, list):
            logger.warning(
                f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
                " prompts. The conditionings will be fixed across the prompts."
            )

    # Check `image`
    if isinstance(self.controlnet, ControlNetModel):
        self.check_image(image, prompt, prompt_embeds)
    elif isinstance(self.controlnet, MultiControlNetModel):
        if not isinstance(image, list):
            raise TypeError("For multiple controlnets: `image` must be type `list`")

        # When `image` is a nested list:
        # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
        elif any(isinstance(i, list) for i in image):
            raise ValueError("A single batch of multiple conditionings are supported at the moment.")
        elif len(image) != len(self.controlnet.nets):
            raise ValueError(
                f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
            )

        for image_ in image:
            self.check_image(image_, prompt, prompt_embeds)
    else:
        assert False

    # Check `controlnet_conditioning_scale`
    if isinstance(self.controlnet, ControlNetModel):
        if not isinstance(controlnet_conditioning_scale, float):
            raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
    elif isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(controlnet_conditioning_scale, list):
            if any(isinstance(i, list) for i in controlnet_conditioning_scale):
                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
        elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
            self.controlnet.nets
        ):
            raise ValueError(
                "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
                " the same length as the number of controlnets"
            )
    else:
        assert False

    # The scalar defaults (0.0 / 1.0) used to crash on `len()` below. Broadcast scalars so that
    # both values are sequences of matching length before they are compared. Sequences passed by
    # the caller are left untouched.
    start_is_sequence = isinstance(control_guidance_start, (list, tuple))
    end_is_sequence = isinstance(control_guidance_end, (list, tuple))
    if not start_is_sequence and end_is_sequence:
        control_guidance_start = len(control_guidance_end) * [control_guidance_start]
    elif start_is_sequence and not end_is_sequence:
        control_guidance_end = len(control_guidance_start) * [control_guidance_end]
    elif not start_is_sequence and not end_is_sequence:
        num_controlnets = len(self.controlnet.nets) if isinstance(self.controlnet, MultiControlNetModel) else 1
        control_guidance_start = num_controlnets * [control_guidance_start]
        control_guidance_end = num_controlnets * [control_guidance_end]

    if len(control_guidance_start) != len(control_guidance_end):
        raise ValueError(
            f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
        )

    if isinstance(self.controlnet, MultiControlNetModel):
        if len(control_guidance_start) != len(self.controlnet.nets):
            raise ValueError(
                f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
            )

    for start, end in zip(control_guidance_start, control_guidance_end):
        if start >= end:
            raise ValueError(
                f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
            )
        if start < 0.0:
            raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
        if end > 1.0:
            raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
751
+
752
+ # Copied from ppdiffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
753
def check_image(self, image, prompt, prompt_embeds):
    """
    Validate the type of `image` and its batch size against the prompt batch size.

    Args:
        image: A PIL image, NumPy array, paddle tensor, or a non-empty list of one of those.
        prompt: `str`, `list` or `None`; used to derive the prompt batch size.
        prompt_embeds: Tensor whose first dimension is used as the prompt batch size when `prompt`
            is neither a string nor a list.

    Raises:
        TypeError: If `image` is not one of the supported types (this now includes an empty list,
            which previously failed with an `IndexError`).
        ValueError: If the image batch size is not 1 and either differs from the prompt batch size
            or the prompt batch size cannot be determined (previously an `UnboundLocalError`).
    """
    image_is_pil = isinstance(image, PIL.Image.Image)
    image_is_tensor = isinstance(image, paddle.Tensor)
    image_is_np = isinstance(image, np.ndarray)
    # Only look at the first element when there is one; an empty list matches no list kind.
    image_is_filled_list = isinstance(image, list) and len(image) > 0
    image_is_pil_list = image_is_filled_list and isinstance(image[0], PIL.Image.Image)
    image_is_tensor_list = image_is_filled_list and isinstance(image[0], paddle.Tensor)
    image_is_np_list = image_is_filled_list and isinstance(image[0], np.ndarray)

    if (
        not image_is_pil
        and not image_is_tensor
        and not image_is_np
        and not image_is_pil_list
        and not image_is_tensor_list
        and not image_is_np_list
    ):
        raise TypeError(
            f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}"
        )

    if image_is_pil:
        image_batch_size = 1
    else:
        image_batch_size = len(image)

    if prompt is not None and isinstance(prompt, str):
        prompt_batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        prompt_batch_size = len(prompt)
    elif prompt_embeds is not None:
        prompt_batch_size = prompt_embeds.shape[0]
    else:
        # No usable source for the prompt batch size.
        prompt_batch_size = None

    # A single image is accepted for any prompt batch size, so it needs no comparison.
    if image_batch_size != 1:
        if prompt_batch_size is None:
            raise ValueError(
                "Cannot validate the image batch size: neither `prompt` (`str` or `list`) nor `prompt_embeds`"
                f" was provided to infer the prompt batch size. image batch size: {image_batch_size}"
            )
        if image_batch_size != prompt_batch_size:
            raise ValueError(
                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
            )
789
+
790
+ # Copied from ppdiffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image
791
def prepare_control_image(
    self,
    image,
    width,
    height,
    batch_size,
    num_images_per_prompt,
    dtype,
    do_classifier_free_guidance=False,
    guess_mode=False,
):
    """
    Preprocess a control image and expand it to the batch it will be used with.

    The image goes through `self.control_image_processor.preprocess` at the given size and is cast
    to float32. It is then repeated along axis 0: `batch_size` times when it holds a single image,
    otherwise `num_images_per_prompt` times. Finally it is cast to `dtype` and doubled when
    classifier-free guidance is on and `guess_mode` is off.
    """
    control = self.control_image_processor.preprocess(image, height=height, width=width)
    control = control.cast(dtype=paddle.float32)

    # A single image serves the whole batch; otherwise the image batch already matches the
    # prompt batch and only needs repeating per generated image.
    repeats = batch_size if control.shape[0] == 1 else num_images_per_prompt
    control = control.repeat_interleave(repeats, axis=0)
    control = control.cast(dtype=dtype)

    if do_classifier_free_guidance and not guess_mode:
        control = paddle.concat([control] * 2)

    return control
819
+
820
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_latents
821
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    generator,
    latents=None,
    image=None,
    timestep=None,
    is_strength_max=True,
    return_noise=False,
    return_image_latents=False,
):
    """
    Build the initial latents for the denoising loop.

    When `latents` is `None`, noise is sampled with `randn_tensor`; at maximum strength the noise is
    scaled by the scheduler's `init_noise_sigma`, otherwise it is mixed into the image latents via
    `self.scheduler.add_noise`. When `latents` is given, it is cast to `dtype`, used as the noise and
    scaled by `init_noise_sigma`.

    Returns:
        A tuple starting with the latents, followed by the noise when `return_noise` is set and by
        the image latents when `return_image_latents` is set.

    Raises:
        ValueError: If a generator list does not match `batch_size`, or if strength is below the
            maximum and `image` or `timestep` is missing.
    """
    latent_height = height // self.vae_scale_factor
    latent_width = width // self.vae_scale_factor
    shape = (batch_size, num_channels_latents, latent_height, latent_width)

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if not is_strength_max and (image is None or timestep is None):
        raise ValueError(
            "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
            "However, either the image or the noise timestep has not been provided."
        )

    needs_image_latents = return_image_latents or (latents is None and not is_strength_max)
    if needs_image_latents:
        image = image.cast(dtype=dtype)

        # Four channels: the input is treated as latents already and is not encoded again.
        if image.shape[1] == 4:
            image_latents = image
        else:
            image_latents = self._encode_vae_image(image=image, generator=generator)
        image_latents = image_latents.tile([batch_size // image_latents.shape[0], 1, 1, 1])

    if latents is None:
        noise = randn_tensor(shape, generator=generator, dtype=dtype)
        if is_strength_max:
            # Pure noise start: scale by the scheduler's initial sigma.
            latents = noise * self.scheduler.init_noise_sigma
        else:
            # Partial strength: start from the noised image latents.
            latents = self.scheduler.add_noise(image_latents, noise, timestep)
    else:
        noise = latents.cast(dtype)
        latents = noise * self.scheduler.init_noise_sigma

    results = [latents]
    if return_noise:
        results.append(noise)
    if return_image_latents:
        results.append(image_latents)

    return tuple(results)
877
+
878
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents
879
def prepare_mask_latents(
    self, mask, masked_image, batch_size, height, width, dtype, generator, do_classifier_free_guidance
):
    """
    Resize the mask to latent resolution and obtain the latents of the masked image.

    Both tensors are tiled up to `batch_size` when they hold fewer entries, and doubled when
    classifier-free guidance is enabled.

    Returns:
        `(mask, masked_image_latents)`.

    Raises:
        ValueError: If `batch_size` is not a multiple of the number of masks or masked images.
    """
    # Resize before the dtype cast, to avoid breaking when cpu_offload and half precision are used.
    latent_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
    mask = paddle.nn.functional.interpolate(mask, size=latent_size)
    mask = mask.cast(dtype=dtype)

    masked_image = masked_image.cast(dtype=dtype)

    # Four channels: the masked image is treated as latents already.
    if masked_image.shape[1] == 4:
        masked_image_latents = masked_image
    else:
        masked_image_latents = self._encode_vae_image(masked_image, generator=generator)

    # Tile mask and masked-image latents up to the requested batch size.
    if mask.shape[0] < batch_size:
        if batch_size % mask.shape[0] != 0:
            raise ValueError(
                "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                " of masks that you pass is divisible by the total requested batch size."
            )
        mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1])
    if masked_image_latents.shape[0] < batch_size:
        if batch_size % masked_image_latents.shape[0] != 0:
            raise ValueError(
                "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
                " Make sure the number of images that you pass is divisible by the total requested batch size."
            )
        masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1])

    if do_classifier_free_guidance:
        mask = paddle.concat([mask] * 2)
        masked_image_latents = paddle.concat([masked_image_latents] * 2)

    # Final cast so the latents match `dtype` when concatenated with the latent model input.
    masked_image_latents = masked_image_latents.cast(dtype=dtype)
    return mask, masked_image_latents
923
+
924
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline._encode_vae_image
925
def _encode_vae_image(self, image: paddle.Tensor, generator: paddle.Generator):
    """
    Encode `image` with the VAE and scale the result by `self.vae.config.scaling_factor`.

    When `generator` is a list, each batch entry is encoded on its own with the generator at the
    same index and the results are concatenated along axis 0.
    """

    def encode(pixels, gen):
        return retrieve_latents(self.vae.encode(pixels), generator=gen)

    if isinstance(generator, list):
        per_sample_latents = [encode(image[idx : idx + 1], generator[idx]) for idx in range(image.shape[0])]
        latents = paddle.concat(per_sample_latents, axis=0)
    else:
        latents = encode(image, generator)

    return self.vae.config.scaling_factor * latents
938
+
939
@property
def guidance_scale(self):
    """Value stored in `self._guidance_scale`."""
    scale = self._guidance_scale
    return scale
942
+
943
@property
def clip_skip(self):
    """Value stored in `self._clip_skip`."""
    skip = self._clip_skip
    return skip
946
+
947
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
948
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
949
+ # corresponds to doing no classifier free guidance.
950
@property
def do_classifier_free_guidance(self):
    """Whether classifier-free guidance is active, i.e. `self._guidance_scale` is greater than 1."""
    scale = self._guidance_scale
    return scale > 1
953
+
954
@property
def cross_attention_kwargs(self):
    """Value stored in `self._cross_attention_kwargs`."""
    attention_kwargs = self._cross_attention_kwargs
    return attention_kwargs
957
+
958
@property
def num_timesteps(self):
    """Value stored in `self._num_timesteps`."""
    timestep_count = self._num_timesteps
    return timestep_count
961
+
962
+ @paddle.no_grad()
963
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
964
+ def __call__(
965
+ self,
966
+ prompt: Union[str, List[str]] = None,
967
+ image: PipelineImageInput = None,
968
+ mask_image: PipelineImageInput = None,
969
+ control_image: PipelineImageInput = None,
970
+ height: Optional[int] = None,
971
+ width: Optional[int] = None,
972
+ strength: float = 1.0,
973
+ num_inference_steps: int = 50,
974
+ guidance_scale: float = 7.5,
975
+ negative_prompt: Optional[Union[str, List[str]]] = None,
976
+ num_images_per_prompt: Optional[int] = 1,
977
+ eta: float = 0.0,
978
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
979
+ latents: Optional[paddle.Tensor] = None,
980
+ prompt_embeds: Optional[paddle.Tensor] = None,
981
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
982
+ ip_adapter_image: Optional[PipelineImageInput] = None,
983
+ output_type: Optional[str] = "pil",
984
+ return_dict: bool = True,
985
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
986
+ controlnet_conditioning_scale: Union[float, List[float]] = 0.5,
987
+ guess_mode: bool = False,
988
+ control_guidance_start: Union[float, List[float]] = 0.0,
989
+ control_guidance_end: Union[float, List[float]] = 1.0,
990
+ clip_skip: Optional[int] = None,
991
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
992
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
993
+ **kwargs,
994
+ ):
995
+ r"""
996
+ The call function to the pipeline for generation.
997
+
998
+ Args:
999
+ prompt (`str` or `List[str]`, *optional*):
1000
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
1001
+ image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`,
1002
+ `List[PIL.Image.Image]`, or `List[np.ndarray]`):
1003
+ `Image`, NumPy array or tensor representing an image batch to be used as the starting point. For both
1004
+ NumPy array and Paddle tensor, the expected value range is between `[0, 1]`. If it's a tensor or a
1005
+ list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a NumPy array or
1006
+ a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image
1007
+ latents as `image`, but if passing latents directly it is not encoded again.
1008
+ mask_image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`,
1009
+ `List[PIL.Image.Image]`, or `List[np.ndarray]`):
1010
+ `Image`, NumPy array or tensor representing an image batch to mask `image`. White pixels in the mask
1011
+ are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
1012
+ single channel (luminance) before use. If it's a NumPy array or Paddle tensor, it should contain one
1013
+ color channel (L) instead of 3, so the expected shape for Paddle tensor would be `(B, 1, H, W)`, `(B,
1014
+ H, W)`, `(1, H, W)`, `(H, W)`. And for NumPy array, it would be for `(B, H, W, 1)`, `(B, H, W)`, `(H,
1015
+ W, 1)`, or `(H, W)`.
1016
+ control_image (`paddle.Tensor`, `PIL.Image.Image`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`,
1017
+ `List[List[paddle.Tensor]]`, or `List[List[PIL.Image.Image]]`):
1018
+ The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
1019
+ specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
1020
+ accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
1021
+ and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
1022
+ `init`, images must be passed as a list such that each element of the list can be correctly batched for
1023
+ input to a single ControlNet.
1024
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
1025
+ The height in pixels of the generated image.
1026
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
1027
+ The width in pixels of the generated image.
1028
+ strength (`float`, *optional*, defaults to 1.0):
1029
+ Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
1030
+ starting point and more noise is added the higher the `strength`. The number of denoising steps depends
1031
+ on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
1032
+ process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
1033
+ essentially ignores `image`.
1034
+ num_inference_steps (`int`, *optional*, defaults to 50):
1035
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
1036
+ expense of slower inference.
1037
+ guidance_scale (`float`, *optional*, defaults to 7.5):
1038
+ A higher guidance scale value encourages the model to generate images closely linked to the text
1039
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
1040
+ negative_prompt (`str` or `List[str]`, *optional*):
1041
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
1042
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
1043
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1044
+ The number of images to generate per prompt.
1045
+ eta (`float`, *optional*, defaults to 0.0):
1046
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
1047
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
1048
+ generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
1049
+ A [`paddle.Generator`] to make generation deterministic.
1050
+
1051
+ latents (`paddle.Tensor`, *optional*):
1052
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
1053
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
1054
+ tensor is generated by sampling using the supplied random `generator`.
1055
+ prompt_embeds (`paddle.Tensor`, *optional*):
1056
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
1057
+ provided, text embeddings are generated from the `prompt` input argument.
1058
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
1059
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
1060
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
1061
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
1062
+ output_type (`str`, *optional*, defaults to `"pil"`):
1063
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
1064
+ return_dict (`bool`, *optional*, defaults to `True`):
1065
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1066
+ plain tuple.
1067
+ cross_attention_kwargs (`dict`, *optional*):
1068
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
1069
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1070
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.5):
1071
+ The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
1072
+ to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
1073
+ the corresponding scale as a list.
1074
+ guess_mode (`bool`, *optional*, defaults to `False`):
1075
+ The ControlNet encoder tries to recognize the content of the input image even if you remove all
1076
+ prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
1077
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
1078
+ The percentage of total steps at which the ControlNet starts applying.
1079
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
1080
+ The percentage of total steps at which the ControlNet stops applying.
1081
+ clip_skip (`int`, *optional*):
1082
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
1083
+ the output of the pre-final layer will be used for computing the prompt embeddings.
1084
+ callback_on_step_end (`Callable`, *optional*):
1085
+ A function that is called at the end of each denoising step during inference. The function is called
1086
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
1087
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
1088
+ `callback_on_step_end_tensor_inputs`.
1089
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
1090
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
1091
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
1092
+ `._callback_tensor_inputs` attribute of your pipeline class.
1093
+
1094
+ Examples:
1095
+
1096
+ Returns:
1097
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1098
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
1099
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
1100
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
1101
+ "not-safe-for-work" (nsfw) content.
1102
+ """
1103
+
1104
+ callback = kwargs.pop("callback", None)
1105
+ callback_steps = kwargs.pop("callback_steps", None)
1106
+
1107
+ if callback is not None:
1108
+ deprecate(
1109
+ "callback",
1110
+ "1.0.0",
1111
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
1112
+ )
1113
+ if callback_steps is not None:
1114
+ deprecate(
1115
+ "callback_steps",
1116
+ "1.0.0",
1117
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
1118
+ )
1119
+
1120
+ controlnet = self.controlnet
1121
+
1122
+ # align format for control guidance
1123
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
1124
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
1125
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
1126
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
1127
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
1128
+ mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
1129
+ control_guidance_start, control_guidance_end = (
1130
+ mult * [control_guidance_start],
1131
+ mult * [control_guidance_end],
1132
+ )
1133
+
1134
+ # 1. Check inputs. Raise error if not correct
1135
+ self.check_inputs(
1136
+ prompt,
1137
+ control_image,
1138
+ height,
1139
+ width,
1140
+ callback_steps,
1141
+ negative_prompt,
1142
+ prompt_embeds,
1143
+ negative_prompt_embeds,
1144
+ controlnet_conditioning_scale,
1145
+ control_guidance_start,
1146
+ control_guidance_end,
1147
+ callback_on_step_end_tensor_inputs,
1148
+ )
1149
+
1150
+ self._guidance_scale = guidance_scale
1151
+ self._clip_skip = clip_skip
1152
+ self._cross_attention_kwargs = cross_attention_kwargs
1153
+
1154
+ # 2. Define call parameters
1155
+ if prompt is not None and isinstance(prompt, str):
1156
+ batch_size = 1
1157
+ elif prompt is not None and isinstance(prompt, list):
1158
+ batch_size = len(prompt)
1159
+ else:
1160
+ batch_size = prompt_embeds.shape[0]
1161
+
1162
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
1163
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
1164
+
1165
+ global_pool_conditions = (
1166
+ controlnet.config.global_pool_conditions
1167
+ if isinstance(controlnet, ControlNetModel)
1168
+ else controlnet.nets[0].config.global_pool_conditions
1169
+ )
1170
+ guess_mode = guess_mode or global_pool_conditions
1171
+
1172
+ # 3. Encode input prompt
1173
+ text_encoder_lora_scale = (
1174
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
1175
+ )
1176
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
1177
+ prompt,
1178
+ num_images_per_prompt,
1179
+ self.do_classifier_free_guidance,
1180
+ negative_prompt,
1181
+ prompt_embeds=prompt_embeds,
1182
+ negative_prompt_embeds=negative_prompt_embeds,
1183
+ lora_scale=text_encoder_lora_scale,
1184
+ clip_skip=self.clip_skip,
1185
+ )
1186
+ # For classifier free guidance, we need to do two forward passes.
1187
+ # Here we concatenate the unconditional and text embeddings into a single batch
1188
+ # to avoid doing two forward passes
1189
+ if self.do_classifier_free_guidance:
1190
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
1191
+
1192
+ if ip_adapter_image is not None:
1193
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, num_images_per_prompt)
1194
+ if self.do_classifier_free_guidance:
1195
+ image_embeds = paddle.concat([negative_image_embeds, image_embeds])
1196
+
1197
+ # 4. Prepare image
1198
+ if isinstance(controlnet, ControlNetModel):
1199
+ control_image = self.prepare_control_image(
1200
+ image=control_image,
1201
+ width=width,
1202
+ height=height,
1203
+ batch_size=batch_size * num_images_per_prompt,
1204
+ num_images_per_prompt=num_images_per_prompt,
1205
+ dtype=controlnet.dtype,
1206
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
1207
+ guess_mode=guess_mode,
1208
+ )
1209
+ elif isinstance(controlnet, MultiControlNetModel):
1210
+ control_images = []
1211
+
1212
+ for control_image_ in control_image:
1213
+ control_image_ = self.prepare_control_image(
1214
+ image=control_image_,
1215
+ width=width,
1216
+ height=height,
1217
+ batch_size=batch_size * num_images_per_prompt,
1218
+ num_images_per_prompt=num_images_per_prompt,
1219
+ dtype=controlnet.dtype,
1220
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
1221
+ guess_mode=guess_mode,
1222
+ )
1223
+
1224
+ control_images.append(control_image_)
1225
+
1226
+ control_image = control_images
1227
+ else:
1228
+ assert False
1229
+
1230
+ # 4.1 Preprocess mask and image - resizes image and mask w.r.t height and width
1231
+ init_image = self.image_processor.preprocess(image, height=height, width=width)
1232
+ init_image = init_image.cast(dtype=paddle.float32)
1233
+
1234
+ mask = self.mask_processor.preprocess(mask_image, height=height, width=width)
1235
+
1236
+ masked_image = init_image * (mask < 0.5).cast(init_image.dtype)
1237
+ _, _, height, width = init_image.shape
1238
+
1239
+ # 5. Prepare timesteps
1240
+ self.scheduler.set_timesteps(num_inference_steps)
1241
+ timesteps, num_inference_steps = self.get_timesteps(
1242
+ num_inference_steps=num_inference_steps,
1243
+ strength=strength,
1244
+ )
1245
+ # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
1246
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
1247
+ # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
1248
+ is_strength_max = strength == 1.0
1249
+ self._num_timesteps = len(timesteps)
1250
+
1251
+ # 6. Prepare latent variables
1252
+ num_channels_latents = self.vae.config.latent_channels
1253
+ num_channels_unet = self.unet.config.in_channels
1254
+ return_image_latents = num_channels_unet == 4
1255
+ latents_outputs = self.prepare_latents(
1256
+ batch_size * num_images_per_prompt,
1257
+ num_channels_latents,
1258
+ height,
1259
+ width,
1260
+ prompt_embeds.dtype,
1261
+ generator,
1262
+ latents,
1263
+ image=init_image,
1264
+ timestep=latent_timestep,
1265
+ is_strength_max=is_strength_max,
1266
+ return_noise=True,
1267
+ return_image_latents=return_image_latents,
1268
+ )
1269
+
1270
+ if return_image_latents:
1271
+ latents, noise, image_latents = latents_outputs
1272
+ else:
1273
+ latents, noise = latents_outputs
1274
+
1275
+ # 7. Prepare mask latent variables
1276
+ mask, masked_image_latents = self.prepare_mask_latents(
1277
+ mask,
1278
+ masked_image,
1279
+ batch_size * num_images_per_prompt,
1280
+ height,
1281
+ width,
1282
+ prompt_embeds.dtype,
1283
+ generator,
1284
+ self.do_classifier_free_guidance,
1285
+ )
1286
+
1287
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1288
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1289
+
1290
+ # 7.1 Add image embeds for IP-Adapter
1291
+ added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
1292
+
1293
+ # 7.2 Create tensor stating which controlnets to keep
1294
+ controlnet_keep = []
1295
+ for i in range(len(timesteps)):
1296
+ keeps = [
1297
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
1298
+ for s, e in zip(control_guidance_start, control_guidance_end)
1299
+ ]
1300
+ controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
1301
+
1302
+ # 8. Denoising loop
1303
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1304
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1305
+ for i, t in enumerate(timesteps):
1306
+ # expand the latents if we are doing classifier free guidance
1307
+ latent_model_input = paddle.concat([latents] * 2) if self.do_classifier_free_guidance else latents
1308
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1309
+
1310
+ # controlnet(s) inference
1311
+ if guess_mode and self.do_classifier_free_guidance:
1312
+ # Infer ControlNet only for the conditional batch.
1313
+ control_model_input = latents
1314
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
1315
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
1316
+ else:
1317
+ control_model_input = latent_model_input
1318
+ controlnet_prompt_embeds = prompt_embeds
1319
+
1320
+ if isinstance(controlnet_keep[i], list):
1321
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
1322
+ else:
1323
+ controlnet_cond_scale = controlnet_conditioning_scale
1324
+ if isinstance(controlnet_cond_scale, list):
1325
+ controlnet_cond_scale = controlnet_cond_scale[0]
1326
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
1327
+
1328
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
1329
+ control_model_input,
1330
+ t,
1331
+ encoder_hidden_states=controlnet_prompt_embeds,
1332
+ controlnet_cond=control_image,
1333
+ conditioning_scale=cond_scale,
1334
+ guess_mode=guess_mode,
1335
+ return_dict=False,
1336
+ )
1337
+
1338
+ if guess_mode and self.do_classifier_free_guidance:
1339
+ # Infered ControlNet only for the conditional batch.
1340
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
1341
+ # add 0 to the unconditional batch to keep it unchanged.
1342
+ down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples]
1343
+ mid_block_res_sample = paddle.concat(
1344
+ [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]
1345
+ )
1346
+
1347
+ # predict the noise residual
1348
+ if num_channels_unet == 9:
1349
+ latent_model_input = paddle.concat(
1350
+ [
1351
+ latent_model_input,
1352
+ mask.cast(latent_model_input.dtype),
1353
+ masked_image_latents.cast(latent_model_input.dtype),
1354
+ ],
1355
+ axis=1,
1356
+ )
1357
+
1358
+ noise_pred = self.unet(
1359
+ latent_model_input,
1360
+ t,
1361
+ encoder_hidden_states=prompt_embeds,
1362
+ cross_attention_kwargs=self.cross_attention_kwargs,
1363
+ down_block_additional_residuals=down_block_res_samples,
1364
+ mid_block_additional_residual=mid_block_res_sample,
1365
+ added_cond_kwargs=added_cond_kwargs,
1366
+ return_dict=False,
1367
+ )[0]
1368
+
1369
+ # perform guidance
1370
+ if self.do_classifier_free_guidance:
1371
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1372
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
1373
+
1374
+ # compute the previous noisy sample x_t -> x_t-1
1375
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1376
+
1377
+ if num_channels_unet == 4:
1378
+ init_latents_proper = image_latents
1379
+ if self.do_classifier_free_guidance:
1380
+ init_mask, _ = mask.chunk(2)
1381
+ else:
1382
+ init_mask = mask
1383
+
1384
+ if i < len(timesteps) - 1:
1385
+ noise_timestep = timesteps[i + 1]
1386
+ init_latents_proper = self.scheduler.add_noise(
1387
+ init_latents_proper, noise, paddle.to_tensor([noise_timestep])
1388
+ )
1389
+
1390
+ latents = (1 - init_mask) * init_latents_proper + init_mask * latents
1391
+
1392
+ if callback_on_step_end is not None:
1393
+ callback_kwargs = {}
1394
+ for k in callback_on_step_end_tensor_inputs:
1395
+ callback_kwargs[k] = locals()[k]
1396
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1397
+
1398
+ latents = callback_outputs.pop("latents", latents)
1399
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1400
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1401
+
1402
+ # call the callback, if provided
1403
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1404
+ progress_bar.update()
1405
+ if callback is not None and i % callback_steps == 0:
1406
+ step_idx = i // getattr(self.scheduler, "order", 1)
1407
+ callback(step_idx, t, latents)
1408
+
1409
+ if not output_type == "latent":
1410
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
1411
+ 0
1412
+ ]
1413
+ image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
1414
+ else:
1415
+ image = latents
1416
+ has_nsfw_concept = None
1417
+
1418
+ if has_nsfw_concept is None:
1419
+ do_denormalize = [True] * image.shape[0]
1420
+ else:
1421
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1422
+
1423
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
1424
+
1425
+ if not return_dict:
1426
+ return (image, has_nsfw_concept)
1427
+
1428
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py ADDED
@@ -0,0 +1,1579 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Harutatsu Akiyama, Jinbin Bai, and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import paddle
20
+ import PIL.Image
21
+
22
+ from ppdiffusers.transformers import (
23
+ CLIPTextModel,
24
+ CLIPTextModelWithProjection,
25
+ CLIPTokenizer,
26
+ )
27
+
28
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
29
+ from ...loaders import (
30
+ FromSingleFileMixin,
31
+ StableDiffusionXLLoraLoaderMixin,
32
+ TextualInversionLoaderMixin,
33
+ )
34
+ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
35
+ from ...models.attention_processor import (
36
+ AttnProcessor2_5,
37
+ LoRAAttnProcessor2_5,
38
+ LoRAXFormersAttnProcessor,
39
+ XFormersAttnProcessor,
40
+ )
41
+ from ...models.lora import adjust_lora_scale_text_encoder
42
+ from ...schedulers import KarrasDiffusionSchedulers
43
+ from ...utils import (
44
+ USE_PEFT_BACKEND,
45
+ deprecate,
46
+ is_pp_invisible_watermark_available,
47
+ logging,
48
+ replace_example_docstring,
49
+ )
50
+ from ...utils.paddle_utils import randn_tensor
51
+ from ..pipeline_utils import DiffusionPipeline
52
+ from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
53
+ from .multicontrolnet import MultiControlNetModel
54
+
55
+ if is_pp_invisible_watermark_available():
56
+ from ppdiffusers.pipelines.stable_diffusion_xl.watermark import (
57
+ StableDiffusionXLWatermarker,
58
+ )
59
+
60
+
61
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
62
+
63
+
64
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
65
def retrieve_latents(
    encoder_output: paddle.Tensor, generator: Optional[paddle.Generator] = None, sample_mode: str = "sample"
):
    """
    Extract latents from an encoder output object.

    Args:
        encoder_output:
            Object exposing either a `latent_dist` attribute (with `sample(generator)` and `mode()`) or a
            `latents` attribute.
        generator:
            Forwarded to `latent_dist.sample` when `sample_mode` is `"sample"`.
        sample_mode:
            `"sample"` draws from `latent_dist`; `"argmax"` returns `latent_dist.mode()`. Any other value
            (or a missing `latent_dist`) falls back to `encoder_output.latents`.

    Returns:
        Whatever the selected accessor returns.

    Raises:
        AttributeError: if none of the accessors above is applicable.
    """
    has_distribution = hasattr(encoder_output, "latent_dist")

    if has_distribution and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)

    if has_distribution and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()

    # Fallback: the output already carries ready-made latents.
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")
76
+
77
+
78
# Usage example for `StableDiffusionXLControlNetInpaintPipeline`.
# The example calls `cv2.Canny` and `Image.fromarray`, so it must import `cv2` and `PIL.Image` itself.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> # !pip install transformers accelerate
        >>> from ppdiffusers import StableDiffusionXLControlNetInpaintPipeline, ControlNetModel, DDIMScheduler
        >>> from ppdiffusers.utils import load_image
        >>> import numpy as np
        >>> import paddle
        >>> import cv2
        >>> from PIL import Image

        >>> init_image = load_image(
        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png"
        ... )
        >>> init_image = init_image.resize((1024, 1024))

        >>> generator = paddle.Generator().manual_seed(1)

        >>> mask_image = load_image(
        ...     "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png"
        ... )
        >>> mask_image = mask_image.resize((1024, 1024))


        >>> def make_canny_condition(image):
        ...     image = np.array(image)
        ...     image = cv2.Canny(image, 100, 200)
        ...     image = image[:, :, None]
        ...     image = np.concatenate([image, image, image], axis=2)
        ...     image = Image.fromarray(image)
        ...     return image


        >>> control_image = make_canny_condition(init_image)

        >>> controlnet = ControlNetModel.from_pretrained(
        ...     "diffusers/controlnet-canny-sdxl-1.0", paddle_dtype=paddle.float16
        ... )
        >>> pipe = StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
        ...     "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, paddle_dtype=paddle.float16
        ... )

        >>> # generate image
        >>> image = pipe(
        ...     "a handsome man with ray-ban sunglasses",
        ...     num_inference_steps=20,
        ...     generator=generator,
        ...     eta=1.0,
        ...     image=init_image,
        ...     mask_image=mask_image,
        ...     control_image=control_image,
        ... ).images[0]
        ```
"""
130
+
131
+
132
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
133
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4

    Args:
        noise_cfg:
            Noise prediction after classifier-free guidance has been applied.
        noise_pred_text:
            Text-conditioned noise prediction; its per-sample standard deviation is the rescaling target.
        guidance_rescale (`float`, defaults to 0.0):
            Blend factor between the rescaled prediction (1.0) and the unmodified `noise_cfg` (0.0).

    Returns:
        `guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg`.
    """
    # Paddle reductions take `axis` (not the torch-style `dim`). The standard deviation is reduced over
    # every axis except the leading one, and kept broadcastable with `keepdim=True`.
    std_text = noise_pred_text.std(axis=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(axis=list(range(1, noise_cfg.ndim)), keepdim=True)
    # rescale the results from guidance (fixes overexposure)
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
    return noise_cfg
145
+
146
+
147
+ class StableDiffusionXLControlNetInpaintPipeline(
148
+ DiffusionPipeline, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin
149
+ ):
150
+ r"""
151
+ Pipeline for image inpainting using Stable Diffusion XL with ControlNet guidance.
152
+
153
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
154
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
155
+
156
+ In addition the pipeline inherits the following loading methods:
157
+ - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`]
158
+ - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
159
+
160
+ as well as the following saving methods:
161
+ - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`]
162
+
163
+ Args:
164
+ vae ([`AutoencoderKL`]):
165
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
166
+ text_encoder ([`CLIPTextModel`]):
167
+ Frozen text-encoder. Stable Diffusion XL uses the text portion of
168
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
169
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
170
+ text_encoder_2 ([` CLIPTextModelWithProjection`]):
171
+ Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
172
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
173
+ specifically the
174
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
175
+ variant.
176
+ tokenizer (`CLIPTokenizer`):
177
+ Tokenizer of class
178
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
179
+ tokenizer_2 (`CLIPTokenizer`):
180
+ Second Tokenizer of class
181
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
182
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
183
+ scheduler ([`SchedulerMixin`]):
184
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
185
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
186
+ """
187
+
188
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
189
+ _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
190
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
191
+
192
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    text_encoder_2: CLIPTextModelWithProjection,
    tokenizer: CLIPTokenizer,
    tokenizer_2: CLIPTokenizer,
    unet: UNet2DConditionModel,
    controlnet: ControlNetModel,
    scheduler: KarrasDiffusionSchedulers,
    requires_aesthetics_score: bool = False,
    force_zeros_for_empty_prompt: bool = True,
    add_watermarker: Optional[bool] = None,
):
    """
    Register the pipeline components and build the image/mask/control-image processors.

    A `list` or `tuple` passed as `controlnet` is wrapped in a `MultiControlNetModel`. When
    `add_watermarker` is `None`, it defaults to `is_pp_invisible_watermark_available()`.
    """
    super().__init__()

    # Several ControlNets are handled through a single MultiControlNetModel wrapper.
    if isinstance(controlnet, (list, tuple)):
        controlnet = MultiControlNetModel(controlnet)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        text_encoder_2=text_encoder_2,
        tokenizer=tokenizer,
        tokenizer_2=tokenizer_2,
        unet=unet,
        controlnet=controlnet,
        scheduler=scheduler,
    )
    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)

    # Scale factor derived from the number of VAE blocks: 2 ** (number of blocks - 1).
    scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.vae_scale_factor = scale_factor

    self.image_processor = VaeImageProcessor(vae_scale_factor=scale_factor)
    # Masks: no normalisation, binarised, converted to grayscale.
    self.mask_processor = VaeImageProcessor(
        vae_scale_factor=scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
    )
    # Control images: converted to RGB, no normalisation.
    self.control_image_processor = VaeImageProcessor(
        vae_scale_factor=scale_factor, do_convert_rgb=True, do_normalize=False
    )

    # Only probe for the watermark package when the caller did not decide explicitly.
    if add_watermarker is None:
        add_watermarker = is_pp_invisible_watermark_available()

    self.watermark = StableDiffusionXLWatermarker() if add_watermarker else None
238
+
239
+ # Copied from ppdiffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
240
def encode_prompt(
    self,
    prompt: str,
    prompt_2: Optional[str] = None,
    num_images_per_prompt: int = 1,
    do_classifier_free_guidance: bool = True,
    negative_prompt: Optional[str] = None,
    negative_prompt_2: Optional[str] = None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    pooled_prompt_embeds: Optional[paddle.Tensor] = None,
    negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None,
    lora_scale: Optional[float] = None,
    clip_skip: Optional[int] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
            used in both text-encoders
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        negative_prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
            `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        lora_scale (`float`, *optional*):
            A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.

    Returns:
        The tuple `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds,
        negative_pooled_prompt_embeds)`.

    Raises:
        TypeError: If `negative_prompt` and `prompt` have different types.
        ValueError: If `negative_prompt` does not match the batch size of `prompt`.
    """
    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
        self._lora_scale = lora_scale

        # dynamically adjust the LoRA scale
        if self.text_encoder is not None:
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)

        if self.text_encoder_2 is not None:
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)

    prompt = [prompt] if isinstance(prompt, str) else prompt

    if prompt is not None:
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # Define tokenizers and text encoders
    tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
    text_encoders = (
        [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
    )

    if prompt_embeds is None:
        prompt_2 = prompt_2 or prompt
        prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

        # textual inversion: process multi-vector tokens if necessary
        prompt_embeds_list = []
        prompts = [prompt, prompt_2]
        # Dedicated loop names keep the `prompt` argument intact for the checks
        # and error messages further down.
        for current_prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
            if isinstance(self, TextualInversionLoaderMixin):
                current_prompt = self.maybe_convert_prompt(current_prompt, tokenizer)

            text_inputs = tokenizer(
                current_prompt,
                padding="max_length",
                max_length=tokenizer.model_max_length,
                truncation=True,
                return_tensors="pd",
            )

            text_input_ids = text_inputs.input_ids
            untruncated_ids = tokenizer(current_prompt, padding="longest", return_tensors="pd").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
                text_input_ids, untruncated_ids
            ):
                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {tokenizer.model_max_length} tokens: {removed_text}"
                )

            encoder_output = text_encoder(text_input_ids, output_hidden_states=True)

            # We are only ALWAYS interested in the pooled output of the final text encoder
            pooled_prompt_embeds = encoder_output[0]
            if clip_skip is None:
                hidden_states = encoder_output.hidden_states[-2]
            else:
                # "2" because SDXL always indexes from the penultimate layer.
                hidden_states = encoder_output.hidden_states[-(clip_skip + 2)]

            prompt_embeds_list.append(hidden_states)

        prompt_embeds = paddle.concat(prompt_embeds_list, axis=-1)

    # get unconditional embeddings for classifier free guidance
    zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
    if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
        negative_prompt_embeds = paddle.zeros_like(prompt_embeds)
        negative_pooled_prompt_embeds = paddle.zeros_like(pooled_prompt_embeds)
    elif do_classifier_free_guidance and negative_prompt_embeds is None:
        negative_prompt = negative_prompt or ""
        negative_prompt_2 = negative_prompt_2 or negative_prompt

        # normalize str to list
        negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
        negative_prompt_2 = (
            batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
        )

        uncond_tokens: List[str]
        if prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = [negative_prompt, negative_prompt_2]

        negative_prompt_embeds_list = []
        for current_negative, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
            if isinstance(self, TextualInversionLoaderMixin):
                current_negative = self.maybe_convert_prompt(current_negative, tokenizer)

            # Pad/truncate to the sequence length of the positive embeddings.
            max_length = prompt_embeds.shape[1]
            uncond_input = tokenizer(
                current_negative,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pd",
            )

            negative_output = text_encoder(
                uncond_input.input_ids,
                output_hidden_states=True,
            )
            # We are only ALWAYS interested in the pooled output of the final text encoder
            negative_pooled_prompt_embeds = negative_output[0]
            negative_prompt_embeds_list.append(negative_output.hidden_states[-2])

        negative_prompt_embeds = paddle.concat(negative_prompt_embeds_list, axis=-1)

    if self.text_encoder_2 is not None:
        prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder_2.dtype)
    else:
        prompt_embeds = prompt_embeds.cast(dtype=self.unet.dtype)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt
    prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
    prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt
        seq_len = negative_prompt_embeds.shape[1]

        if self.text_encoder_2 is not None:
            negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder_2.dtype)
        else:
            negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.unet.dtype)

        negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
        negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])

    pooled_prompt_embeds = pooled_prompt_embeds.tile([1, num_images_per_prompt]).reshape(
        [bs_embed * num_images_per_prompt, -1]
    )
    if do_classifier_free_guidance:
        negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.tile([1, num_images_per_prompt]).reshape(
            [bs_embed * num_images_per_prompt, -1]
        )

    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
454
+
455
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
456
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Build the optional keyword arguments accepted by `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator`
    are only forwarded when the signature declares them. `eta` corresponds to η
    in the DDIM paper (https://arxiv.org/abs/2010.02502) and should be in [0, 1].

    Returns:
        `dict` containing `"eta"` and/or `"generator"` when supported.
    """
    step_params = set(inspect.signature(self.scheduler.step).parameters.keys())

    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_params}
472
+
473
def check_image(self, image, prompt, prompt_embeds):
    """
    Validate the type of `image` and its batch size against the prompt batch size.

    Args:
        image: A PIL image, numpy array, paddle tensor, or a non-empty list of one of those.
        prompt (`str` or `List[str]`, *optional*): Prompt(s) used to derive the prompt batch size.
        prompt_embeds (`paddle.Tensor`, *optional*): Used for the batch size when `prompt` is not a str/list.

    Raises:
        TypeError: If `image` is none of the supported types (an empty list included).
        ValueError: If the image batch size is neither 1 nor equal to the prompt batch size.
    """
    image_is_pil = isinstance(image, PIL.Image.Image)
    image_is_tensor = isinstance(image, paddle.Tensor)
    image_is_np = isinstance(image, np.ndarray)

    # Only peek at the first element when there is one; an empty list is rejected
    # below with the regular TypeError instead of an IndexError.
    first = image[0] if isinstance(image, list) and image else None
    image_is_pil_list = isinstance(first, PIL.Image.Image)
    image_is_tensor_list = isinstance(first, paddle.Tensor)
    image_is_np_list = isinstance(first, np.ndarray)

    if not (
        image_is_pil
        or image_is_tensor
        or image_is_np
        or image_is_pil_list
        or image_is_tensor_list
        or image_is_np_list
    ):
        raise TypeError(
            f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}"
        )

    image_batch_size = 1 if image_is_pil else len(image)

    # Stays None when neither `prompt` nor `prompt_embeds` provides a batch size,
    # in which case the comparison below is skipped rather than failing on an
    # unbound variable.
    prompt_batch_size = None
    if isinstance(prompt, str):
        prompt_batch_size = 1
    elif isinstance(prompt, list):
        prompt_batch_size = len(prompt)
    elif prompt_embeds is not None:
        prompt_batch_size = prompt_embeds.shape[0]

    if prompt_batch_size is not None and image_batch_size != 1 and image_batch_size != prompt_batch_size:
        raise ValueError(
            f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
        )
509
+
510
def check_inputs(
    self,
    prompt,
    prompt_2,
    image,
    strength,
    num_inference_steps,
    callback_steps,
    negative_prompt=None,
    negative_prompt_2=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    pooled_prompt_embeds=None,
    negative_pooled_prompt_embeds=None,
    controlnet_conditioning_scale=1.0,
    control_guidance_start=0.0,
    control_guidance_end=1.0,
    callback_on_step_end_tensor_inputs=None,
):
    """
    Validate the arguments of a pipeline call before any computation is done.

    Checks `strength`, `num_inference_steps`, `callback_steps`, the callback
    tensor names, the mutually exclusive prompt / embedding arguments, the
    control image(s), `controlnet_conditioning_scale` and the control guidance
    window(s).

    Raises:
        ValueError: If a value is out of range or arguments are inconsistent.
        TypeError: If the control image or conditioning scale has the wrong type.
    """
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
    if num_inference_steps is None:
        raise ValueError("`num_inference_steps` cannot be None.")
    elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
        raise ValueError(
            f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
            f" {type(num_inference_steps)}."
        )

    if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if callback_on_step_end_tensor_inputs is not None and not all(
        k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
    ):
        raise ValueError(
            f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt_2 is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
    elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
        raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )
    elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )

    if prompt_embeds is not None and pooled_prompt_embeds is None:
        raise ValueError(
            "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
        )

    if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
        raise ValueError(
            "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
        )

    # `prompt` needs more sophisticated handling when there are multiple
    # conditionings.
    if isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(prompt, list):
            logger.warning(
                f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
                " prompts. The conditionings will be fixed across the prompts."
            )

    # Check `image`
    if isinstance(self.controlnet, ControlNetModel):
        self.check_image(image, prompt, prompt_embeds)
    elif isinstance(self.controlnet, MultiControlNetModel):
        if not isinstance(image, list):
            raise TypeError("For multiple controlnets: `image` must be type `list`")

        # When `image` is a nested list:
        # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
        elif any(isinstance(i, list) for i in image):
            raise ValueError("A single batch of multiple conditionings are supported at the moment.")
        elif len(image) != len(self.controlnet.nets):
            raise ValueError(
                f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
            )

        for image_ in image:
            self.check_image(image_, prompt, prompt_embeds)
    else:
        assert False

    # Check `controlnet_conditioning_scale`
    if isinstance(self.controlnet, ControlNetModel):
        if not isinstance(controlnet_conditioning_scale, float):
            raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
    elif isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(controlnet_conditioning_scale, list):
            if any(isinstance(i, list) for i in controlnet_conditioning_scale):
                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
            # The length check must live inside the `list` branch: as a sibling
            # `elif` guarded by the same isinstance test it could never run.
            if len(controlnet_conditioning_scale) != len(self.controlnet.nets):
                raise ValueError(
                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
                    " the same length as the number of controlnets"
                )
    else:
        assert False

    # Scalars are promoted to one-element lists so both forms share the checks below.
    if not isinstance(control_guidance_start, (tuple, list)):
        control_guidance_start = [control_guidance_start]

    if not isinstance(control_guidance_end, (tuple, list)):
        control_guidance_end = [control_guidance_end]

    if len(control_guidance_start) != len(control_guidance_end):
        raise ValueError(
            f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
        )

    if isinstance(self.controlnet, MultiControlNetModel):
        if len(control_guidance_start) != len(self.controlnet.nets):
            raise ValueError(
                f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
            )

    for start, end in zip(control_guidance_start, control_guidance_end):
        if start >= end:
            raise ValueError(
                f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
            )
        if start < 0.0:
            raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
        if end > 1.0:
            raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
674
+
675
def prepare_control_image(
    self,
    image,
    width,
    height,
    batch_size,
    num_images_per_prompt,
    dtype,
    do_classifier_free_guidance=False,
    guess_mode=False,
):
    """
    Pre-process the control image and expand it along the batch axis.

    The image goes through `self.control_image_processor.preprocess`, is cast to
    float32, repeated along axis 0, cast to `dtype`, and finally duplicated when
    classifier-free guidance is on and `guess_mode` is off.

    Returns:
        The prepared control image tensor.
    """
    processed = self.control_image_processor.preprocess(image, height=height, width=width)
    control = processed.cast(dtype=paddle.float32)

    # A single image is repeated for the whole batch; otherwise the image batch
    # size is the same as the prompt batch size and each entry is repeated per
    # generated image.
    repeats = batch_size if control.shape[0] == 1 else num_images_per_prompt
    control = control.repeat_interleave(repeats, axis=0).cast(dtype=dtype)

    if guess_mode or not do_classifier_free_guidance:
        return control

    return paddle.concat([control, control])
703
+
704
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    generator,
    latents=None,
    image=None,
    timestep=None,
    is_strength_max=True,
    add_noise=True,
    return_noise=False,
    return_image_latents=False,
):
    """
    Create the initial latents for the denoising loop.

    Args:
        batch_size, num_channels_latents, height, width: Define the latent shape
            `(batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)`.
        dtype: dtype of the generated tensors.
        generator: A generator, or a list with one generator per batch element.
        latents (*optional*): Pre-generated latents, used as the noise when `add_noise` is set.
        image (*optional*): Image (or 4-channel latents) to start from.
        timestep (*optional*): Timestep passed to `scheduler.add_noise` when `is_strength_max` is False.
        is_strength_max (`bool`): When True the latents start from pure noise.
        add_noise (`bool`): When False the image latents are used as they are.
        return_noise (`bool`): Also return the noise tensor.
        return_image_latents (`bool`): Also return the encoded image latents.

    Returns:
        Tuple `(latents,)` optionally followed by `noise` and `image_latents`.

    Raises:
        ValueError: On a generator-list/batch-size mismatch, or when image latents
            are required but `image` (or `timestep`) was not provided.
    """
    shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if (image is None or timestep is None) and not is_strength_max:
        raise ValueError(
            "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise. "
            "However, either the image or the noise timestep has not been provided."
        )

    image_latents = None
    if return_image_latents or (latents is None and not is_strength_max):
        if image is None:
            # Fail with a clear message instead of an AttributeError on `None.cast`.
            raise ValueError("`image` must be provided when image latents are required.")
        image = image.cast(dtype=dtype)

        # A 4-channel input is treated as already-encoded latents.
        if image.shape[1] == 4:
            image_latents = image
        else:
            image_latents = self._encode_vae_image(image=image, generator=generator)
        image_latents = image_latents.tile([batch_size // image_latents.shape[0], 1, 1, 1])

    if latents is None and add_noise:
        noise = randn_tensor(shape, generator=generator, dtype=dtype)
        # if strength is 1. then initialise the latents to noise, else initial to image + noise
        latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
        # if pure noise then scale the initial latents by the Scheduler's init sigma
        latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
    elif add_noise:
        noise = latents.cast(dtype)
        latents = noise * self.scheduler.init_noise_sigma
    else:
        if image_latents is None:
            # Previously this path hit an unbound local variable.
            raise ValueError(
                "`add_noise=False` requires image latents; pass `image` together with"
                " `return_image_latents=True` or a strength below 1."
            )
        noise = randn_tensor(shape, generator=generator, dtype=dtype)
        latents = image_latents

    outputs = (latents,)

    if return_noise:
        outputs += (noise,)

    if return_image_latents:
        outputs += (image_latents,)

    return outputs
764
+
765
def _encode_vae_image(self, image: paddle.Tensor, generator: paddle.Generator):
    """
    Encode `image` with the VAE and return the scaled latents in the image's dtype.

    When `vae.config.force_upcast` is set, the VAE and the image are moved to
    float32 for the encode step and the VAE is restored to the input dtype
    afterwards. A list of generators is applied one per batch element.
    """
    input_dtype = image.dtype

    if self.vae.config.force_upcast:
        image = image.cast(paddle.float32)
        self.vae.to(dtype=paddle.float32)

    if not isinstance(generator, list):
        image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
    else:
        # One generator per sample: encode each batch element on its own.
        per_sample = []
        for index in range(image.shape[0]):
            encoded = self.vae.encode(image[index : index + 1])
            per_sample.append(retrieve_latents(encoded, generator=generator[index]))
        image_latents = paddle.concat(per_sample, axis=0)

    if self.vae.config.force_upcast:
        self.vae.to(dtype=input_dtype)

    image_latents = image_latents.cast(input_dtype)
    return self.vae.config.scaling_factor * image_latents
787
+
788
def prepare_mask_latents(
    self, mask, masked_image, batch_size, height, width, dtype, generator, do_classifier_free_guidance
):
    """
    Resize the mask to latent resolution and encode the masked image.

    Both outputs are tiled up to `batch_size` and duplicated when
    classifier-free guidance is enabled.

    Returns:
        Tuple `(mask, masked_image_latents)`; the second item is `None` when
        `masked_image` is `None`.

    Raises:
        ValueError: If `batch_size` is not a multiple of the number of masks / images passed.
    """
    # Resize before the dtype cast, as we concatenate the mask to the latents later.
    latent_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
    mask = paddle.nn.functional.interpolate(mask, size=latent_size)
    mask = mask.cast(dtype=dtype)

    # duplicate mask for each generation per prompt
    if mask.shape[0] < batch_size:
        if batch_size % mask.shape[0] != 0:
            raise ValueError(
                "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                " of masks that you pass is divisible by the total requested batch size."
            )
        mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1])

    if do_classifier_free_guidance:
        mask = paddle.concat([mask] * 2)

    if masked_image is None:
        return mask, None

    masked_image = masked_image.cast(dtype=dtype)
    masked_image_latents = self._encode_vae_image(masked_image, generator=generator)

    if masked_image_latents.shape[0] < batch_size:
        if batch_size % masked_image_latents.shape[0] != 0:
            raise ValueError(
                "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
                " Make sure the number of images that you pass is divisible by the total requested batch size."
            )
        masked_image_latents = masked_image_latents.tile(
            [batch_size // masked_image_latents.shape[0], 1, 1, 1]
        )

    if do_classifier_free_guidance:
        masked_image_latents = paddle.concat([masked_image_latents] * 2)

    # Final cast so the latents match the dtype of the latent model input.
    masked_image_latents = masked_image_latents.cast(dtype=dtype)

    return mask, masked_image_latents
834
+
835
+ # Copied from ppdiffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps
836
def get_timesteps(self, num_inference_steps, strength, denoising_start=None):
    """
    Select the scheduler timesteps to run and return them with their count.

    Without `denoising_start`, the first part of the schedule is skipped
    according to `strength`. With `denoising_start`, `strength` is irrelevant:
    only timesteps below the corresponding discrete cutoff are kept.

    Returns:
        Tuple `(timesteps, num_inference_steps)`.
    """
    scheduler = self.scheduler

    if denoising_start is None:
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = scheduler.timesteps[t_start * scheduler.order :]
        return timesteps, num_inference_steps - t_start

    timesteps = scheduler.timesteps[0 * scheduler.order :]

    train_steps = scheduler.config.num_train_timesteps
    discrete_timestep_cutoff = int(round(train_steps - (denoising_start * train_steps)))

    num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
    # For a 2nd order scheduler every timestep (except the highest one) is
    # duplicated, so an even count would cut in the middle of a denoising step
    # (between the 1st and 2nd derivative). Adding 1 makes the process end
    # after the 2nd derivative step.
    if scheduler.order == 2 and num_inference_steps % 2 == 0:
        num_inference_steps = num_inference_steps + 1

    # because t_n+1 >= t_n, we slice the timesteps starting from the end
    return timesteps[-num_inference_steps:], num_inference_steps
871
+
872
+ def _get_add_time_ids(
873
+ self,
874
+ original_size,
875
+ crops_coords_top_left,
876
+ target_size,
877
+ aesthetic_score,
878
+ negative_aesthetic_score,
879
+ dtype,
880
+ text_encoder_projection_dim=None,
881
+ ):
882
+ if self.config.requires_aesthetics_score:
883
+ add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
884
+ add_neg_time_ids = list(original_size + crops_coords_top_left + (negative_aesthetic_score,))
885
+ else:
886
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
887
+ add_neg_time_ids = list(original_size + crops_coords_top_left + target_size)
888
+
889
+ passed_add_embed_dim = (
890
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
891
+ )
892
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
893
+
894
+ if (
895
+ expected_add_embed_dim > passed_add_embed_dim
896
+ and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
897
+ ):
898
+ raise ValueError(
899
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
900
+ )
901
+ elif (
902
+ expected_add_embed_dim < passed_add_embed_dim
903
+ and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
904
+ ):
905
+ raise ValueError(
906
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
907
+ )
908
+ elif expected_add_embed_dim != passed_add_embed_dim:
909
+ raise ValueError(
910
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
911
+ )
912
+
913
+ add_time_ids = paddle.to_tensor([add_time_ids], dtype=dtype)
914
+ add_neg_time_ids = paddle.to_tensor([add_neg_time_ids], dtype=dtype)
915
+
916
+ return add_time_ids, add_neg_time_ids
917
+
918
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
919
def upcast_vae(self):
    """Switch the VAE to float32, keeping selected sub-modules in the original dtype when possible.

    The whole VAE is first cast to ``paddle.float32``. If the processor of the first
    attention layer in the decoder's mid block is one of the memory-efficient
    processor classes listed below, ``post_quant_conv``, ``decoder.conv_in`` and
    ``decoder.mid_block`` are cast back to the dtype the VAE had on entry.
    """
    original_dtype = self.vae.dtype
    self.vae.to(dtype=paddle.float32)

    first_attn_processor = self.vae.decoder.mid_block.attentions[0].processor
    memory_efficient_processors = (
        AttnProcessor2_5,
        XFormersAttnProcessor,
        LoRAXFormersAttnProcessor,
        LoRAAttnProcessor2_5,
    )
    if not isinstance(first_attn_processor, memory_efficient_processors):
        return

    # With these attention processors the attention block does not need to run
    # in float32, which can save a lot of memory.
    for submodule in (
        self.vae.post_quant_conv,
        self.vae.decoder.conv_in,
        self.vae.decoder.mid_block,
    ):
        submodule.to(dtype=original_dtype)
937
+
938
@property
def guidance_scale(self):
    """Guidance scale stored on the pipeline as ``_guidance_scale`` (set inside ``__call__``)."""
    scale = self._guidance_scale
    return scale
941
+
942
@property
def clip_skip(self):
    """``clip_skip`` value stored on the pipeline as ``_clip_skip`` (set inside ``__call__``)."""
    skipped_layers = self._clip_skip
    return skipped_layers
945
+
946
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
947
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
948
+ # corresponds to doing no classifier free guidance.
949
@property
def do_classifier_free_guidance(self):
    """Whether classifier-free guidance is active, i.e. the stored guidance scale exceeds 1."""
    scale = self._guidance_scale
    return scale > 1
952
+
953
@property
def cross_attention_kwargs(self):
    """Cross-attention kwargs stored on the pipeline as ``_cross_attention_kwargs`` (set inside ``__call__``)."""
    attn_kwargs = self._cross_attention_kwargs
    return attn_kwargs
956
+
957
@property
def num_timesteps(self):
    """Number of timesteps stored on the pipeline as ``_num_timesteps`` (set inside ``__call__``)."""
    step_count = self._num_timesteps
    return step_count
960
+
961
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    prompt_2: Optional[Union[str, List[str]]] = None,
    image: PipelineImageInput = None,
    mask_image: PipelineImageInput = None,
    control_image: Union[
        PipelineImageInput,
        List[PipelineImageInput],
    ] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    strength: float = 0.9999,
    num_inference_steps: int = 50,
    denoising_start: Optional[float] = None,
    denoising_end: Optional[float] = None,
    guidance_scale: float = 5.0,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    negative_prompt_2: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
    latents: Optional[paddle.Tensor] = None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    pooled_prompt_embeds: Optional[paddle.Tensor] = None,
    negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
    guess_mode: bool = False,
    control_guidance_start: Union[float, List[float]] = 0.0,
    control_guidance_end: Union[float, List[float]] = 1.0,
    guidance_rescale: float = 0.0,
    original_size: Tuple[int, int] = None,
    crops_coords_top_left: Tuple[int, int] = (0, 0),
    target_size: Tuple[int, int] = None,
    aesthetic_score: float = 6.0,
    negative_aesthetic_score: float = 2.5,
    clip_skip: Optional[int] = None,
    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    **kwargs,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
            instead.
        prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
            used in both text-encoders.
        image (`PIL.Image.Image`):
            `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
            be masked out with `mask_image` and repainted according to `prompt`.
        mask_image (`PIL.Image.Image`):
            `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
            repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
            to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
            instead of 3, so the expected shape would be `(B, H, W, 1)`.
        control_image (`PipelineImageInput` or `List[PipelineImageInput]`):
            The ControlNet conditioning input. It is preprocessed with `prepare_control_image` and passed to the
            ControlNet as `controlnet_cond`. When the pipeline holds a `MultiControlNetModel`, this must be an
            iterable with one entry per ControlNet.
        height (`int`, *optional*):
            Height forwarded to the image, mask and control-image preprocessors. The height actually used for the
            latents is read back from the preprocessed `image`.
        width (`int`, *optional*):
            Width forwarded to the image, mask and control-image preprocessors. The width actually used for the
            latents is read back from the preprocessed `image`.
        strength (`float`, *optional*, defaults to 0.9999):
            Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
            between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
            `strength`. The number of denoising steps depends on the amount of noise initially added. When
            `strength` is 1, added noise will be maximum and the denoising process will run for the full number of
            iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked
            portion of the reference `image`. Note that when a valid `denoising_start` is passed, the value of
            `strength` will be ignored.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        denoising_start (`float`, *optional*):
            When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
            bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
            it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
            strength will be ignored. Only `float` values strictly between 0 and 1 are forwarded to the timestep
            computation; any other value is treated as not specified there. The `denoising_start` parameter is
            particularly beneficial when this pipeline is integrated into a "Mixture of Denoisers" multi-pipeline
            setup, as detailed in [**Refining the Image
            Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
        denoising_end (`float`, *optional*):
            When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
            completed before it is intentionally prematurely terminated. As a result, the returned sample will
            still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
            denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
            final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
            forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
            Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
        guidance_scale (`float`, *optional*, defaults to 5.0):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
            usually at the expense of lower image quality.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        negative_prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
            `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`paddle.Generator`, *optional*):
            One or a list of [paddle generator(s)] to make generation deterministic.
        latents (`paddle.Tensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. With `"latent"` the
            final latents are returned without being decoded.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of
            a plain tuple.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            Its `"scale"` entry, if present, is also used as the LoRA scale when encoding the prompt.
        controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
            Scale passed to the ControlNet as `conditioning_scale` (multiplied by the per-step keep flag derived
            from `control_guidance_start` / `control_guidance_end`). A single float is repeated for every net of
            a `MultiControlNetModel`.
        guess_mode (`bool`, *optional*, defaults to `False`):
            Forwarded to the ControlNet. When combined with classifier-free guidance, the ControlNet is run only
            on the conditional batch and zeros are used as its residuals for the unconditional batch.
        control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
            Fraction of the denoising steps before which the ControlNet output is disabled.
        control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
            Fraction of the denoising steps after which the ControlNet output is disabled.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            When greater than 0 and classifier-free guidance is active, the guided noise prediction is rescaled
            with `rescale_noise_cfg` (see section 3.4 of https://arxiv.org/pdf/2305.08891.pdf).
        original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
            `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
            explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
            `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
            `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
            `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            For most cases, `target_size` should be set to the desired height and width of the generated image. If
            not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
            section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        aesthetic_score (`float`, *optional*, defaults to 6.0):
            Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
            Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
            Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
            simulate an aesthetic score of the generated image by influencing the negative text condition.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.
        callback_on_step_end (`Callable`, *optional*):
            A function that is called at the end of each denoising step during inference. The function is called
            with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
            callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
            `callback_on_step_end_tensor_inputs`.
        callback_on_step_end_tensor_inputs (`List`, *optional*):
            The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
            will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
            `._callback_tensor_inputs` attribute of your pipeline class.
        kwargs:
            Only the deprecated `callback` and `callback_steps` entries are read.

    Examples:

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
        `tuple`. When returning a tuple, the first element is a list with the generated images. With
        `output_type="latent"` the output object is returned regardless of `return_dict`.
    """

    # Deprecated per-step callback interface, still accepted through **kwargs.
    callback = kwargs.pop("callback", None)
    callback_steps = kwargs.pop("callback_steps", None)

    if callback is not None:
        deprecate(
            "callback",
            "1.0.0",
            "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
        )
    if callback_steps is not None:
        deprecate(
            "callback_steps",
            "1.0.0",
            "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
        )

    controlnet = self.controlnet

    # 0. Align format for control guidance: after this both values are lists of equal length.
    if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
        control_guidance_start = len(control_guidance_end) * [control_guidance_start]
    elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
        control_guidance_end = len(control_guidance_start) * [control_guidance_end]
    elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
        mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
        control_guidance_start, control_guidance_end = (
            mult * [control_guidance_start],
            mult * [control_guidance_end],
        )

    # NOTE: `height` / `width` are passed through as given (possibly None); the size actually
    # used is read back from the preprocessed `init_image` in step 5.3.

    # 1. Check inputs
    self.check_inputs(
        prompt,
        prompt_2,
        control_image,
        strength,
        num_inference_steps,
        callback_steps,
        negative_prompt,
        negative_prompt_2,
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
        controlnet_conditioning_scale,
        control_guidance_start,
        control_guidance_end,
        callback_on_step_end_tensor_inputs,
    )

    # Stored so the read-only properties (and helpers relying on them) see this call's values.
    self._guidance_scale = guidance_scale
    self._clip_skip = clip_skip
    self._cross_attention_kwargs = cross_attention_kwargs

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
        controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
    )

    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = self.encode_prompt(
        prompt=prompt,
        prompt_2=prompt_2,
        num_images_per_prompt=num_images_per_prompt,
        do_classifier_free_guidance=self.do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        negative_prompt_2=negative_prompt_2,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
        clip_skip=self.clip_skip,
    )

    # 4. Set timesteps
    def denoising_value_valid(dnv):
        # Validate the value that was passed in (not the enclosing `denoising_end`).
        return isinstance(dnv, float) and 0 < dnv < 1

    self.scheduler.set_timesteps(num_inference_steps)
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps,
        strength,
        # The validator must be *called*; a bare function object is always truthy.
        denoising_start=denoising_start if denoising_value_valid(denoising_start) else None,
    )
    # check that number of inference steps is not < 1 - as this doesn't make sense
    if num_inference_steps < 1:
        raise ValueError(
            f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline "
            f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
        )
    # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
    latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
    # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
    is_strength_max = strength == 1.0
    self._num_timesteps = len(timesteps)

    # 5. Preprocess mask and image - resizes image and mask w.r.t height and width
    # 5.1 Prepare init image
    init_image = self.image_processor.preprocess(image, height=height, width=width)
    init_image = init_image.cast(dtype=paddle.float32)

    # 5.2 Prepare control images
    if isinstance(controlnet, ControlNetModel):
        control_image = self.prepare_control_image(
            image=control_image,
            width=width,
            height=height,
            batch_size=batch_size * num_images_per_prompt,
            num_images_per_prompt=num_images_per_prompt,
            dtype=controlnet.dtype,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
            guess_mode=guess_mode,
        )
    elif isinstance(controlnet, MultiControlNetModel):
        control_images = []

        for control_image_ in control_image:
            control_image_ = self.prepare_control_image(
                image=control_image_,
                width=width,
                height=height,
                batch_size=batch_size * num_images_per_prompt,
                num_images_per_prompt=num_images_per_prompt,
                dtype=controlnet.dtype,
                do_classifier_free_guidance=self.do_classifier_free_guidance,
                guess_mode=guess_mode,
            )

            control_images.append(control_image_)

        control_image = control_images
    else:
        raise ValueError(f"{controlnet.__class__} is not supported.")

    # 5.3 Prepare mask
    mask = self.mask_processor.preprocess(mask_image, height=height, width=width)

    # Zero out the pixels that will be repainted (mask >= 0.5).
    masked_image = init_image * (mask < 0.5).cast(init_image.dtype)
    # From here on height/width are the spatial size of the preprocessed init image.
    _, _, height, width = init_image.shape

    # 6. Prepare latent variables
    num_channels_latents = self.vae.config.latent_channels
    num_channels_unet = self.unet.config.in_channels
    # A 4-channel UNet is not inpainting-specific: the image latents are needed to
    # re-impose the unmasked region after every step.
    return_image_latents = num_channels_unet == 4

    add_noise = denoising_start is None
    latents_outputs = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        generator,
        latents,
        image=init_image,
        timestep=latent_timestep,
        is_strength_max=is_strength_max,
        add_noise=add_noise,
        return_noise=True,
        return_image_latents=return_image_latents,
    )

    if return_image_latents:
        latents, noise, image_latents = latents_outputs
    else:
        latents, noise = latents_outputs

    # 7. Prepare mask latent variables
    mask, masked_image_latents = self.prepare_mask_latents(
        mask,
        masked_image,
        batch_size * num_images_per_prompt,
        height,
        width,
        prompt_embeds.dtype,
        generator,
        self.do_classifier_free_guidance,
    )

    # 8. Check that sizes of mask, masked image and latents match
    if num_channels_unet == 9:
        # default case for runwayml/stable-diffusion-inpainting
        num_channels_mask = mask.shape[1]
        num_channels_masked_image = masked_image_latents.shape[1]
        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
            raise ValueError(
                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
                " `pipeline.unet` or your `mask_image` or `image` input."
            )
    elif num_channels_unet != 4:
        raise ValueError(
            f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
        )

    # 8.1 Prepare extra step kwargs.
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 8.2 Create the per-step list stating which controlnets to keep:
    # 1.0 while the step lies inside [start, end] of the schedule, 0.0 otherwise.
    controlnet_keep = []
    for i in range(len(timesteps)):
        keeps = [
            1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
            for s, e in zip(control_guidance_start, control_guidance_end)
        ]
        if isinstance(self.controlnet, MultiControlNetModel):
            controlnet_keep.append(keeps)
        else:
            controlnet_keep.append(keeps[0])

    # 9. Derive the pixel-space size from the latents for the SDXL size conditioning
    height, width = latents.shape[-2:]
    height = height * self.vae_scale_factor
    width = width * self.vae_scale_factor

    original_size = original_size or (height, width)
    target_size = target_size or (height, width)

    # 10. Prepare added time ids & embeddings
    add_text_embeds = pooled_prompt_embeds
    if self.text_encoder_2 is None:
        text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
    else:
        text_encoder_projection_dim = self.text_encoder_2.config.projection_dim

    add_time_ids, add_neg_time_ids = self._get_add_time_ids(
        original_size,
        crops_coords_top_left,
        target_size,
        aesthetic_score,
        negative_aesthetic_score,
        dtype=prompt_embeds.dtype,
        text_encoder_projection_dim=text_encoder_projection_dim,
    )
    add_time_ids = add_time_ids.tile([batch_size * num_images_per_prompt, 1])

    if self.do_classifier_free_guidance:
        # Unconditional half first, conditional half second (matches the `chunk(2)` calls below).
        prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds], axis=0)
        add_text_embeds = paddle.concat([negative_pooled_prompt_embeds, add_text_embeds], axis=0)
        add_neg_time_ids = add_neg_time_ids.tile([batch_size * num_images_per_prompt, 1])
        add_time_ids = paddle.concat([add_neg_time_ids, add_time_ids], axis=0)

    # 11. Denoising loop
    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

    if (
        denoising_value_valid(denoising_end)
        and denoising_value_valid(denoising_start)
        and denoising_start >= denoising_end
    ):
        raise ValueError(
            f"`denoising_start`: {denoising_start} cannot be larger than or equal to `denoising_end`: "
            + f" {denoising_end} when using type float."
        )
    elif denoising_value_valid(denoising_end):
        # Stop early: keep only the timesteps at or above the cutoff implied by `denoising_end`.
        discrete_timestep_cutoff = int(
            round(
                self.scheduler.config.num_train_timesteps
                - (denoising_end * self.scheduler.config.num_train_timesteps)
            )
        )
        num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
        timesteps = timesteps[:num_inference_steps]

    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = paddle.concat([latents] * 2) if self.do_classifier_free_guidance else latents

            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}

            # controlnet(s) inference
            if guess_mode and self.do_classifier_free_guidance:
                # Infer ControlNet only for the conditional batch.
                control_model_input = latents
                control_model_input = self.scheduler.scale_model_input(control_model_input, t)
                controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
                controlnet_added_cond_kwargs = {
                    "text_embeds": add_text_embeds.chunk(2)[1],
                    "time_ids": add_time_ids.chunk(2)[1],
                }
            else:
                control_model_input = latent_model_input
                controlnet_prompt_embeds = prompt_embeds
                controlnet_added_cond_kwargs = added_cond_kwargs

            if isinstance(controlnet_keep[i], list):
                cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
            else:
                controlnet_cond_scale = controlnet_conditioning_scale
                if isinstance(controlnet_cond_scale, list):
                    controlnet_cond_scale = controlnet_cond_scale[0]
                cond_scale = controlnet_cond_scale * controlnet_keep[i]

            down_block_res_samples, mid_block_res_sample = self.controlnet(
                control_model_input,
                t,
                encoder_hidden_states=controlnet_prompt_embeds,
                controlnet_cond=control_image,
                conditioning_scale=cond_scale,
                guess_mode=guess_mode,
                added_cond_kwargs=controlnet_added_cond_kwargs,
                return_dict=False,
            )

            if guess_mode and self.do_classifier_free_guidance:
                # Inferred ControlNet only for the conditional batch.
                # To apply the output of ControlNet to both the unconditional and conditional batches,
                # add 0 to the unconditional batch to keep it unchanged.
                down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples]
                mid_block_res_sample = paddle.concat(
                    [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]
                )

            if num_channels_unet == 9:
                # concat latents, mask, masked_image_latents in the channel dimension
                latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1)

            # predict the noise residual
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=self.cross_attention_kwargs,
                down_block_additional_residuals=down_block_res_samples,
                mid_block_additional_residual=mid_block_res_sample,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if self.do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            if self.do_classifier_free_guidance and guidance_rescale > 0.0:
                # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            if num_channels_unet == 4:
                # Re-impose the (re-noised) original image outside the mask.
                init_latents_proper = image_latents
                if self.do_classifier_free_guidance:
                    init_mask, _ = mask.chunk(2)
                else:
                    init_mask = mask

                if i < len(timesteps) - 1:
                    noise_timestep = timesteps[i + 1]
                    init_latents_proper = self.scheduler.add_noise(
                        init_latents_proper, noise, paddle.to_tensor([noise_timestep])
                    )

                latents = (1 - init_mask) * init_latents_proper + init_mask * latents

            if callback_on_step_end is not None:
                callback_kwargs = {}
                # Keep this an explicit loop: `locals()` must be evaluated in this function's scope.
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                latents = callback_outputs.pop("latents", latents)
                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

            # update the progress bar and call the deprecated callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    step_idx = i // getattr(self.scheduler, "order", 1)
                    callback(step_idx, t, latents)

    # make sure the VAE is in float32 mode, as it overflows in float16
    if self.vae.dtype in [paddle.float16, "float16"] and self.vae.config.force_upcast:
        self.upcast_vae()
        latents = latents.cast(dtype=next(iter(self.vae.post_quant_conv.named_parameters()))[1].dtype)

    if output_type == "latent":
        return StableDiffusionXLPipelineOutput(images=latents)

    image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]

    # apply watermark if available
    if self.watermark is not None:
        image = self.watermark.apply_watermark(image)

    image = self.image_processor.postprocess(image, output_type=output_type)

    if not return_dict:
        return (image,)

    return StableDiffusionXLPipelineOutput(images=image)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py ADDED
@@ -0,0 +1,1398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import inspect
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import paddle
21
+ import PIL.Image
22
+
23
+ from ppdiffusers.transformers import (
24
+ CLIPTextModel,
25
+ CLIPTextModelWithProjection,
26
+ CLIPTokenizer,
27
+ )
28
+
29
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
30
+ from ...loaders import StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
31
+ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
32
+ from ...models.attention_processor import (
33
+ AttnProcessor2_5,
34
+ LoRAAttnProcessor2_5,
35
+ LoRAXFormersAttnProcessor,
36
+ XFormersAttnProcessor,
37
+ )
38
+ from ...models.lora import adjust_lora_scale_text_encoder
39
+ from ...schedulers import KarrasDiffusionSchedulers
40
+ from ...utils import (
41
+ USE_PEFT_BACKEND,
42
+ deprecate,
43
+ is_pp_invisible_watermark_available,
44
+ logging,
45
+ replace_example_docstring,
46
+ )
47
+ from ...utils.paddle_utils import randn_tensor
48
+ from ..pipeline_utils import DiffusionPipeline
49
+ from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
50
+
51
+ if is_pp_invisible_watermark_available():
52
+ from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
53
+
54
+ from .multicontrolnet import MultiControlNetModel
55
+
56
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
57
+
58
+
59
+ EXAMPLE_DOC_STRING = """
60
+ Examples:
61
+ ```py
62
+ >>> # pip install paddlenlp safetensors ppdiffusers
63
+
64
+ >>> import paddle
65
+ >>> import numpy as np
66
+ >>> from PIL import Image
67
+
68
+ >>> from ppdiffusers.transformers import DPTFeatureExtractor, DPTForDepthEstimation
69
+ >>> from ppdiffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL
70
+ >>> from ppdiffusers.utils import load_image
71
+
72
+
73
+ >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas")
74
+ >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")
75
+ >>> controlnet = ControlNetModel.from_pretrained(
76
+ ... "diffusers/controlnet-depth-sdxl-1.0-small",
77
+ ... variant="fp16",
78
+ ... use_safetensors=True,
79
+ ... paddle_dtype=paddle.float16,
80
+ ... )
81
+ >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", paddle_dtype=paddle.float16)
82
+ >>> pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
83
+ ... "stabilityai/stable-diffusion-xl-base-1.0",
84
+ ... controlnet=controlnet,
85
+ ... vae=vae,
86
+ ... variant="fp16",
87
+ ... use_safetensors=True,
88
+ ... paddle_dtype=paddle.float16,
89
+ ... )
90
+
91
+
92
+ >>> def get_depth_map(image):
93
+ ... image = feature_extractor(images=image, return_tensors="pd").pixel_values
94
+ ... with paddle.no_grad(), paddle.amp.auto_cast():
95
+ ... depth_map = depth_estimator(image).predicted_depth
96
+
97
+ ... depth_map = paddle.nn.functional.interpolate(
98
+ ... depth_map.unsqueeze(1),
99
+ ... size=(1024, 1024),
100
+ ... mode="bicubic",
101
+ ... align_corners=False,
102
+ ... )
103
+ ... depth_min = paddle.amin(depth_map, axis=[1, 2, 3], keepdim=True)
104
+ ... depth_max = paddle.amax(depth_map, axis=[1, 2, 3], keepdim=True)
105
+ ... depth_map = (depth_map - depth_min) / (depth_max - depth_min)
106
+ ... image = paddle.concat([depth_map] * 3, axis=1)
107
+ ... image = image.transpose([0, 2, 3, 1]).cpu().numpy()[0]
108
+ ... image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
109
+ ... return image
110
+
111
+
112
+ >>> prompt = "A robot, 4k photo"
113
+ >>> image = load_image(
114
+ ... "https://hf-mirror.com/datasets/hf-internal-testing/diffusers-images/resolve/main"
115
+ ... "/kandinsky/cat.png"
116
+ ... ).resize((1024, 1024))
117
+ >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization
118
+ >>> depth_image = get_depth_map(image)
119
+
120
+ >>> images = pipe(
121
+ ... prompt,
122
+ ... image=image,
123
+ ... control_image=depth_image,
124
+ ... strength=0.99,
125
+ ... num_inference_steps=50,
126
+ ... controlnet_conditioning_scale=controlnet_conditioning_scale,
127
+ ... ).images
128
+ >>> images[0].save(f"robot_cat.png")
129
+ ```
130
+ """
131
+
132
+
133
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
134
def retrieve_latents(
    encoder_output: paddle.Tensor, generator: Optional[paddle.Generator] = None, sample_mode: str = "sample"
):
    """Extract latents from an encoder output object.

    Args:
        encoder_output: Object exposing either a `latent_dist` attribute or a `latents` attribute.
        generator: Forwarded to `latent_dist.sample` when `sample_mode` is `"sample"`.
        sample_mode: `"sample"` draws from `latent_dist`; `"argmax"` returns `latent_dist.mode()`.

    Returns:
        The sampled latents, the distribution mode, or the stored `latents` attribute.

    Raises:
        AttributeError: If none of the lookups above applies to `encoder_output`.
    """
    if hasattr(encoder_output, "latent_dist"):
        distribution = encoder_output.latent_dist
        if sample_mode == "sample":
            return distribution.sample(generator)
        if sample_mode == "argmax":
            return distribution.mode()

    # Any other sample mode (or a missing distribution) falls back to precomputed latents.
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents

    raise AttributeError("Could not access latents of provided encoder_output")
145
+
146
+
147
+ class StableDiffusionXLControlNetImg2ImgPipeline(
148
+ DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin
149
+ ):
150
+ r"""
151
+ Pipeline for image-to-image generation using Stable Diffusion XL with ControlNet guidance.
152
+
153
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
154
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
155
+
156
+ In addition the pipeline inherits the following loading methods:
157
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
158
+ - *LoRA*: [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`]
159
+
160
+ Args:
161
+ vae ([`AutoencoderKL`]):
162
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
163
+ text_encoder ([`CLIPTextModel`]):
164
+ Frozen text-encoder. Stable Diffusion uses the text portion of
165
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
166
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
167
+ text_encoder_2 ([`CLIPTextModelWithProjection`]):
168
+ Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
169
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
170
+ specifically the
171
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
172
+ variant.
173
+ tokenizer (`CLIPTokenizer`):
174
+ Tokenizer of class
175
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
176
+ tokenizer_2 (`CLIPTokenizer`):
177
+ Second Tokenizer of class
178
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
179
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
180
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
181
+ Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets
182
+ as a list, the outputs from each ControlNet are added together to create one combined additional
183
+ conditioning.
184
+ scheduler ([`SchedulerMixin`]):
185
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
186
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
187
+ requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
188
+ Whether the `unet` requires an `aesthetic_score` condition to be passed during inference. Also see the
189
+ config of `stabilityai/stable-diffusion-xl-refiner-1-0`.
190
+ force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
191
+ Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
192
+ `stabilityai/stable-diffusion-xl-base-1-0`.
193
+ add_watermarker (`bool`, *optional*):
194
+ Whether to use the [pp_invisible_watermark library](https://github.com/junnyu/pp-invisible-watermark/) to
195
+ watermark output images. If not defined, it will default to True if the package is installed, otherwise no
196
+ watermarker will be used.
197
+ """
198
+
199
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
200
+ _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
201
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
202
+
203
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    text_encoder_2: CLIPTextModelWithProjection,
    tokenizer: CLIPTokenizer,
    tokenizer_2: CLIPTokenizer,
    unet: UNet2DConditionModel,
    controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
    scheduler: KarrasDiffusionSchedulers,
    requires_aesthetics_score: bool = False,
    force_zeros_for_empty_prompt: bool = True,
    add_watermarker: Optional[bool] = None,
):
    """Register the pipeline components and derive the image processors.

    Args:
        vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2, unet, scheduler:
            Components registered on the pipeline through `register_modules`.
        controlnet: A single ControlNet, a `MultiControlNetModel`, or a list/tuple of ControlNets
            (a list/tuple is wrapped in `MultiControlNetModel` before registration).
        requires_aesthetics_score: Stored in the pipeline config.
        force_zeros_for_empty_prompt: Stored in the pipeline config.
        add_watermarker: Whether to create a `StableDiffusionXLWatermarker`. When `None`, the result of
            `is_pp_invisible_watermark_available()` decides.
    """
    super().__init__()

    # Several ControlNets passed as a plain sequence are bundled into one module.
    if isinstance(controlnet, (list, tuple)):
        controlnet = MultiControlNetModel(controlnet)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        text_encoder_2=text_encoder_2,
        tokenizer=tokenizer,
        tokenizer_2=tokenizer_2,
        unet=unet,
        controlnet=controlnet,
        scheduler=scheduler,
    )

    # The scale factor is 2 ** (number of VAE block levels - 1).
    vae_level_count = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (vae_level_count - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
    # Control images go through the same processor settings but without normalization.
    self.control_image_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
    )

    if add_watermarker is None:
        add_watermarker = is_pp_invisible_watermark_available()
    self.watermark = StableDiffusionXLWatermarker() if add_watermarker else None

    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
246
+
247
+ # Copied from ppdiffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
248
def encode_prompt(
    self,
    prompt: str,
    prompt_2: Optional[str] = None,
    num_images_per_prompt: int = 1,
    do_classifier_free_guidance: bool = True,
    negative_prompt: Optional[str] = None,
    negative_prompt_2: Optional[str] = None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    pooled_prompt_embeds: Optional[paddle.Tensor] = None,
    negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None,
    lora_scale: Optional[float] = None,
    clip_skip: Optional[int] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
            used in both text-encoders
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        negative_prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
            `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        lora_scale (`float`, *optional*):
            A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.

    Returns:
        A 4-tuple `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds,
        negative_pooled_prompt_embeds)`. The negative entries are returned as passed in (possibly `None`) when
        `do_classifier_free_guidance` is `False`.
    """
    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
        self._lora_scale = lora_scale

        # dynamically adjust the LoRA scale
        if self.text_encoder is not None:
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)

        if self.text_encoder_2 is not None:
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)

    prompt = [prompt] if isinstance(prompt, str) else prompt

    # The batch size comes from the prompt list when given, otherwise from the precomputed embeddings.
    if prompt is not None:
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # Define tokenizers and text encoders
    tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
    text_encoders = (
        [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
    )

    if prompt_embeds is None:
        prompt_2 = prompt_2 or prompt
        prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

        # textual inversion: process multi-vector tokens if necessary
        prompt_embeds_list = []
        prompts = [prompt, prompt_2]
        # NOTE: this loop rebinds `prompt`; after it finishes, `prompt` refers to the last entry that was
        # processed. zip() stops at the shortest input, so with a single tokenizer only `prompts[0]` is encoded.
        for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, tokenizer)

            text_inputs = tokenizer(
                prompt,
                padding="max_length",
                max_length=tokenizer.model_max_length,
                truncation=True,
                return_tensors="pd",
            )

            text_input_ids = text_inputs.input_ids
            # Tokenize again without truncation so that dropped text can be reported to the user.
            untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pd").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
                text_input_ids, untruncated_ids
            ):
                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {tokenizer.model_max_length} tokens: {removed_text}"
                )

            prompt_embeds = text_encoder(text_input_ids, output_hidden_states=True)

            # We are only ALWAYS interested in the pooled output of the final text encoder
            # (the assignment is overwritten on every iteration, so the last encoder's value is kept)
            pooled_prompt_embeds = prompt_embeds[0]
            if clip_skip is None:
                prompt_embeds = prompt_embeds.hidden_states[-2]
            else:
                # "2" because SDXL always indexes from the penultimate layer.
                prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]

            prompt_embeds_list.append(prompt_embeds)

        # Per-encoder hidden states are joined along the last axis.
        prompt_embeds = paddle.concat(prompt_embeds_list, axis=-1)

    # get unconditional embeddings for classifier free guidance
    zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
    if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
        negative_prompt_embeds = paddle.zeros_like(prompt_embeds)
        negative_pooled_prompt_embeds = paddle.zeros_like(pooled_prompt_embeds)
    elif do_classifier_free_guidance and negative_prompt_embeds is None:
        negative_prompt = negative_prompt or ""
        negative_prompt_2 = negative_prompt_2 or negative_prompt

        # normalize str to list
        negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
        negative_prompt_2 = (
            batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
        )

        uncond_tokens: List[str]
        # NOTE: `prompt` here may have been rebound by the encoding loop above.
        if prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = [negative_prompt, negative_prompt_2]

        negative_prompt_embeds_list = []
        for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
            if isinstance(self, TextualInversionLoaderMixin):
                negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)

            # Pad/truncate the negative prompt to the sequence length of the positive embeddings.
            max_length = prompt_embeds.shape[1]
            uncond_input = tokenizer(
                negative_prompt,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pd",
            )

            negative_prompt_embeds = text_encoder(
                uncond_input.input_ids,
                output_hidden_states=True,
            )
            # We are only ALWAYS interested in the pooled output of the final text encoder
            negative_pooled_prompt_embeds = negative_prompt_embeds[0]
            # `clip_skip` is not applied here: the penultimate hidden state is always used.
            negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

            negative_prompt_embeds_list.append(negative_prompt_embeds)

        negative_prompt_embeds = paddle.concat(negative_prompt_embeds_list, axis=-1)

    # Cast to the dtype of the second text encoder when present, otherwise to the UNet dtype.
    if self.text_encoder_2 is not None:
        prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder_2.dtype)
    else:
        prompt_embeds = prompt_embeds.cast(dtype=self.unet.dtype)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
    prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]

        if self.text_encoder_2 is not None:
            negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder_2.dtype)
        else:
            negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.unet.dtype)

        negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
        negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])

    # Pooled embeddings are duplicated the same way, once per generated image.
    pooled_prompt_embeds = pooled_prompt_embeds.tile([1, num_images_per_prompt]).reshape(
        [bs_embed * num_images_per_prompt, -1]
    )
    if do_classifier_free_guidance:
        negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.tile([1, num_images_per_prompt]).reshape(
            [bs_embed * num_images_per_prompt, -1]
        )

    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
462
+
463
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
464
def prepare_extra_step_kwargs(self, generator, eta):
    """Build the optional keyword arguments accepted by `self.scheduler.step`.

    Schedulers do not share one `step` signature, so `eta` and `generator` are only forwarded when
    the scheduler's `step` declares a parameter of that name.

    Args:
        generator: Value forwarded under the `"generator"` key when supported.
        eta: Value forwarded under the `"eta"` key when supported (η in the DDIM paper,
            https://arxiv.org/abs/2010.02502, expected to lie in [0, 1]).

    Returns:
        A dict containing at most the keys `"eta"` and `"generator"`, in that order.
    """
    step_parameters = inspect.signature(self.scheduler.step).parameters
    optional_arguments = (("eta", eta), ("generator", generator))
    return {name: value for name, value in optional_arguments if name in step_parameters}
480
+
481
def check_inputs(
    self,
    prompt,
    prompt_2,
    image,
    strength,
    num_inference_steps,
    callback_steps,
    negative_prompt=None,
    negative_prompt_2=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    pooled_prompt_embeds=None,
    negative_pooled_prompt_embeds=None,
    controlnet_conditioning_scale=1.0,
    control_guidance_start=0.0,
    control_guidance_end=1.0,
    callback_on_step_end_tensor_inputs=None,
):
    """Validate the arguments of a pipeline call and raise on the first inconsistency found.

    Args:
        prompt, prompt_2: Text prompts (`str` or `list`); mutually exclusive with `prompt_embeds`.
        image: Image input checked with `self.check_image`; must be a flat list with one entry per
            ControlNet when `self.controlnet` is a `MultiControlNetModel`.
        strength: Must lie in `[0, 1]`.
        num_inference_steps: Must be a positive `int`.
        callback_steps: `None` or a positive `int`.
        negative_prompt, negative_prompt_2: Mutually exclusive with `negative_prompt_embeds`.
        prompt_embeds, negative_prompt_embeds: Must have equal shapes when both are given.
        pooled_prompt_embeds, negative_pooled_prompt_embeds: Required alongside the respective
            non-pooled embeddings.
        controlnet_conditioning_scale: A `float` for a single ControlNet; for multiple ControlNets a
            flat list must have one entry per ControlNet.
        control_guidance_start, control_guidance_end: Scalars or equally long lists/tuples with
            `0.0 <= start < end <= 1.0` for every pair.
        callback_on_step_end_tensor_inputs: `None` or names drawn from `self._callback_tensor_inputs`.

    Raises:
        ValueError: For out-of-range values, conflicting or missing arguments, and length mismatches.
        TypeError: For a non-list `image` with multiple ControlNets or a non-float scale with a single one.
        AssertionError: If `self.controlnet` is neither a `ControlNetModel` nor a `MultiControlNetModel`.
    """
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
    if num_inference_steps is None:
        raise ValueError("`num_inference_steps` cannot be None.")
    elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
        raise ValueError(
            f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
            f" {type(num_inference_steps)}."
        )

    if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if callback_on_step_end_tensor_inputs is not None and not all(
        k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
    ):
        raise ValueError(
            f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt_2 is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
    elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
        raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )
    elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )

    if prompt_embeds is not None and pooled_prompt_embeds is None:
        raise ValueError(
            "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
        )

    if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
        raise ValueError(
            "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
        )

    # `prompt` needs more sophisticated handling when there are multiple
    # conditionings.
    if isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(prompt, list):
            logger.warning(
                f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
                " prompts. The conditionings will be fixed across the prompts."
            )

    # Check `image`
    if isinstance(self.controlnet, ControlNetModel):
        self.check_image(image, prompt, prompt_embeds)
    elif isinstance(self.controlnet, MultiControlNetModel):
        if not isinstance(image, list):
            raise TypeError("For multiple controlnets: `image` must be type `list`")

        # When `image` is a nested list:
        # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
        elif any(isinstance(i, list) for i in image):
            raise ValueError("A single batch of multiple conditionings are supported at the moment.")
        elif len(image) != len(self.controlnet.nets):
            raise ValueError(
                f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
            )

        for image_ in image:
            self.check_image(image_, prompt, prompt_embeds)
    else:
        # Raised explicitly so the guard is not stripped when Python runs with -O.
        raise AssertionError(f"Unsupported `controlnet` type: {type(self.controlnet)}")

    # Check `controlnet_conditioning_scale`
    if isinstance(self.controlnet, ControlNetModel):
        if not isinstance(controlnet_conditioning_scale, float):
            raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
    elif isinstance(self.controlnet, MultiControlNetModel):
        if isinstance(controlnet_conditioning_scale, list):
            if any(isinstance(i, list) for i in controlnet_conditioning_scale):
                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
            # The length check lives inside the list branch so that it is actually reachable.
            if len(controlnet_conditioning_scale) != len(self.controlnet.nets):
                raise ValueError(
                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
                    " the same length as the number of controlnets"
                )
    else:
        # Raised explicitly so the guard is not stripped when Python runs with -O.
        raise AssertionError(f"Unsupported `controlnet` type: {type(self.controlnet)}")

    # Scalars are wrapped so that start/end can be validated pairwise below.
    if not isinstance(control_guidance_start, (tuple, list)):
        control_guidance_start = [control_guidance_start]

    if not isinstance(control_guidance_end, (tuple, list)):
        control_guidance_end = [control_guidance_end]

    if len(control_guidance_start) != len(control_guidance_end):
        raise ValueError(
            f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
        )

    if isinstance(self.controlnet, MultiControlNetModel):
        if len(control_guidance_start) != len(self.controlnet.nets):
            raise ValueError(
                f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
            )

    for start, end in zip(control_guidance_start, control_guidance_end):
        if start >= end:
            raise ValueError(
                f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
            )
        if start < 0.0:
            raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
        if end > 1.0:
            raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
645
+
646
+ # Copied from ppdiffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image
647
def check_image(self, image, prompt, prompt_embeds):
    """Validate the type and batch size of one conditioning image input.

    Args:
        image: A PIL image, numpy array, paddle tensor, or a non-empty list
            whose first element is one of those types.
        prompt: ``str``, ``list`` of prompts, or ``None``.
        prompt_embeds: Optional tensor; its first dimension is used as the
            prompt batch size when ``prompt`` is neither a ``str`` nor a ``list``.

    Raises:
        TypeError: If ``image`` is none of the supported types (this now also
            covers an empty list, which previously failed with ``IndexError``).
        ValueError: If the image batch size is not 1 and differs from the
            prompt batch size.
    """
    supported_types = (PIL.Image.Image, paddle.Tensor, np.ndarray)

    image_is_pil = isinstance(image, PIL.Image.Image)
    image_is_single = isinstance(image, supported_types)
    # Only the first element is inspected, so the list must be non-empty;
    # an empty list falls through to the TypeError below.
    image_is_supported_list = (
        isinstance(image, list) and len(image) > 0 and isinstance(image[0], supported_types)
    )

    if not image_is_single and not image_is_supported_list:
        raise TypeError(
            f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}"
        )

    # A lone PIL image counts as a batch of one; everything else reports its
    # own length (list length, or first dimension for arrays/tensors).
    image_batch_size = 1 if image_is_pil else len(image)

    if isinstance(prompt, str):
        prompt_batch_size = 1
    elif isinstance(prompt, list):
        prompt_batch_size = len(prompt)
    elif prompt_embeds is not None:
        prompt_batch_size = prompt_embeds.shape[0]
    else:
        # No usable prompt information: nothing to compare against. The
        # original left the variable unbound here and crashed with
        # UnboundLocalError. NOTE(review): presumably the caller rejects a
        # missing prompt/prompt_embeds pair earlier - confirm.
        prompt_batch_size = None

    if prompt_batch_size is not None and image_batch_size != 1 and image_batch_size != prompt_batch_size:
        raise ValueError(
            f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
        )
683
+
684
+ # Copied from ppdiffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.prepare_image
685
def prepare_control_image(
    self,
    image,
    width,
    height,
    batch_size,
    num_images_per_prompt,
    dtype,
    do_classifier_free_guidance=False,
    guess_mode=False,
):
    """Preprocess a ControlNet conditioning image and expand it along the batch axis.

    The image goes through ``self.control_image_processor.preprocess`` and is
    cast to float32. It is then repeated along axis 0: ``batch_size`` times
    when the processed batch holds a single image, ``num_images_per_prompt``
    times otherwise. The result is cast to ``dtype`` and, when classifier-free
    guidance is active and ``guess_mode`` is off, concatenated with itself.

    Returns:
        The prepared conditioning tensor.
    """
    processed = self.control_image_processor.preprocess(image, height=height, width=width)
    processed = processed.cast(dtype=paddle.float32)

    # A single image is shared by every sample; otherwise the image batch is
    # taken to line up with the prompt batch and only the per-prompt copies
    # are added.
    repeats = batch_size if processed.shape[0] == 1 else num_images_per_prompt
    processed = processed.repeat_interleave(repeats, axis=0).cast(dtype=dtype)

    if guess_mode or not do_classifier_free_guidance:
        return processed

    # Duplicate so both halves of the guidance batch see the condition.
    return paddle.concat([processed] * 2)
713
+
714
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
715
def get_timesteps(self, num_inference_steps, strength):
    """Select the tail of the scheduler's timesteps that img2img will run.

    Args:
        num_inference_steps: Total number of steps configured on the scheduler.
        strength: Fraction of the schedule to run; ``int(num_inference_steps *
            strength)`` steps are kept, capped at ``num_inference_steps``.

    Returns:
        A tuple ``(timesteps, steps_remaining)`` where ``timesteps`` is the
        slice of ``self.scheduler.timesteps`` starting at the first kept step
        (scaled by ``self.scheduler.order``) and ``steps_remaining`` is the
        number of kept steps.
    """
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    first_step = max(num_inference_steps - steps_to_run, 0)

    # Each step occupies `order` consecutive entries in the timestep list.
    start_index = first_step * self.scheduler.order
    remaining_timesteps = self.scheduler.timesteps[start_index:]

    return remaining_timesteps, num_inference_steps - first_step
723
+
724
+ # Copied from ppdiffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.prepare_latents
725
def prepare_latents(
    self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None, add_noise=True
):
    """Build the starting latents for img2img from ``image``.

    ``image`` is cast to ``dtype``. When its second dimension has size 4 it is
    used as latents unchanged; otherwise it is encoded with ``self.vae`` and
    multiplied by ``self.vae.config.scaling_factor``. The latents are then
    tiled up to ``batch_size * num_images_per_prompt`` and, if ``add_noise``
    is true, noised through ``self.scheduler.add_noise`` at ``timestep``.

    Args:
        image: Input image tensor (or latents with 4 channels on axis 1).
        timestep: Timestep passed to ``self.scheduler.add_noise``.
        batch_size: Prompt batch size (before per-prompt multiplication).
        num_images_per_prompt: Number of samples per prompt.
        dtype: Target dtype of the returned latents.
        generator: Optional generator, or a list with one generator per sample.
        add_noise: Whether to add scheduler noise to the latents.

    Returns:
        The prepared latents tensor.

    Raises:
        ValueError: If ``image`` has an unsupported type, if a generator list
            does not match the effective batch size, or if the image/latent
            batch cannot be tiled evenly to the effective batch size.
    """
    if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)):
        raise ValueError(
            f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
        )

    # NOTE(review): `.cast` is called unconditionally, so in practice a tensor
    # is expected here even though the check above admits PIL images/lists.
    image = image.cast(dtype=dtype)

    batch_size = batch_size * num_images_per_prompt

    if image.shape[1] == 4:
        # Four channels on axis 1: treat the input as already-encoded latents.
        init_latents = image

    else:
        per_sample_generators = isinstance(generator, list)

        # Validate before touching the VAE so an argument error does not
        # leave the VAE upcast to float32.
        if per_sample_generators:
            if len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )

            # Each generator encodes `image[i : i + 1]`; with fewer images than
            # generators those slices would be empty, so tile the images first.
            image_count = image.shape[0]
            if image_count < batch_size:
                if batch_size % image_count != 0:
                    raise ValueError(
                        f"Cannot duplicate `image` of batch size {image_count} to {batch_size} text prompts."
                    )
                image = paddle.concat([image] * (batch_size // image_count), axis=0)

        # make sure the VAE is in float32 mode, as it overflows in float16
        if self.vae.config.force_upcast:
            image = image.cast(paddle.float32)
            self.vae.to(dtype=paddle.float32)

        if per_sample_generators:
            init_latents = [
                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
                for i in range(batch_size)
            ]
            init_latents = paddle.concat(init_latents, axis=0)
        else:
            init_latents = retrieve_latents(self.vae.encode(image), generator=generator)

        # Restore the VAE to the working dtype after encoding.
        if self.vae.config.force_upcast:
            self.vae.to(dtype=dtype)

        init_latents = init_latents.cast(dtype)
        init_latents = self.vae.config.scaling_factor * init_latents

    if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
        # expand init_latents for batch_size
        additional_image_per_prompt = batch_size // init_latents.shape[0]
        init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0)
    elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
        raise ValueError(
            f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
        )
    else:
        init_latents = paddle.concat([init_latents], axis=0)

    if add_noise:
        shape = init_latents.shape
        noise = randn_tensor(shape, generator=generator, dtype=dtype)
        # get latents
        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)

    latents = init_latents

    return latents
787
+
788
+ # Copied from ppdiffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids
789
def _get_add_time_ids(
    self,
    original_size,
    crops_coords_top_left,
    target_size,
    aesthetic_score,
    negative_aesthetic_score,
    negative_original_size,
    negative_crops_coords_top_left,
    negative_target_size,
    dtype,
    text_encoder_projection_dim=None,
):
    """Assemble the positive and negative added time-id tensors.

    When ``self.config.requires_aesthetics_score`` is set, the ids are
    ``original_size + crops + (aesthetic score,)``; otherwise they are
    ``original_size + crops + target_size``. The resulting embedding width
    (``addition_time_embed_dim`` per id plus ``text_encoder_projection_dim``)
    must equal the input width of ``self.unet.add_embedding.linear_1``.

    Returns:
        ``(add_time_ids, add_neg_time_ids)``, each a tensor of ``dtype``
        created from a one-row nested list.

    Raises:
        ValueError: If the computed embedding width does not match what the
            UNet expects.
    """
    if self.config.requires_aesthetics_score:
        positive_ids = original_size + crops_coords_top_left + (aesthetic_score,)
        negative_ids = negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
    else:
        positive_ids = original_size + crops_coords_top_left + target_size
        # NOTE(review): this branch reuses `crops_coords_top_left` for the
        # negative ids; `negative_crops_coords_top_left` is only consumed in
        # the aesthetics branch. Confirm whether that is intended.
        negative_ids = negative_original_size + crops_coords_top_left + negative_target_size

    add_time_ids = list(positive_ids)
    add_neg_time_ids = list(negative_ids)

    dim_per_id = self.unet.config.addition_time_embed_dim
    passed_add_embed_dim = dim_per_id * len(add_time_ids) + text_encoder_projection_dim
    expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

    if expected_add_embed_dim != passed_add_embed_dim:
        mismatch = f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created."

        if (
            expected_add_embed_dim > passed_add_embed_dim
            and (expected_add_embed_dim - passed_add_embed_dim) == dim_per_id
        ):
            # Exactly one id short: the aesthetics score is probably missing.
            hint = f" Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
        elif (
            expected_add_embed_dim < passed_add_embed_dim
            and (passed_add_embed_dim - expected_add_embed_dim) == dim_per_id
        ):
            # Exactly one id too many: target size was passed instead of a score.
            hint = f" Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
        else:
            hint = " The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."

        raise ValueError(mismatch + hint)

    return (
        paddle.to_tensor([add_time_ids], dtype=dtype),
        paddle.to_tensor([add_neg_time_ids], dtype=dtype),
    )
839
+
840
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
841
def upcast_vae(self):
    """Cast the VAE to float32, keeping selected sub-modules in the prior dtype.

    The whole VAE is moved to float32. If the first attention processor of
    the decoder mid-block is one of the memory-efficient processor classes,
    ``post_quant_conv``, ``decoder.conv_in`` and ``decoder.mid_block`` are
    cast back to the dtype the VAE had before the call, since with those
    processors the attention block does not need float32 and this saves memory.
    """
    previous_dtype = self.vae.dtype
    self.vae.to(dtype=paddle.float32)

    memory_efficient_processors = (
        AttnProcessor2_5,
        XFormersAttnProcessor,
        LoRAXFormersAttnProcessor,
        LoRAAttnProcessor2_5,
    )
    mid_block_processor = self.vae.decoder.mid_block.attentions[0].processor
    if not isinstance(mid_block_processor, memory_efficient_processors):
        return

    for module in (self.vae.post_quant_conv, self.vae.decoder.conv_in, self.vae.decoder.mid_block):
        module.to(dtype=previous_dtype)
859
+
860
@property
def guidance_scale(self):
    """Guidance scale stored on the pipeline as ``_guidance_scale``."""
    scale = self._guidance_scale
    return scale
863
+
864
@property
def clip_skip(self):
    """CLIP layer-skip setting stored on the pipeline as ``_clip_skip``."""
    skip = self._clip_skip
    return skip
867
+
868
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
869
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
870
+ # corresponds to doing no classifier free guidance.
871
@property
def do_classifier_free_guidance(self):
    """Whether classifier-free guidance is active (``_guidance_scale > 1``)."""
    guidance_enabled = self._guidance_scale > 1
    return guidance_enabled
874
+
875
@property
def cross_attention_kwargs(self):
    """Cross-attention kwargs stored on the pipeline as ``_cross_attention_kwargs``."""
    attention_kwargs = self._cross_attention_kwargs
    return attention_kwargs
878
+
879
@property
def num_timesteps(self):
    """Number of timesteps stored on the pipeline as ``_num_timesteps``."""
    count = self._num_timesteps
    return count
882
+
883
+ @paddle.no_grad()
884
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
885
+ def __call__(
886
+ self,
887
+ prompt: Union[str, List[str]] = None,
888
+ prompt_2: Optional[Union[str, List[str]]] = None,
889
+ image: PipelineImageInput = None,
890
+ control_image: PipelineImageInput = None,
891
+ height: Optional[int] = None,
892
+ width: Optional[int] = None,
893
+ strength: float = 0.8,
894
+ num_inference_steps: int = 50,
895
+ guidance_scale: float = 5.0,
896
+ negative_prompt: Optional[Union[str, List[str]]] = None,
897
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
898
+ num_images_per_prompt: Optional[int] = 1,
899
+ eta: float = 0.0,
900
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
901
+ latents: Optional[paddle.Tensor] = None,
902
+ prompt_embeds: Optional[paddle.Tensor] = None,
903
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
904
+ pooled_prompt_embeds: Optional[paddle.Tensor] = None,
905
+ negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None,
906
+ output_type: Optional[str] = "pil",
907
+ return_dict: bool = True,
908
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
909
+ controlnet_conditioning_scale: Union[float, List[float]] = 0.8,
910
+ guess_mode: bool = False,
911
+ control_guidance_start: Union[float, List[float]] = 0.0,
912
+ control_guidance_end: Union[float, List[float]] = 1.0,
913
+ original_size: Tuple[int, int] = None,
914
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
915
+ target_size: Tuple[int, int] = None,
916
+ negative_original_size: Optional[Tuple[int, int]] = None,
917
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
918
+ negative_target_size: Optional[Tuple[int, int]] = None,
919
+ aesthetic_score: float = 6.0,
920
+ negative_aesthetic_score: float = 2.5,
921
+ clip_skip: Optional[int] = None,
922
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
923
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
924
+ **kwargs,
925
+ ):
926
+ r"""
927
+ Function invoked when calling the pipeline for generation.
928
+
929
+ Args:
930
+ prompt (`str` or `List[str]`, *optional*):
931
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
932
+ instead.
933
+ prompt_2 (`str` or `List[str]`, *optional*):
934
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
935
+ used in both text-encoders
936
+ image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
937
+ `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
938
+ The initial image will be used as the starting point for the image generation process. Can also accept
939
+ image latents as `image`, if passing latents directly, it will not be encoded again.
940
+ control_image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
941
+ `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
942
+ The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
943
+ the type is specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
944
+ also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
945
+ height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
946
+ specified in init, images must be passed as a list such that each element of the list can be correctly
947
+ batched for input to a single controlnet.
948
+ height (`int`, *optional*, defaults to the size of control_image):
949
+ The height in pixels of the generated image. Anything below 512 pixels won't work well for
950
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
951
+ and checkpoints that are not specifically fine-tuned on low resolutions.
952
+ width (`int`, *optional*, defaults to the size of control_image):
953
+ The width in pixels of the generated image. Anything below 512 pixels won't work well for
954
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
955
+ and checkpoints that are not specifically fine-tuned on low resolutions.
956
+ num_inference_steps (`int`, *optional*, defaults to 50):
957
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
958
+ expense of slower inference.
959
+ strength (`float`, *optional*, defaults to 0.8):
960
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
961
+ will be used as a starting point, adding more noise to it the larger the `strength`. The number of
962
+ denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
963
+ be maximum and the denoising process will run for the full number of iterations specified in
964
+ `num_inference_steps`.
965
+ guidance_scale (`float`, *optional*, defaults to 5.0):
966
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
967
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
968
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
969
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
970
+ usually at the expense of lower image quality.
971
+ negative_prompt (`str` or `List[str]`, *optional*):
972
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
973
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
974
+ less than `1`).
975
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
976
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
977
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
978
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
979
+ The number of images to generate per prompt.
980
+ eta (`float`, *optional*, defaults to 0.0):
981
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
982
+ [`schedulers.DDIMScheduler`], will be ignored for others.
983
+ generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
984
+ One or a list of [paddle generator(s)] to make generation deterministic.
985
+ latents (`paddle.Tensor`, *optional*):
986
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
987
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
988
+ tensor will be generated by sampling using the supplied random `generator`.
989
+ prompt_embeds (`paddle.Tensor`, *optional*):
990
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
991
+ provided, text embeddings will be generated from `prompt` input argument.
992
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
993
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
994
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
995
+ argument.
996
+ pooled_prompt_embeds (`paddle.Tensor`, *optional*):
997
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
998
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
999
+ negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
1000
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
1001
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
1002
+ input argument.
1003
+ output_type (`str`, *optional*, defaults to `"pil"`):
1004
+ The output format of the generate image. Choose between
1005
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1006
+ return_dict (`bool`, *optional*, defaults to `True`):
1007
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1008
+ plain tuple.
1009
+ cross_attention_kwargs (`dict`, *optional*):
1010
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
1011
+ `self.processor` in
1012
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1013
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.8):
1014
+ The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
1015
+ to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
1016
+ corresponding scale as a list.
1017
+ guess_mode (`bool`, *optional*, defaults to `False`):
1018
+ In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
1019
+ you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
1020
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
1021
+ The percentage of total steps at which the controlnet starts applying.
1022
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
1023
+ The percentage of total steps at which the controlnet stops applying.
1024
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1025
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
1026
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
1027
+ explained in section 2.2 of
1028
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
1029
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
1030
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
1031
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
1032
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
1033
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
1034
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1035
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
1036
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
1037
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
1038
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1039
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
1040
+ micro-conditioning as explained in section 2.2 of
1041
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
1042
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
1043
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
1044
+ To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
1045
+ micro-conditioning as explained in section 2.2 of
1046
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
1047
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
1048
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1049
+ To negatively condition the generation process based on a target image resolution. It should be as same
1050
+ as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
1051
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
1052
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
1053
+ aesthetic_score (`float`, *optional*, defaults to 6.0):
1054
+ Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
1055
+ Part of SDXL's micro-conditioning as explained in section 2.2 of
1056
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
1057
+ negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
1058
+ Part of SDXL's micro-conditioning as explained in section 2.2 of
1059
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
1060
+ simulate an aesthetic score of the generated image by influencing the negative text condition.
1061
+ clip_skip (`int`, *optional*):
1062
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
1063
+ the output of the pre-final layer will be used for computing the prompt embeddings.
1064
+ callback_on_step_end (`Callable`, *optional*):
1065
+ A function that calls at the end of each denoising steps during the inference. The function is called
1066
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
1067
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
1068
+ `callback_on_step_end_tensor_inputs`.
1069
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
1070
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
1071
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
1072
+ `._callback_tensor_inputs` attribute of your pipeline class.
1073
+
1074
+ Examples:
1075
+
1076
+ Returns:
1077
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1078
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`
1079
+ containing the output images.
1080
+ """
1081
+
1082
+ callback = kwargs.pop("callback", None)
1083
+ callback_steps = kwargs.pop("callback_steps", None)
1084
+
1085
+ if callback is not None:
1086
+ deprecate(
1087
+ "callback",
1088
+ "1.0.0",
1089
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
1090
+ )
1091
+ if callback_steps is not None:
1092
+ deprecate(
1093
+ "callback_steps",
1094
+ "1.0.0",
1095
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
1096
+ )
1097
+
1098
+ controlnet = self.controlnet
1099
+
1100
+ # align format for control guidance
1101
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
1102
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
1103
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
1104
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
1105
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
1106
+ mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
1107
+ control_guidance_start, control_guidance_end = (
1108
+ mult * [control_guidance_start],
1109
+ mult * [control_guidance_end],
1110
+ )
1111
+
1112
+ # 1. Check inputs. Raise error if not correct
1113
+ self.check_inputs(
1114
+ prompt,
1115
+ prompt_2,
1116
+ control_image,
1117
+ strength,
1118
+ num_inference_steps,
1119
+ callback_steps,
1120
+ negative_prompt,
1121
+ negative_prompt_2,
1122
+ prompt_embeds,
1123
+ negative_prompt_embeds,
1124
+ pooled_prompt_embeds,
1125
+ negative_pooled_prompt_embeds,
1126
+ controlnet_conditioning_scale,
1127
+ control_guidance_start,
1128
+ control_guidance_end,
1129
+ callback_on_step_end_tensor_inputs,
1130
+ )
1131
+
1132
+ self._guidance_scale = guidance_scale
1133
+ self._clip_skip = clip_skip
1134
+ self._cross_attention_kwargs = cross_attention_kwargs
1135
+
1136
+ # 2. Define call parameters
1137
+ if prompt is not None and isinstance(prompt, str):
1138
+ batch_size = 1
1139
+ elif prompt is not None and isinstance(prompt, list):
1140
+ batch_size = len(prompt)
1141
+ else:
1142
+ batch_size = prompt_embeds.shape[0]
1143
+
1144
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
1145
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
1146
+
1147
+ global_pool_conditions = (
1148
+ controlnet.config.global_pool_conditions
1149
+ if isinstance(controlnet, ControlNetModel)
1150
+ else controlnet.nets[0].config.global_pool_conditions
1151
+ )
1152
+ guess_mode = guess_mode or global_pool_conditions
1153
+
1154
+ # 3. Encode input prompt
1155
+ text_encoder_lora_scale = (
1156
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
1157
+ )
1158
+ (
1159
+ prompt_embeds,
1160
+ negative_prompt_embeds,
1161
+ pooled_prompt_embeds,
1162
+ negative_pooled_prompt_embeds,
1163
+ ) = self.encode_prompt(
1164
+ prompt,
1165
+ prompt_2,
1166
+ num_images_per_prompt,
1167
+ self.do_classifier_free_guidance,
1168
+ negative_prompt,
1169
+ negative_prompt_2,
1170
+ prompt_embeds=prompt_embeds,
1171
+ negative_prompt_embeds=negative_prompt_embeds,
1172
+ pooled_prompt_embeds=pooled_prompt_embeds,
1173
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
1174
+ lora_scale=text_encoder_lora_scale,
1175
+ clip_skip=self.clip_skip,
1176
+ )
1177
+
1178
+ # 4. Prepare image and controlnet_conditioning_image
1179
+ image = self.image_processor.preprocess(image, height=height, width=width).cast(dtype=paddle.float32)
1180
+
1181
+ if isinstance(controlnet, ControlNetModel):
1182
+ control_image = self.prepare_control_image(
1183
+ image=control_image,
1184
+ width=width,
1185
+ height=height,
1186
+ batch_size=batch_size * num_images_per_prompt,
1187
+ num_images_per_prompt=num_images_per_prompt,
1188
+ dtype=controlnet.dtype,
1189
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
1190
+ guess_mode=guess_mode,
1191
+ )
1192
+ height, width = control_image.shape[-2:]
1193
+ elif isinstance(controlnet, MultiControlNetModel):
1194
+ control_images = []
1195
+
1196
+ for control_image_ in control_image:
1197
+ control_image_ = self.prepare_control_image(
1198
+ image=control_image_,
1199
+ width=width,
1200
+ height=height,
1201
+ batch_size=batch_size * num_images_per_prompt,
1202
+ num_images_per_prompt=num_images_per_prompt,
1203
+ dtype=controlnet.dtype,
1204
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
1205
+ guess_mode=guess_mode,
1206
+ )
1207
+
1208
+ control_images.append(control_image_)
1209
+
1210
+ control_image = control_images
1211
+ height, width = control_image[0].shape[-2:]
1212
+ else:
1213
+ assert False
1214
+
1215
+ # 5. Prepare timesteps
1216
+ self.scheduler.set_timesteps(num_inference_steps)
1217
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength)
1218
+ latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt])
1219
+ self._num_timesteps = len(timesteps)
1220
+
1221
+ # 6. Prepare latent variables
1222
+ latents = self.prepare_latents(
1223
+ image,
1224
+ latent_timestep,
1225
+ batch_size,
1226
+ num_images_per_prompt,
1227
+ prompt_embeds.dtype,
1228
+ generator,
1229
+ True,
1230
+ )
1231
+
1232
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1233
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1234
+
1235
+ # 7.1 Create tensor stating which controlnets to keep
1236
+ controlnet_keep = []
1237
+ for i in range(len(timesteps)):
1238
+ keeps = [
1239
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
1240
+ for s, e in zip(control_guidance_start, control_guidance_end)
1241
+ ]
1242
+ controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps)
1243
+
1244
+ # 7.2 Prepare added time ids & embeddings
1245
+ if isinstance(control_image, list):
1246
+ original_size = original_size or tuple(control_image[0].shape[-2:])
1247
+ else:
1248
+ original_size = original_size or tuple(control_image.shape[-2:])
1249
+ target_size = target_size or (height, width)
1250
+
1251
+ if negative_original_size is None:
1252
+ negative_original_size = original_size
1253
+ if negative_target_size is None:
1254
+ negative_target_size = target_size
1255
+ add_text_embeds = pooled_prompt_embeds
1256
+
1257
+ if self.text_encoder_2 is None:
1258
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
1259
+ else:
1260
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
1261
+
1262
+ add_time_ids, add_neg_time_ids = self._get_add_time_ids(
1263
+ original_size,
1264
+ crops_coords_top_left,
1265
+ target_size,
1266
+ aesthetic_score,
1267
+ negative_aesthetic_score,
1268
+ negative_original_size,
1269
+ negative_crops_coords_top_left,
1270
+ negative_target_size,
1271
+ dtype=prompt_embeds.dtype,
1272
+ text_encoder_projection_dim=text_encoder_projection_dim,
1273
+ )
1274
+ add_time_ids = add_time_ids.tile([batch_size * num_images_per_prompt, 1])
1275
+
1276
+ if self.do_classifier_free_guidance:
1277
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds], axis=0)
1278
+ add_text_embeds = paddle.concat([negative_pooled_prompt_embeds, add_text_embeds], axis=0)
1279
+ add_neg_time_ids = add_neg_time_ids.tile([batch_size * num_images_per_prompt, 1])
1280
+ add_time_ids = paddle.concat([add_neg_time_ids, add_time_ids], axis=0)
1281
+
1282
+ # 8. Denoising loop
1283
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1284
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1285
+ for i, t in enumerate(timesteps):
1286
+ # expand the latents if we are doing classifier free guidance
1287
+ latent_model_input = paddle.concat([latents] * 2) if self.do_classifier_free_guidance else latents
1288
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1289
+
1290
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
1291
+
1292
+ # controlnet(s) inference
1293
+ if guess_mode and self.do_classifier_free_guidance:
1294
+ # Infer ControlNet only for the conditional batch.
1295
+ control_model_input = latents
1296
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
1297
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
1298
+ controlnet_added_cond_kwargs = {
1299
+ "text_embeds": add_text_embeds.chunk(2)[1],
1300
+ "time_ids": add_time_ids.chunk(2)[1],
1301
+ }
1302
+ else:
1303
+ control_model_input = latent_model_input
1304
+ controlnet_prompt_embeds = prompt_embeds
1305
+ controlnet_added_cond_kwargs = added_cond_kwargs
1306
+
1307
+ if isinstance(controlnet_keep[i], list):
1308
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
1309
+ else:
1310
+ controlnet_cond_scale = controlnet_conditioning_scale
1311
+ if isinstance(controlnet_cond_scale, list):
1312
+ controlnet_cond_scale = controlnet_cond_scale[0]
1313
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
1314
+
1315
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
1316
+ control_model_input,
1317
+ t,
1318
+ encoder_hidden_states=controlnet_prompt_embeds,
1319
+ controlnet_cond=control_image,
1320
+ conditioning_scale=cond_scale,
1321
+ guess_mode=guess_mode,
1322
+ added_cond_kwargs=controlnet_added_cond_kwargs,
1323
+ return_dict=False,
1324
+ )
1325
+
1326
+ if guess_mode and self.do_classifier_free_guidance:
1327
+ # Infered ControlNet only for the conditional batch.
1328
+ # To apply the output of ControlNet to both the unconditional and conditional batches,
1329
+ # add 0 to the unconditional batch to keep it unchanged.
1330
+ down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples]
1331
+ mid_block_res_sample = paddle.concat(
1332
+ [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]
1333
+ )
1334
+
1335
+ # predict the noise residual
1336
+ noise_pred = self.unet(
1337
+ latent_model_input,
1338
+ t,
1339
+ encoder_hidden_states=prompt_embeds,
1340
+ cross_attention_kwargs=self.cross_attention_kwargs,
1341
+ down_block_additional_residuals=down_block_res_samples,
1342
+ mid_block_additional_residual=mid_block_res_sample,
1343
+ added_cond_kwargs=added_cond_kwargs,
1344
+ return_dict=False,
1345
+ )[0]
1346
+
1347
+ # perform guidance
1348
+ if self.do_classifier_free_guidance:
1349
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1350
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
1351
+
1352
+ # compute the previous noisy sample x_t -> x_t-1
1353
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1354
+
1355
+ if callback_on_step_end is not None:
1356
+ callback_kwargs = {}
1357
+ for k in callback_on_step_end_tensor_inputs:
1358
+ callback_kwargs[k] = locals()[k]
1359
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1360
+
1361
+ latents = callback_outputs.pop("latents", latents)
1362
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1363
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1364
+
1365
+ # call the callback, if provided
1366
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1367
+ progress_bar.update()
1368
+ if callback is not None and i % callback_steps == 0:
1369
+ step_idx = i // getattr(self.scheduler, "order", 1)
1370
+ callback(step_idx, t, latents)
1371
+
1372
+ if not output_type == "latent":
1373
+ # make sure the VAE is in float32 mode, as it overflows in float16
1374
+ needs_upcasting = self.vae.dtype in [paddle.float16, "float16"] and self.vae.config.force_upcast
1375
+
1376
+ if needs_upcasting:
1377
+ self.upcast_vae()
1378
+ latents = latents.cast(dtype=next(iter(self.vae.post_quant_conv.named_parameters()))[1].dtype)
1379
+
1380
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
1381
+
1382
+ # cast back to fp16 if needed
1383
+ if needs_upcasting:
1384
+ self.vae.to(dtype=paddle.float16)
1385
+ else:
1386
+ image = latents
1387
+ return StableDiffusionXLPipelineOutput(images=image)
1388
+
1389
+ # apply watermark if available
1390
+ if self.watermark is not None:
1391
+ image = self.watermark.apply_watermark(image)
1392
+
1393
+ image = self.image_processor.postprocess(image, output_type=output_type)
1394
+
1395
+ if not return_dict:
1396
+ return (image,)
1397
+
1398
+ return StableDiffusionXLPipelineOutput(images=image)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_fastdeploy_stable_diffusion_controlnet.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from ..stable_diffusion.pipeline_fastdeploy_stable_diffusion import (
17
+ FastDeployStableDiffusionPipeline,
18
+ )
19
+
20
+
21
class FastDeployStableDiffusionControlNetPipeline(FastDeployStableDiffusionPipeline):
    """FastDeploy Stable Diffusion pipeline variant that accepts the ControlNet
    conditioning input under either the `controlnet_cond` or the `image` keyword."""

    def __call__(self, *args, **kwargs):
        """Normalize the conditioning keyword and delegate to the parent pipeline.

        Both `controlnet_cond` and `image` are removed from `kwargs`. The parent is
        then called with a single `controlnet_cond` keyword: the explicit
        `controlnet_cond` when it is not None, otherwise the value of `image`
        (which may itself be None).
        """
        fallback_image = kwargs.pop("image", None)
        explicit_cond = kwargs.pop("controlnet_cond", None)
        kwargs["controlnet_cond"] = fallback_image if explicit_cond is None else explicit_cond
        return super().__call__(*args, **kwargs)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/controlnet/pipeline_paddleinfer_stable_diffusion_controlnet.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from ..stable_diffusion.pipeline_paddleinfer_stable_diffusion import (
17
+ PaddleInferStableDiffusionPipeline,
18
+ )
19
+
20
+
21
class PaddleInferStableDiffusionControlNetPipeline(PaddleInferStableDiffusionPipeline):
    """Paddle Inference Stable Diffusion pipeline variant whose ControlNet
    conditioning input may be passed as `controlnet_cond` or as `image`."""

    def __call__(self, *args, **kwargs):
        """Forward the call to the parent pipeline with one `controlnet_cond` keyword.

        `controlnet_cond` and `image` are both popped from `kwargs`; `image` is only
        used as the conditioning input when no (non-None) `controlnet_cond` was given.
        """
        cond = kwargs.pop("controlnet_cond", None)
        image = kwargs.pop("image", None)
        if cond is None:
            # No explicit conditioning input: fall back to `image`.
            cond = image
        kwargs["controlnet_cond"] = cond
        return super().__call__(*args, **kwargs)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/ddim/__init__.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from ...utils import PPDIFFUSERS_SLOW_IMPORT, _LazyModule
18
+
19
# Maps each submodule of this package to the public names it provides;
# handed to `_LazyModule` in the lazy branch below.
_import_structure = {"pipeline_ddim": ["DDIMPipeline"]}

if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
    # Eager path: import the pipeline class directly. Taken under static type
    # checking or when PPDIFFUSERS_SLOW_IMPORT is truthy.
    from .pipeline_ddim import DDIMPipeline
else:
    import sys

    # Lazy path: replace this package's entry in `sys.modules` with a
    # `_LazyModule` built from `_import_structure`.
    # NOTE(review): presumably `_LazyModule` defers importing `pipeline_ddim`
    # until `DDIMPipeline` is first accessed - confirm against `...utils`.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Optional, Tuple, Union
16
+
17
+ import paddle
18
+
19
+ from ...schedulers import DDIMScheduler
20
+ from ...utils.paddle_utils import randn_tensor
21
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
22
+
23
+
24
class DDIMPipeline(DiffusionPipeline):
    r"""
    Image-generation pipeline that starts from Gaussian noise and denoises it with a UNet and a DDIM scheduler.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        unet ([`UNet2DModel`]):
            A `UNet2DModel` used to predict the noise residual at every step.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet`. Its config is used to build a [`DDIMScheduler`],
            so it can be a [`DDPMScheduler`] or a [`DDIMScheduler`].
    """

    model_cpu_offload_seq = "unet"

    def __init__(self, unet, scheduler):
        super().__init__()

        # Whatever scheduler was passed in, rebuild it as a DDIMScheduler from its config.
        ddim_scheduler = DDIMScheduler.from_config(scheduler.config)
        self.register_modules(unet=unet, scheduler=ddim_scheduler)

    @paddle.no_grad()
    def __call__(
        self,
        batch_size: int = 1,
        generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
        eta: float = 0.0,
        num_inference_steps: int = 50,
        use_clipped_model_output: Optional[bool] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
        Generate a batch of images by iteratively denoising random noise.

        Args:
            batch_size (`int`, *optional*, defaults to 1):
                The number of images to generate.
            generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
                Generator(s) used to make generation deterministic. A list must contain exactly `batch_size`
                generators.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. A value
                of `0` corresponds to DDIM and `1` corresponds to DDPM.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More steps usually give a higher quality image at the expense of
                slower inference.
            use_clipped_model_output (`bool`, *optional*, defaults to `None`):
                Forwarded unchanged to the scheduler's `step`; see the documentation of [`DDIMScheduler.step`].
            output_type (`str`, *optional*, defaults to `"pil"`):
                `"pil"` converts the result with `numpy_to_pil`; any other value returns the NumPy array.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Example:

        ```py
        >>> from ppdiffusers import DDIMPipeline

        >>> # load model and scheduler
        >>> pipe = DDIMPipeline.from_pretrained("fusing/ddim-lsun-bedroom")

        >>> # run pipeline in inference (sample random noise and denoise)
        >>> images = pipe(eta=0.0, num_inference_steps=50).images

        >>> # save the first generated image
        >>> images[0].save("test.png")
        ```

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element holds the generated images.

        Raises:
            ValueError: If `generator` is a list whose length differs from `batch_size`.
        """
        # Shape of the initial noise: (batch, channels, *spatial); `sample_size` may be an int or a sequence.
        sample_size = self.unet.config.sample_size
        if isinstance(sample_size, int):
            noise_shape = (batch_size, self.unet.config.in_channels, sample_size, sample_size)
        else:
            noise_shape = (batch_size, self.unet.config.in_channels, *sample_size)

        if isinstance(generator, list):
            num_generators = len(generator)
            if num_generators != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {num_generators}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )

        # Sample gaussian noise to begin the loop.
        sample = randn_tensor(noise_shape, generator=generator, dtype=self.unet.dtype)

        # Configure the scheduler's timestep sequence.
        self.scheduler.set_timesteps(num_inference_steps)

        for timestep in self.progress_bar(self.scheduler.timesteps):
            # 1. Predict the noise for the current sample.
            noise_prediction = self.unet(sample, timestep).sample

            # 2. Step x_t -> x_t-1; eta corresponds to η in the paper and should be in [0, 1].
            step_output = self.scheduler.step(
                noise_prediction,
                timestep,
                sample,
                eta=eta,
                use_clipped_model_output=use_clipped_model_output,
                generator=generator,
            )
            sample = step_output.prev_sample

        # Map from [-1, 1] to [0, 1], then move channels last and convert to NumPy.
        sample = (sample / 2 + 0.5).clip(0, 1)
        images = sample.transpose([0, 2, 3, 1]).cpu().numpy()
        if output_type == "pil":
            images = self.numpy_to_pil(images)

        return ImagePipelineOutput(images=images) if return_dict else (images,)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
2
+ # William Peebles and Saining Xie
3
+ #
4
+ # Copyright (c) 2021 OpenAI
5
+ # MIT License
6
+ #
7
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ from typing import Dict, List, Optional, Tuple, Union
22
+
23
+ import paddle
24
+
25
+ from ...models import AutoencoderKL, Transformer2DModel
26
+ from ...schedulers import KarrasDiffusionSchedulers
27
+ from ...utils.paddle_utils import randn_tensor
28
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
29
+
30
try:
    # paddle.incubate.jit.inference is available in paddle develop but not in paddle 3.0beta, so we add a try except.
    from paddle.incubate.jit import is_inference_mode
except ImportError:
    # Only a failed import should trigger the fallback; a bare `except:` would
    # also swallow KeyboardInterrupt / SystemExit raised during import.

    def is_inference_mode(func):
        """Fallback used when paddle does not provide `is_inference_mode`.

        Always returns False, so callers take their non-inference code path.
        """
        return False
37
+
38
+
39
class DiTPipeline(DiffusionPipeline):
    r"""
    Pipeline for image generation based on a Transformer backbone instead of a UNet.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        transformer ([`Transformer2DModel`]):
            A class conditioned `Transformer2DModel` to denoise the encoded image latents.
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        scheduler ([`DDIMScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        id2label (`Dict[int, str]`, *optional*):
            Mapping from class id to a comma-separated string of label names. Used to build `self.labels`
            (label name -> class id).
    """

    model_cpu_offload_seq = "transformer->vae"

    def __init__(
        self,
        transformer: Transformer2DModel,
        vae: AutoencoderKL,
        scheduler: KarrasDiffusionSchedulers,
        id2label: Optional[Dict[int, str]] = None,
    ):
        super().__init__()
        self.register_modules(transformer=transformer, vae=vae, scheduler=scheduler)

        # create a label-name -> class-id dictionary for easier use
        self.labels = {}
        if id2label is not None:
            for key, value in id2label.items():
                # each value may list several comma-separated synonyms for the same class id
                for label in value.split(","):
                    self.labels[label.strip()] = int(key)
            self.labels = dict(sorted(self.labels.items()))

    def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
        r"""

        Map label strings from ImageNet to corresponding class ids.

        Parameters:
            label (`str` or `list` of `str`):
                Label string(s) to be mapped to class ids. A single string is treated as one label.

        Returns:
            `list` of `int`:
                Class ids to be processed by pipeline.

        Raises:
            ValueError: If any label is not a key of `self.labels`.
        """

        if isinstance(label, str):
            # A bare string is ONE label; `list("white shark")` would split it into characters.
            label = [label]
        elif not isinstance(label, list):
            label = list(label)

        for name in label:
            if name not in self.labels:
                raise ValueError(
                    f"{name} does not exist. Please make sure to select one of the following labels: \n {self.labels}."
                )

        return [self.labels[name] for name in label]

    @paddle.no_grad()
    def __call__(
        self,
        class_labels: List[int],
        guidance_scale: float = 4.0,
        generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
        num_inference_steps: int = 50,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
        The call function to the pipeline for generation.

        Args:
            class_labels (List[int]):
                List of ImageNet class labels for the images to be generated.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the
                requested class labels at the expense of lower image quality. Guidance is enabled when
                `guidance_scale > 1`.
            generator (`paddle.Generator`, *optional*):
                A [`paddle.Generator`] to make generation deterministic.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        ```py
        >>> from ppdiffusers import DiTPipeline, DPMSolverMultistepScheduler
        >>> import paddle

        >>> pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", paddle_dtype=paddle.float16)
        >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

        >>> # pick words from Imagenet class labels
        >>> pipe.labels  # to print all available words

        >>> # pick words that exist in ImageNet
        >>> words = ["white shark", "umbrella"]

        >>> class_ids = pipe.get_label_ids(words)

        >>> generator = paddle.Generator().manual_seed(33)
        >>> output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)

        >>> image = output.images[0]  # label 'white shark'
        ```

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images
        """

        batch_size = len(class_labels)
        latent_size = self.transformer.config.sample_size
        latent_channels = self.transformer.config.in_channels

        latents = randn_tensor(
            shape=(batch_size, latent_channels, latent_size, latent_size),
            generator=generator,
            dtype=self.transformer.dtype,
        )
        # With guidance the batch is doubled: first half conditional, second half unconditional.
        latent_model_input = paddle.concat([latents] * 2) if guidance_scale > 1 else latents

        class_labels = paddle.to_tensor(class_labels).reshape([-1])
        # Class id 1000 is fed to the unconditional half.
        # NOTE(review): presumably the model's "null" class for ImageNet-1k - confirm against the checkpoint.
        class_null = paddle.to_tensor([1000] * batch_size)
        class_labels_input = paddle.concat([class_labels, class_null], 0) if guidance_scale > 1 else class_labels

        # set step values
        self.scheduler.set_timesteps(num_inference_steps)
        for t in self.progress_bar(self.scheduler.timesteps):
            if guidance_scale > 1:
                # keep both halves of the batch on the same latents
                half = latent_model_input[: len(latent_model_input) // 2]
                latent_model_input = paddle.concat([half, half], axis=0)
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            timesteps = t

            if not paddle.is_tensor(timesteps):
                if isinstance(timesteps, float):
                    dtype = paddle.float32
                else:
                    dtype = paddle.int64
                timesteps = paddle.to_tensor([timesteps], dtype=dtype)
            elif len(timesteps.shape) == 0:
                timesteps = timesteps[None]
            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
            timesteps = timesteps.expand([latent_model_input.shape[0]])
            # predict noise model_output
            noise_pred_out = self.transformer(latent_model_input, timestep=timesteps, class_labels=class_labels_input)
            if is_inference_mode(self.transformer):
                # self.transformer run in paddle inference.
                noise_pred = noise_pred_out
            else:
                noise_pred = noise_pred_out.sample

            # perform guidance
            if guidance_scale > 1:
                eps, rest = noise_pred[:, :latent_channels], noise_pred[:, latent_channels:]
                cond_eps, uncond_eps = paddle.chunk(eps, 2, axis=0)

                half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
                eps = paddle.concat([half_eps, half_eps], axis=0)

                noise_pred = paddle.concat([eps, rest], axis=1)

            # learned sigma: keep only the first `latent_channels` channels of the prediction
            if self.transformer.config.out_channels // 2 == latent_channels:
                model_output, _ = paddle.split(
                    noise_pred, [latent_channels, noise_pred.shape[1] - latent_channels], axis=1
                )
            else:
                model_output = noise_pred

            # compute previous image: x_t -> x_t-1
            latent_model_input = self.scheduler.step(model_output, t, latent_model_input).prev_sample

        if guidance_scale > 1:
            latents, _ = latent_model_input.chunk(2, axis=0)
        else:
            latents = latent_model_input

        latents = 1 / self.vae.config.scaling_factor * latents

        samples_out = self.vae.decode(latents)
        if is_inference_mode(self.vae.decode):
            # self.vae.decode run in paddle inference.
            samples = samples_out
        else:
            samples = samples_out.sample

        samples = (samples / 2 + 0.5).clip(0, 1)

        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        samples = samples.transpose([0, 2, 3, 1]).cast("float32").cpu().numpy()

        if output_type == "pil":
            samples = self.numpy_to_pil(samples)

        if not return_dict:
            return (samples,)

        return ImagePipelineOutput(images=samples)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/hotshot_xl/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import Union
17
+
18
+ import numpy as np
19
+ import paddle
20
+
21
+ # don't remove these imports - they are needed to load from pretrain.
22
+ from ppdiffusers.models.hotshot_xl.unet import UNet3DConditionModel # noqa: *
23
+ from ppdiffusers.models.modeling_utils import ModelMixin # noqa: *
24
+ from ppdiffusers.utils import BaseOutput
25
+
26
+
27
@dataclass
class HotshotPipelineXLOutput(BaseOutput):
    """Output container returned by the Hotshot-XL pipelines."""

    # Generated result, either as a paddle tensor or as a NumPy array.
    # NOTE(review): the axis layout of `videos` is not visible from this module - confirm in the pipeline.
    videos: Union[paddle.Tensor, np.ndarray]
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/hotshot_xl/hotshot_xl_controlnet_pipeline.py ADDED
@@ -0,0 +1,1067 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import os
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import paddle
21
+ import PIL.Image
22
+ from einops import rearrange
23
+ from tqdm import tqdm
24
+
25
+ import ppdiffusers
26
+ from ppdiffusers.models.attention_processor import (
27
+ AttnProcessor2_5,
28
+ LoRAAttnProcessor2_5,
29
+ LoRAXFormersAttnProcessor,
30
+ XFormersAttnProcessor,
31
+ )
32
+ from ppdiffusers.models.hotshot_xl.unet import UNet3DConditionModel
33
+
34
+ from . import HotshotPipelineXLOutput
35
+
36
+ logger = ppdiffusers.utils.logging.get_logger(__name__)
37
+ from ppdiffusers import transformers
38
+ from ppdiffusers.image_processor import VaeImageProcessor
39
+
40
+
41
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Blend `noise_cfg` with a copy of itself rescaled to the spread of `noise_pred_text`.

    Implements the rescaling from Section 3.4 of [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://arxiv.org/pdf/2305.08891.pdf).

    Args:
        noise_cfg: Guided noise prediction; must expose `.ndim` and `.std(axis=..., keepdim=...)`.
        noise_pred_text: Text-conditioned noise prediction with the same interface.
        guidance_rescale: Interpolation weight; `0.0` returns `noise_cfg` unchanged in value, `1.0` returns
            the fully rescaled prediction.

    Returns:
        The interpolated noise prediction.
    """
    # Standard deviations are taken over every axis except the leading one.
    text_axes = list(range(1, noise_pred_text.ndim))
    cfg_axes = list(range(1, noise_cfg.ndim))
    std_text = noise_pred_text.std(axis=text_axes, keepdim=True)
    std_cfg = noise_cfg.std(axis=cfg_axes, keepdim=True)
    # Scale the guided prediction so its spread matches the text-conditioned one.
    rescaled = noise_cfg * (std_text / std_cfg)
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
51
+
52
+
53
# Usage example injected into `__call__`'s docstring.
# NOTE(review): the `hotshot_xl` import module is carried over unchanged from the previous text; confirm the
# public import path of `HotshotXLControlNetPipeline` before publishing these docs.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import cv2
        >>> import numpy as np
        >>> import paddle
        >>> from PIL import Image
        >>> from hotshot_xl import HotshotXLControlNetPipeline
        >>> from ppdiffusers import ControlNetModel

        >>> pipe = HotshotXLControlNetPipeline.from_pretrained(
        ...     "hotshotco/Hotshot-XL",
        ...     controlnet=ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0")
        ... )
        >>> def canny(image):
        ...     image = cv2.Canny(image, 100, 200)
        ...     image = image[:, :, None]
        ...     image = np.concatenate([image, image, image], axis=2)
        ...     return Image.fromarray(image)
        >>> # assuming you have 8 keyframes in current directory...
        >>> keyframes = [f"image_{i}.jpg" for i in range(8)]
        >>> control_images = [canny(Image.open(fp)) for fp in keyframes]
        >>> pipe = pipe.to("cuda")
        >>> prompt = "a photo of an astronaut riding a horse on mars"
        >>> video = pipe(prompt,
        ...     width=672, height=384,
        ...     original_size=(1920, 1080),
        ...     target_size=(512, 512),
        ...     output_type="tensor",
        ...     controlnet_conditioning_scale=0.7,
        ...     control_images=control_images
        ... ).video
        ```
"""
84
+
85
+
86
+ class HotshotXLControlNetPipeline(
87
+ ppdiffusers.pipelines.pipeline_utils.DiffusionPipeline,
88
+ ppdiffusers.loaders.TextualInversionLoaderMixin,
89
+ ppdiffusers.loaders.LoraLoaderMixin,
90
+ ppdiffusers.loaders.FromSingleFileMixin,
91
+ ):
92
+ """
93
+ Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance.
94
+
95
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
96
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
97
+
98
+ The pipeline also inherits the following loading methods:
99
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
100
+ - [`loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
101
+ - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
102
+
103
+ Args:
104
+ vae ([`AutoencoderKL`]):
105
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
106
+ text_encoder ([`~transformers.CLIPTextModel`]):
107
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
108
+ text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]):
109
+ Second frozen text-encoder
110
+ ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)).
111
+ tokenizer ([`~transformers.CLIPTokenizer`]):
112
+ A `CLIPTokenizer` to tokenize text.
113
+ tokenizer_2 ([`~transformers.CLIPTokenizer`]):
114
+ A `CLIPTokenizer` to tokenize text.
115
+ unet ([`UNet3DConditionModel`]):
116
+ A `UNet3DConditionModel` to denoise the encoded image latents.
117
+ controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
118
+ Provides additional conditioning to the `unet` during the denoising process. If you set multiple
119
+ ControlNets as a list, the outputs from each ControlNet are added together to create one combined
120
+ additional conditioning.
121
+ scheduler ([`SchedulerMixin`]):
122
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
123
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
124
+ force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
125
+ Whether the negative prompt embeddings should always be set to 0. Also see the config of
126
+ `stabilityai/stable-diffusion-xl-base-1-0`.
127
+ add_watermarker (`bool`, *optional*):
128
+ Whether to use the [invisible_watermark](https://github.com/ShieldMnt/invisible-watermark/) library to
129
+ watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no
130
+ watermarker is used.
131
+ """
132
+
133
def __init__(
    self,
    vae: ppdiffusers.models.AutoencoderKL,
    text_encoder: transformers.CLIPTextModel,
    text_encoder_2: transformers.CLIPTextModelWithProjection,
    tokenizer: transformers.CLIPTokenizer,
    tokenizer_2: transformers.CLIPTokenizer,
    unet: UNet3DConditionModel,
    controlnet: Union[
        ppdiffusers.models.ControlNetModel,
        List[ppdiffusers.models.ControlNetModel],
        Tuple[ppdiffusers.models.ControlNetModel],
        ppdiffusers.pipelines.controlnet.multicontrolnet.MultiControlNetModel,
    ],
    scheduler: ppdiffusers.schedulers.KarrasDiffusionSchedulers,
    force_zeros_for_empty_prompt: bool = True,
    add_watermarker: Optional[bool] = None,
):
    """
    Register the pipeline components and build the image processors.

    A list or tuple of ControlNets is wrapped in a `MultiControlNetModel` before registration.
    `add_watermarker` is accepted but not read here; `self.watermark` is always set to `None`.
    """
    super().__init__()

    multi_controlnet_cls = ppdiffusers.pipelines.controlnet.multicontrolnet.MultiControlNetModel
    if isinstance(controlnet, (list, tuple)):
        controlnet = multi_controlnet_cls(controlnet)

    components = dict(
        vae=vae,
        text_encoder=text_encoder,
        text_encoder_2=text_encoder_2,
        tokenizer=tokenizer,
        tokenizer_2=tokenizer_2,
        unet=unet,
        controlnet=controlnet,
        scheduler=scheduler,
    )
    self.register_modules(**components)

    # Derived from the number of entries in the VAE's `block_out_channels` config.
    num_vae_blocks = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (num_vae_blocks - 1)

    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
    # Control images go through the same processor settings, but without normalization.
    self.control_image_processor = VaeImageProcessor(
        vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
    )
    self.watermark = None
    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
171
+
172
def enable_vae_slicing(self):
    """
    Turn on sliced VAE decoding by delegating to `self.vae.enable_slicing()`.

    With slicing enabled the VAE decodes the input tensor slice by slice in several steps, which saves memory
    and allows larger batch sizes.
    """
    vae = self.vae
    vae.enable_slicing()
178
+
179
def disable_vae_slicing(self):
    """
    Turn off sliced VAE decoding by delegating to `self.vae.disable_slicing()`.

    After a previous `enable_vae_slicing` call, this restores decoding in a single step.
    """
    vae = self.vae
    vae.disable_slicing()
185
+
186
def enable_vae_tiling(self):
    """
    Turn on tiled VAE processing by delegating to `self.vae.enable_tiling()`.

    With tiling enabled the VAE splits the input tensor into tiles and encodes/decodes them in several steps,
    which saves a large amount of memory and allows processing larger images.
    """
    vae = self.vae
    vae.enable_tiling()
193
+
194
def disable_vae_tiling(self):
    """
    Turn off tiled VAE processing by delegating to `self.vae.disable_tiling()`.

    After a previous `enable_vae_tiling` call, this restores decoding in a single step.
    """
    vae = self.vae
    vae.disable_tiling()
200
+
201
def enable_model_cpu_offload(self, gpu_id=0):
    """
    Accept a model-CPU-offload request and do nothing.

    No offloading is implemented in this method: no model is moved between devices and `gpu_id` is ignored.
    The previous text described accelerate-based offloading, but it sat after a `pass` statement, so it was
    never the method's docstring and did not match the (empty) behavior.

    Args:
        gpu_id (`int`, *optional*, defaults to 0):
            Unused.

    Returns:
        `None`.
    """
    return None
209
+
210
def encode_prompt(
    self,
    prompt: Optional[Union[str, List[str]]],
    prompt_2: Optional[Union[str, List[str]]] = None,
    device: Optional[str] = None,
    num_images_per_prompt: int = 1,
    do_classifier_free_guidance: bool = True,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    negative_prompt_2: Optional[Union[str, List[str]]] = None,
    prompt_embeds: Optional["paddle.Tensor"] = None,
    negative_prompt_embeds: Optional["paddle.Tensor"] = None,
    pooled_prompt_embeds: Optional["paddle.Tensor"] = None,
    negative_pooled_prompt_embeds: Optional["paddle.Tensor"] = None,
    lora_scale: Optional[float] = None,
):
    """
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
            is used in both text-encoders
        device:
            Passed to the `.to(...)` calls on token ids and embeddings. It is used exactly as given; no
            fallback to `self._execution_device` is applied.
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance.
        negative_prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
            `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. If not provided, they are generated from `prompt`.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. If not provided, they are generated from `negative_prompt`
            (or zero-filled, see below).
        pooled_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated pooled text embeddings. If not provided, they are generated from `prompt`.
        negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative pooled text embeddings. If not provided, they are generated from
            `negative_prompt` (or zero-filled, see below).
        lora_scale (`float`, *optional*):
            Stored on `self._lora_scale` when given and the pipeline is a `LoraLoaderMixin`.

    Returns:
        `tuple`: `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds)`.

    Raises:
        TypeError: If `prompt` and `negative_prompt` are of different types.
        ValueError: If a list `negative_prompt` does not match the prompt batch size.
    """
    if lora_scale is not None and isinstance(self, ppdiffusers.loaders.LoraLoaderMixin):
        self._lora_scale = lora_scale

    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = tuple(prompt_embeds.shape)[0]

    # Without a first tokenizer / text encoder, only the second pair is used.
    tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
    text_encoders = (
        [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
    )
    supports_textual_inversion = isinstance(self, ppdiffusers.loaders.TextualInversionLoaderMixin)

    if prompt_embeds is None:
        prompt_2 = prompt_2 or prompt
        prompt_embeds_list = []
        # Dedicated loop names keep the `prompt` argument intact for the validation further down.
        for text, tokenizer, text_encoder in zip([prompt, prompt_2], tokenizers, text_encoders):
            if supports_textual_inversion:
                text = self.maybe_convert_prompt(text, tokenizer)
            text_inputs = tokenizer(
                text,
                padding="max_length",
                max_length=tokenizer.model_max_length,
                truncation=True,
                return_tensors="pd",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = tokenizer(text, padding="longest", return_tensors="pd").input_ids
            if (
                tuple(untruncated_ids.shape)[-1] >= tuple(text_input_ids.shape)[-1]
                and not paddle.equal_all(x=text_input_ids, y=untruncated_ids).item()
            ):
                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                logger.warning(
                    f"The following part of your input was truncated because CLIP can only handle sequences up to {tokenizer.model_max_length} tokens: {removed_text}"
                )
            encoder_output = text_encoder(text_input_ids.to(device), output_hidden_states=True)
            # The pooled output is overwritten on every pass, so the last text encoder's value is kept.
            pooled_prompt_embeds = encoder_output[0]
            # The penultimate hidden state is used as the per-token embedding.
            prompt_embeds_list.append(encoder_output.hidden_states[-2])
        prompt_embeds = paddle.concat(x=prompt_embeds_list, axis=-1)

    zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
    if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
        negative_prompt_embeds = paddle.zeros_like(x=prompt_embeds)
        negative_pooled_prompt_embeds = paddle.zeros_like(x=pooled_prompt_embeds)
    elif do_classifier_free_guidance and negative_prompt_embeds is None:
        negative_prompt = negative_prompt or ""
        negative_prompt_2 = negative_prompt_2 or negative_prompt
        if prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} != {type(prompt)}."
            )
        if not isinstance(negative_prompt, str) and batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`: {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches the batch size of `prompt`."
            )
        uncond_tokens: List[Union[str, List[str]]] = [negative_prompt, negative_prompt_2]

        # Negative prompts are padded/truncated to the sequence length of the positive embeddings.
        max_length = tuple(prompt_embeds.shape)[1]
        negative_prompt_embeds_list = []
        for negative_text, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
            if supports_textual_inversion:
                negative_text = self.maybe_convert_prompt(negative_text, tokenizer)
            uncond_input = tokenizer(
                negative_text, padding="max_length", max_length=max_length, truncation=True, return_tensors="pd"
            )
            encoder_output = text_encoder(uncond_input.input_ids.to(device), output_hidden_states=True)
            negative_pooled_prompt_embeds = encoder_output[0]
            negative_prompt_embeds_list.append(encoder_output.hidden_states[-2])
        negative_prompt_embeds = paddle.concat(x=negative_prompt_embeds_list, axis=-1)

    # Duplicate the embeddings for each generation per prompt.
    prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
    bs_embed, seq_len, _ = tuple(prompt_embeds.shape)
    prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
    prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])

    if do_classifier_free_guidance:
        seq_len = tuple(negative_prompt_embeds.shape)[1]
        negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
        negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
        negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])

    pooled_prompt_embeds = pooled_prompt_embeds.tile([1, num_images_per_prompt]).reshape(
        [bs_embed * num_images_per_prompt, -1]
    )
    if do_classifier_free_guidance:
        negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.tile([1, num_images_per_prompt]).reshape(
            [bs_embed * num_images_per_prompt, -1]
        )
    return (prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds)
356
+
357
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Build the optional keyword arguments for `self.scheduler.step`.

    Only the arguments that appear in the signature of `self.scheduler.step` are included.

    Args:
        generator: Value forwarded under the `"generator"` key when the scheduler's `step` accepts it.
        eta: Value forwarded under the `"eta"` key when the scheduler's `step` accepts it.

    Returns:
        `dict`: Contains `"eta"` and/or `"generator"`, in that order, or is empty.
    """
    step_parameters = inspect.signature(self.scheduler.step).parameters
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_parameters}
366
+
367
def check_inputs(
    self,
    prompt,
    prompt_2,
    control_images,
    video_length,
    callback_steps,
    negative_prompt=None,
    negative_prompt_2=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    pooled_prompt_embeds=None,
    negative_pooled_prompt_embeds=None,
    controlnet_conditioning_scale=1.0,
    control_guidance_start=0.0,
    control_guidance_end=1.0,
):
    """
    Validate the arguments of a generation call and raise on the first inconsistency found.

    Checks, in order: `callback_steps`; that prompts and pre-computed embeddings are not both given and have
    valid types; that positive/negative embeddings have matching shapes and come with their pooled
    counterparts; that `control_images` has one entry per frame for a single ControlNet;
    `controlnet_conditioning_scale` against the ControlNet configuration; and the control guidance window.

    Raises:
        ValueError: For invalid or conflicting prompt/embedding arguments, a nested or wrongly sized
            `controlnet_conditioning_scale` list, or invalid control guidance start/end values.
        TypeError: If a single ControlNet is used and `controlnet_conditioning_scale` is not a `float`.
        AssertionError: If `self.controlnet` is neither a `ControlNetModel` nor a `MultiControlNetModel`, or
            if `len(control_images) != video_length` for a single ControlNet.
    """
    # `None` is not an `int`, so it is rejected by the same test.
    if not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}."
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two."
        )
    elif prompt_2 is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
    elif prompt_2 is not None and not isinstance(prompt_2, (str, list)):
        raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )
    elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if tuple(prompt_embeds.shape) != tuple(negative_prompt_embeds.shape):
            raise ValueError(
                f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {tuple(prompt_embeds.shape)} != `negative_prompt_embeds` {tuple(negative_prompt_embeds.shape)}."
            )
    if prompt_embeds is not None and pooled_prompt_embeds is None:
        raise ValueError(
            "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
        )
    if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
        raise ValueError(
            "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
        )

    # No compiled-module wrapper is handled here, so the ControlNet kind is decided by direct type checks.
    multi_controlnet_cls = ppdiffusers.pipelines.controlnet.multicontrolnet.MultiControlNetModel
    is_single_controlnet = isinstance(self.controlnet, ppdiffusers.models.ControlNetModel)
    is_multi_controlnet = isinstance(self.controlnet, multi_controlnet_cls)

    if is_multi_controlnet and isinstance(prompt, list):
        logger.warning(
            f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)} prompts. The conditionings will be fixed across the prompts."
        )

    # Explicit raises instead of `assert` so the checks are not stripped under `python -O`.
    if is_single_controlnet:
        if len(control_images) != video_length:
            raise AssertionError(
                f"Expected one control image per frame, but got {len(control_images)} control images for `video_length` {video_length}."
            )
    elif not is_multi_controlnet:
        raise AssertionError(f"Unsupported `controlnet` type: {type(self.controlnet)}.")

    if is_single_controlnet:
        if not isinstance(controlnet_conditioning_scale, float):
            raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
    elif isinstance(controlnet_conditioning_scale, list):
        if any(isinstance(i, list) for i in controlnet_conditioning_scale):
            raise ValueError("A single batch of multiple conditionings are supported at the moment.")
        # This length check used to sit in an `elif` that could never run after the list test above.
        if len(controlnet_conditioning_scale) != len(self.controlnet.nets):
            raise ValueError(
                "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have the same length as the number of controlnets"
            )

    if not isinstance(control_guidance_start, (tuple, list)):
        control_guidance_start = [control_guidance_start]
    if not isinstance(control_guidance_end, (tuple, list)):
        control_guidance_end = [control_guidance_end]
    if len(control_guidance_start) != len(control_guidance_end):
        raise ValueError(
            f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
        )
    if is_multi_controlnet:
        if len(control_guidance_start) != len(self.controlnet.nets):
            raise ValueError(
                f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
            )
    for start, end in zip(control_guidance_start, control_guidance_end):
        if start >= end:
            raise ValueError(
                f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}."
            )
        if start < 0.0:
            raise ValueError(f"control guidance start: {start} can't be smaller than 0.")
        if end > 1.0:
            raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")
501
+
502
def check_image(self, image, prompt, prompt_embeds):
    """
    Validate the type of `image` and its batch size against the prompt batch size.

    Args:
        image: A PIL image, paddle tensor or numpy array, or a non-empty list of one of those. A list is
            classified by the type of its first element only.
        prompt: A `str`, a `list`, or `None`.
        prompt_embeds: Used for the prompt batch size (its leading dimension) when `prompt` is neither a
            `str` nor a `list`.

    Raises:
        TypeError: If `image` is none of the accepted types. An empty list is rejected here as well,
            instead of failing with an `IndexError` while inspecting its first element.
        ValueError: If the image batch size is not 1 and differs from the prompt batch size, or if it is
            not 1 and no prompt batch size can be determined.
    """
    element_types = (PIL.Image.Image, paddle.Tensor, np.ndarray)
    is_single_image = isinstance(image, element_types)
    is_image_list = isinstance(image, list) and bool(image) and isinstance(image[0], element_types)
    if not is_single_image and not is_image_list:
        raise TypeError(
            f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}"
        )

    # A single PIL image counts as a batch of one; everything else reports its own length.
    image_batch_size = 1 if isinstance(image, PIL.Image.Image) else len(image)

    if isinstance(prompt, str):
        prompt_batch_size = 1
    elif isinstance(prompt, list):
        prompt_batch_size = len(prompt)
    elif prompt_embeds is not None:
        prompt_batch_size = tuple(prompt_embeds.shape)[0]
    else:
        # Neither a usable prompt nor embeddings were given; only an error if the size is needed below.
        prompt_batch_size = None

    if image_batch_size != 1:
        if prompt_batch_size is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        if image_batch_size != prompt_batch_size:
            raise ValueError(
                f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
            )
534
+
535
def prepare_images(
    self,
    images,
    width,
    height,
    batch_size,
    num_images_per_prompt,
    device,
    dtype,
    do_classifier_free_guidance=False,
    guess_mode=False,
):
    """
    Preprocess the control images and assemble them into one conditioning tensor.

    Each image is passed through `self.control_image_processor.preprocess` at the requested size and cast
    to float32. The results are concatenated along axis 0, tiled `batch_size * num_images_per_prompt` times
    along axis 0, given a new leading axis, and moved to `device`/`dtype`. With classifier-free guidance and
    `guess_mode` off, the result is tiled twice along its leading axis.

    Returns:
        The assembled control-image tensor.
    """
    preprocess = self.control_image_processor.preprocess
    processed = [preprocess(image, height=height, width=width).to(dtype="float32") for image in images]
    stacked = paddle.concat(x=processed, axis=0)

    # Repeat along the first axis only.
    repeats = [1] * len(tuple(stacked.shape))
    repeats[0] = batch_size * num_images_per_prompt
    stacked = stacked.tile(repeats)

    images = stacked.unsqueeze(axis=0).to(device=device, dtype=dtype)
    if not do_classifier_free_guidance or guess_mode:
        return images

    # One copy each for the two halves of the classifier-free-guidance batch.
    cfg_repeats = [1] * len(tuple(images.shape))
    cfg_repeats[0] = 2
    return images.tile(cfg_repeats)
562
+
563
def prepare_latents(
    self, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator, latents=None
):
    """
    Return the initial latents, scaled by the scheduler's `init_noise_sigma`.

    When `latents` is `None`, noise of shape
    `(batch_size, num_channels_latents, video_length, height // vae_scale_factor, width // vae_scale_factor)`
    is drawn with `randn_tensor`; otherwise the given latents are moved to `device`.

    Raises:
        ValueError: If `generator` is a list whose length differs from `batch_size`.
    """
    scale = self.vae_scale_factor
    latent_height = height // scale
    latent_width = width // scale
    shape = (batch_size, num_channels_latents, video_length, latent_height, latent_width)

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is not None:
        latents = latents.to(device)
    else:
        latents = ppdiffusers.utils.paddle_utils.randn_tensor(shape, generator=generator, dtype=dtype)

    # Scale the starting latents by the noise level the scheduler expects.
    return latents * self.scheduler.init_noise_sigma
583
+
584
def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
    """
    Build the additional time-id tensor from the size/crop conditioning tuples.

    The three tuples are concatenated in the order `original_size`, `crops_coords_top_left`, `target_size`.
    Before the tensor is created, the embedding width implied by the configs is compared with the input
    width of `self.unet.add_embedding.linear_1`.

    Returns:
        A tensor of dtype `dtype` created from `[add_time_ids]` (one row).

    Raises:
        ValueError: If the implied and expected embedding widths differ.
    """
    time_ids = list(original_size + crops_coords_top_left + target_size)

    per_id_dim = self.unet.config.addition_time_embed_dim
    text_proj_dim = self.text_encoder_2.config.projection_dim
    passed_dim = per_id_dim * len(time_ids) + text_proj_dim
    expected_dim = self.unet.add_embedding.linear_1.in_features

    if expected_dim != passed_dim:
        raise ValueError(
            f"Model expects an added time embedding vector of length {expected_dim}, but a vector of {passed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
        )
    return paddle.to_tensor(data=[time_ids], dtype=dtype)
596
+
597
def upcast_vae(self):
    """
    Cast the VAE to float32, optionally keeping part of the decoder in the original dtype.

    When the first attention processor of the decoder's mid block is one of `AttnProcessor2_5`,
    `XFormersAttnProcessor`, `LoRAXFormersAttnProcessor` or `LoRAAttnProcessor2_5`, the modules
    `post_quant_conv`, `decoder.conv_in` and `decoder.mid_block` are cast back to the dtype the VAE had
    before this call.
    """
    original_dtype = self.vae.dtype
    self.vae.to(dtype="float32")

    mid_block_processor = self.vae.decoder.mid_block.attentions[0].processor
    restorable_processors = (
        AttnProcessor2_5,
        XFormersAttnProcessor,
        LoRAXFormersAttnProcessor,
        LoRAAttnProcessor2_5,
    )
    if not isinstance(mid_block_processor, restorable_processors):
        return

    self.vae.post_quant_conv.to(original_dtype)
    self.vae.decoder.conv_in.to(original_dtype)
    self.vae.decoder.mid_block.to(original_dtype)
608
+
609
+ @paddle.no_grad()
610
+ @ppdiffusers.utils.replace_example_docstring(EXAMPLE_DOC_STRING)
611
+ def __call__(
612
+ self,
613
+ prompt: Union[str, List[str]] = None,
614
+ prompt_2: Optional[Union[str, List[str]]] = None,
615
+ video_length: Optional[int] = 8,
616
+ control_images: List[PIL.Image.Image] = None,
617
+ height: Optional[int] = None,
618
+ width: Optional[int] = None,
619
+ num_inference_steps: int = 50,
620
+ guidance_scale: float = 5.0,
621
+ negative_prompt: Optional[Union[str, List[str]]] = None,
622
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
623
+ num_images_per_prompt: Optional[int] = 1,
624
+ eta: float = 0.0,
625
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
626
+ latents: Optional[float] = None,
627
+ prompt_embeds: Optional[float] = None,
628
+ negative_prompt_embeds: Optional[float] = None,
629
+ pooled_prompt_embeds: Optional[float] = None,
630
+ negative_pooled_prompt_embeds: Optional[float] = None,
631
+ output_type: Optional[str] = "pil",
632
+ return_dict: bool = True,
633
+ callback: Optional[Callable[[int, int, float], None]] = None,
634
+ callback_steps: int = 1,
635
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
636
+ guidance_rescale: float = 0.0,
637
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
638
+ guess_mode: bool = False,
639
+ control_guidance_start: Union[float, List[float]] = 0.0,
640
+ control_guidance_end: Union[float, List[float]] = 1.0,
641
+ original_size: Tuple[int, int] = None,
642
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
643
+ target_size: Tuple[int, int] = None,
644
+ negative_original_size: Optional[Tuple[int, int]] = None,
645
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
646
+ negative_target_size: Optional[Tuple[int, int]] = None,
647
+ ):
648
+ """
649
+ The call function to the pipeline for generation.
650
+
651
+ Args:
652
+ prompt (`str` or `List[str]`, *optional*):
653
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
654
+ prompt_2 (`str` or `List[str]`, *optional*):
655
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
656
+ used in both text-encoders.
657
+ image (`paddle.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
658
+ `List[List[paddle.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
659
+ The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
660
+ specified as `paddle.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
661
+ accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
662
+ and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
663
+ `init`, images must be passed as a list such that each element of the list can be correctly batched for
664
+ input to a single ControlNet.
665
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
666
+ The height in pixels of the generated image.
667
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
668
+ The width in pixels of the generated image.
669
+ num_inference_steps (`int`, *optional*, defaults to 50):
670
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
671
+ expense of slower inference.
672
+ guidance_scale (`float`, *optional*, defaults to 5.0):
673
+ A higher guidance scale value encourages the model to generate images closely linked to the text
674
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
675
+ negative_prompt (`str` or `List[str]`, *optional*):
676
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
677
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
678
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
679
+ The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2`
680
+ and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
681
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
682
+ The number of images to generate per prompt.
683
+ eta (`float`, *optional*, defaults to 0.0):
684
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
685
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
686
+ generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
687
+ A [`paddle.Generator`]() to make
688
+ generation deterministic.
689
+ latents (`paddle.FloatTensor`, *optional*):
690
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
691
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
692
+ tensor is generated by sampling using the supplied random `generator`.
693
+ prompt_embeds (`paddle.FloatTensor`, *optional*):
694
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
695
+ provided, text embeddings are generated from the `prompt` input argument.
696
+ negative_prompt_embeds (`paddle.FloatTensor`, *optional*):
697
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
698
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
699
+ pooled_prompt_embeds (`paddle.FloatTensor`, *optional*):
700
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
701
+ not provided, pooled text embeddings are generated from `prompt` input argument.
702
+ negative_pooled_prompt_embeds (`paddle.FloatTensor`, *optional*):
703
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt
704
+ weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input
705
+ argument.
706
+ output_type (`str`, *optional*, defaults to `"pil"`):
707
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
708
+ return_dict (`bool`, *optional*, defaults to `True`):
709
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
710
+ plain tuple.
711
+ callback (`Callable`, *optional*):
712
+ A function that calls every `callback_steps` steps during inference. The function is called with the
713
+ following arguments: `callback(step: int, timestep: int, latents: paddle.FloatTensor)`.
714
+ callback_steps (`int`, *optional*, defaults to 1):
715
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
716
+ every step.
717
+ cross_attention_kwargs (`dict`, *optional*):
718
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
719
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
720
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
721
+ The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
722
+ to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
723
+ the corresponding scale as a list.
724
+ guess_mode (`bool`, *optional*, defaults to `False`):
725
+ The ControlNet encoder tries to recognize the content of the input image even if you remove all
726
+ prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
727
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
728
+ The percentage of total steps at which the ControlNet starts applying.
729
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
730
+ The percentage of total steps at which the ControlNet stops applying.
731
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
732
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
733
+ `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
734
+ explained in section 2.2 of
735
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
736
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
737
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
738
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
739
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
740
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
741
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
742
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
743
+ not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
744
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
745
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
746
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
747
+ micro-conditioning as explained in section 2.2 of
748
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
749
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
750
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
751
+ To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
752
+ micro-conditioning as explained in section 2.2 of
753
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
754
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
755
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
756
+ To negatively condition the generation process based on a target image resolution. It should be as same
757
+ as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
758
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
759
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
760
+
761
+ Examples:
762
+
763
+ Returns:
764
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
765
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
766
+ otherwise a `tuple` is returned containing the output images.
767
+ """
768
+ if video_length > 1 and num_images_per_prompt > 1:
769
+ print(f"Warning - setting num_images_per_prompt = 1 because video_length = {video_length}")
770
+ num_images_per_prompt = 1
771
+ controlnet = (
772
+ self.controlnet._orig_mod
773
+ if ppdiffusers.utils.paddle_utils.is_compiled_module(self.controlnet)
774
+ else self.controlnet
775
+ )
776
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
777
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
778
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
779
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
780
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
781
+ mult = (
782
+ len(controlnet.nets)
783
+ if isinstance(controlnet, ppdiffusers.pipelines.controlnet.multicontrolnet.MultiControlNetModel)
784
+ else 1
785
+ )
786
+ control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [
787
+ control_guidance_end
788
+ ]
789
+ self.check_inputs(
790
+ prompt,
791
+ prompt_2,
792
+ control_images,
793
+ video_length,
794
+ callback_steps,
795
+ negative_prompt,
796
+ negative_prompt_2,
797
+ prompt_embeds,
798
+ negative_prompt_embeds,
799
+ pooled_prompt_embeds,
800
+ negative_pooled_prompt_embeds,
801
+ controlnet_conditioning_scale,
802
+ control_guidance_start,
803
+ control_guidance_end,
804
+ )
805
+ if prompt is not None and isinstance(prompt, str):
806
+ batch_size = 1
807
+ elif prompt is not None and isinstance(prompt, list):
808
+ batch_size = len(prompt)
809
+ else:
810
+ batch_size = tuple(prompt_embeds.shape)[0]
811
+ device = paddle.device.get_device()
812
+ do_classifier_free_guidance = guidance_scale > 1.0
813
+ if isinstance(
814
+ controlnet, ppdiffusers.pipelines.controlnet.multicontrolnet.MultiControlNetModel
815
+ ) and isinstance(controlnet_conditioning_scale, float):
816
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
817
+ global_pool_conditions = (
818
+ controlnet.config.global_pool_conditions
819
+ if isinstance(controlnet, ppdiffusers.models.ControlNetModel)
820
+ else controlnet.nets[0].config.global_pool_conditions
821
+ )
822
+ guess_mode = guess_mode or global_pool_conditions
823
+ text_encoder_lora_scale = (
824
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
825
+ )
826
+ (
827
+ prompt_embeds,
828
+ negative_prompt_embeds,
829
+ pooled_prompt_embeds,
830
+ negative_pooled_prompt_embeds,
831
+ ) = self.encode_prompt(
832
+ prompt,
833
+ prompt_2,
834
+ device,
835
+ num_images_per_prompt,
836
+ do_classifier_free_guidance,
837
+ negative_prompt,
838
+ negative_prompt_2,
839
+ prompt_embeds=prompt_embeds,
840
+ negative_prompt_embeds=negative_prompt_embeds,
841
+ pooled_prompt_embeds=pooled_prompt_embeds,
842
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
843
+ lora_scale=text_encoder_lora_scale,
844
+ )
845
+ if isinstance(controlnet, ppdiffusers.models.ControlNetModel):
846
+ assert len(control_images) == video_length * batch_size
847
+ images = self.prepare_images(
848
+ images=control_images,
849
+ width=width,
850
+ height=height,
851
+ batch_size=batch_size * num_images_per_prompt,
852
+ num_images_per_prompt=num_images_per_prompt,
853
+ device=device,
854
+ dtype=controlnet.dtype,
855
+ do_classifier_free_guidance=do_classifier_free_guidance,
856
+ guess_mode=guess_mode,
857
+ )
858
+ height, width = tuple(images.shape)[-2:]
859
+ elif isinstance(controlnet, ppdiffusers.pipelines.controlnet.multicontrolnet.MultiControlNetModel):
860
+ raise Exception("not supported yet")
861
+ else:
862
+ assert False
863
+ self.scheduler.set_timesteps(num_inference_steps)
864
+ timesteps = self.scheduler.timesteps
865
+ num_channels_latents = self.unet.config.in_channels
866
+ latents = self.prepare_latents(
867
+ batch_size * num_images_per_prompt,
868
+ num_channels_latents,
869
+ video_length,
870
+ height,
871
+ width,
872
+ prompt_embeds.dtype,
873
+ device,
874
+ generator,
875
+ latents,
876
+ )
877
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
878
+ controlnet_keep = []
879
+ for i in range(len(timesteps)):
880
+ keeps = [
881
+ (1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e))
882
+ for s, e in zip(control_guidance_start, control_guidance_end)
883
+ ]
884
+ controlnet_keep.append(keeps[0] if isinstance(controlnet, ppdiffusers.models.ControlNetModel) else keeps)
885
+ original_size = original_size or tuple(images.shape)[-2:]
886
+ target_size = target_size or (height, width)
887
+ add_text_embeds = pooled_prompt_embeds
888
+ add_time_ids = self._get_add_time_ids(
889
+ original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
890
+ )
891
+ if negative_original_size is not None and negative_target_size is not None:
892
+ negative_add_time_ids = self._get_add_time_ids(
893
+ negative_original_size, negative_crops_coords_top_left, negative_target_size, dtype=prompt_embeds.dtype
894
+ )
895
+ else:
896
+ negative_add_time_ids = add_time_ids
897
+ if do_classifier_free_guidance:
898
+ prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds], axis=0)
899
+ add_text_embeds = paddle.concat(x=[negative_pooled_prompt_embeds, add_text_embeds], axis=0)
900
+ add_time_ids = paddle.concat(x=[negative_add_time_ids, add_time_ids], axis=0)
901
+ prompt_embeds = prompt_embeds.to(device)
902
+ add_text_embeds = add_text_embeds.to(device)
903
+ add_time_ids = add_time_ids.to(device).tile([batch_size * num_images_per_prompt, 1])
904
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
905
+ images = rearrange(images, "b f c h w -> (b f) c h w")
906
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
907
+ for i, t in enumerate(timesteps):
908
+ latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents
909
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
910
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
911
+ if guess_mode and do_classifier_free_guidance:
912
+ control_model_input = latents
913
+ control_model_input = self.scheduler.scale_model_input(control_model_input, t)
914
+ controlnet_prompt_embeds = prompt_embeds.chunk(chunks=2)[1]
915
+ controlnet_added_cond_kwargs = {
916
+ "text_embeds": add_text_embeds.chunk(chunks=2)[1],
917
+ "time_ids": add_time_ids.chunk(chunks=2)[1],
918
+ }
919
+ else:
920
+ control_model_input = latent_model_input
921
+ controlnet_prompt_embeds = prompt_embeds
922
+ controlnet_added_cond_kwargs = added_cond_kwargs
923
+ if isinstance(controlnet_keep[i], list):
924
+ cond_scale = [(c * s) for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
925
+ else:
926
+ controlnet_cond_scale = controlnet_conditioning_scale
927
+ if isinstance(controlnet_cond_scale, list):
928
+ controlnet_cond_scale = controlnet_cond_scale[0]
929
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
930
+ control_model_input = rearrange(control_model_input, "b c f h w -> (b f) c h w")
931
+ if video_length > 1:
932
+ controlnet_prompt_embeds = controlnet_prompt_embeds.repeat_interleave(repeats=video_length, axis=0)
933
+ controlnet_added_cond_kwargs = {
934
+ "text_embeds": controlnet_added_cond_kwargs["text_embeds"].repeat_interleave(
935
+ repeats=video_length, axis=0
936
+ ),
937
+ "time_ids": controlnet_added_cond_kwargs["time_ids"].repeat_interleave(
938
+ repeats=video_length, axis=0
939
+ ),
940
+ }
941
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
942
+ control_model_input,
943
+ t,
944
+ encoder_hidden_states=controlnet_prompt_embeds,
945
+ controlnet_cond=images,
946
+ conditioning_scale=cond_scale,
947
+ guess_mode=guess_mode,
948
+ added_cond_kwargs=controlnet_added_cond_kwargs,
949
+ return_dict=False,
950
+ )
951
+ for j, sample in enumerate(down_block_res_samples):
952
+ down_block_res_samples[j] = rearrange(sample, "(b f) c h w -> b c f h w", f=video_length)
953
+ mid_block_res_sample = rearrange(mid_block_res_sample, "(b f) c h w -> b c f h w", f=video_length)
954
+ if guess_mode and do_classifier_free_guidance:
955
+ down_block_res_samples = [
956
+ paddle.concat(x=[paddle.zeros_like(x=d), d]) for d in down_block_res_samples
957
+ ]
958
+ mid_block_res_sample = paddle.concat(
959
+ x=[paddle.zeros_like(x=mid_block_res_sample), mid_block_res_sample]
960
+ )
961
+ noise_pred = self.unet(
962
+ latent_model_input,
963
+ t,
964
+ encoder_hidden_states=prompt_embeds,
965
+ cross_attention_kwargs=cross_attention_kwargs,
966
+ down_block_additional_residuals=down_block_res_samples,
967
+ mid_block_additional_residual=mid_block_res_sample,
968
+ added_cond_kwargs=added_cond_kwargs,
969
+ return_dict=False,
970
+ enable_temporal_attentions=video_length > 1,
971
+ )[0]
972
+ if do_classifier_free_guidance:
973
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
974
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
975
+ if do_classifier_free_guidance and guidance_rescale > 0.0:
976
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
977
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
978
+ if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
979
+ progress_bar.update()
980
+ if callback is not None and i % callback_steps == 0:
981
+ callback(i, t, latents)
982
+ if self.vae.dtype == "float16" and self.vae.config.force_upcast:
983
+ self.upcast_vae()
984
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
985
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
986
+ self.unet.to("cpu")
987
+ self.controlnet.to("cpu")
988
+ paddle.device.cuda.empty_cache()
989
+ video = self.decode_latents(latents)
990
+ if output_type == "tensor":
991
+ video = paddle.to_tensor(data=video)
992
+ if not return_dict:
993
+ return video
994
+ return HotshotPipelineXLOutput(videos=video)
995
+
996
def decode_latents(self, latents):
    """Decode video latents into a float32 NumPy array.

    The einops patterns used below treat ``latents`` as a 5-D tensor laid out
    as ``(batch, channels, frames, height, width)``.

    Args:
        latents: Latent tensor to decode.

    Returns:
        The decoded video as a float32 NumPy array in ``b c f h w`` layout,
        with values clipped to ``[0, 1]``.
    """
    num_frames = tuple(latents.shape)[2]
    # Undo the scaling applied to the latents (keeps the original operation order).
    unscaled = 1 / self.vae.config.scaling_factor * latents
    # Fold the frame axis into the batch axis so the VAE sees plain images.
    flat_latents = rearrange(unscaled, "b c f h w -> (b f) c h w")
    total_frames = tuple(flat_latents.shape)[0]
    # Decode one frame per VAE call; tqdm reports progress over the frames.
    decoded_frames = [
        self.vae.decode(flat_latents[idx : idx + 1]).sample for idx in tqdm(range(total_frames))
    ]
    stacked = paddle.concat(x=decoded_frames)
    frames = rearrange(stacked, "(b f) c h w -> b c f h w", f=num_frames)
    # Map the decoder output from [-1, 1] to [0, 1] and clip anything outside.
    frames = (frames / 2.0 + 0.5).clip(min=0, max=1)
    return frames.cpu().astype(dtype="float32").numpy()
1008
+
1009
def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], **kwargs):
    """Load LoRA weights into the UNet and, when present, both text encoders.

    Args:
        pretrained_model_name_or_path_or_dict: Forwarded unchanged to
            ``self.lora_state_dict`` together with ``unet_config`` and ``kwargs``.
        **kwargs: Extra keyword arguments forwarded to ``self.lora_state_dict``.
    """
    state_dict, network_alphas = self.lora_state_dict(
        pretrained_model_name_or_path_or_dict, unet_config=self.unet.config, **kwargs
    )
    self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet)

    # The attribute holding each text encoder has the same name as its key prefix.
    for encoder_name in ("text_encoder", "text_encoder_2"):
        marker = f"{encoder_name}."
        encoder_weights = {key: value for key, value in state_dict.items() if marker in key}
        if not encoder_weights:
            # No LoRA entries for this encoder in the checkpoint.
            continue
        self.load_lora_into_text_encoder(
            encoder_weights,
            network_alphas=network_alphas,
            text_encoder=getattr(self, encoder_name),
            prefix=encoder_name,
            lora_scale=self.lora_scale,
        )
1032
+
1033
@classmethod
def save_lora_weights(
    cls,
    save_directory: Union[str, os.PathLike],
    unet_lora_layers: Dict[str, Union[paddle.nn.Layer, paddle.Tensor]] = None,
    text_encoder_lora_layers: Dict[str, Union[paddle.nn.Layer, paddle.Tensor]] = None,
    text_encoder_2_lora_layers: Dict[str, Union[paddle.nn.Layer, paddle.Tensor]] = None,
    is_main_process: bool = True,
    weight_name: str = None,
    save_function: Callable = None,
    safe_serialization: bool = True,
):
    """Pack the given LoRA layers into one state dict and write it out.

    Each non-``None`` group is stored under its own key prefix (``unet``,
    ``text_encoder``, ``text_encoder_2``), matching the prefixes that
    ``load_lora_weights`` filters on.

    Args:
        save_directory: Forwarded to ``write_lora_layers``.
        unet_lora_layers: UNet LoRA layers, as a ``paddle.nn.Layer`` or a
            mapping of names to parameters.
        text_encoder_lora_layers: LoRA layers of the first text encoder.
        text_encoder_2_lora_layers: LoRA layers of the second text encoder.
        is_main_process: Forwarded to ``write_lora_layers``.
        weight_name: Forwarded to ``write_lora_layers``.
        save_function: Forwarded to ``write_lora_layers``.
        safe_serialization: Forwarded to ``write_lora_layers``.

    Raises:
        ValueError: If none of the three layer groups is provided.
    """
    layer_groups = (
        ("unet", unet_lora_layers),
        ("text_encoder", text_encoder_lora_layers),
        ("text_encoder_2", text_encoder_2_lora_layers),
    )
    # Previously a missing `unet_lora_layers` (its default) crashed with an
    # AttributeError on `None.items()`; fail early with a clear message instead.
    if all(layers is None for _, layers in layer_groups):
        raise ValueError(
            "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or "
            "`text_encoder_2_lora_layers`."
        )

    def pack_weights(layers, prefix):
        # A Layer contributes its state dict; anything else is used as a mapping as-is.
        layers_weights = layers.state_dict() if isinstance(layers, paddle.nn.Layer) else layers
        return {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()}

    state_dict = {}
    # Every group is saved independently, so supplying only one text encoder no
    # longer silently drops its weights.
    for prefix, layers in layer_groups:
        if layers is not None:
            state_dict.update(pack_weights(layers, prefix))

    cls.write_lora_layers(
        state_dict=state_dict,
        save_directory=save_directory,
        is_main_process=is_main_process,
        weight_name=weight_name,
        save_function=save_function,
        safe_serialization=safe_serialization,
    )
1064
+
1065
def _remove_text_encoder_monkey_patch(self):
    """Remove the text-encoder monkey patch from both text encoders, first then second."""
    for encoder_attr in ("text_encoder", "text_encoder_2"):
        # Resolve each attribute just before its call, as the original did.
        self._remove_text_encoder_monkey_patch_classmethod(getattr(self, encoder_attr))
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from ...utils import (
18
+ PPDIFFUSERS_SLOW_IMPORT,
19
+ OptionalDependencyNotAvailable,
20
+ _LazyModule,
21
+ get_objects_from_module,
22
+ is_paddle_available,
23
+ is_paddlenlp_available,
24
+ )
25
+
26
# Placeholder objects exposed when paddle / paddlenlp are not installed.
_dummy_objects = {}
# Maps each submodule name to the public names it provides; handed to _LazyModule.
_import_structure = {}

try:
    if not (is_paddlenlp_available() and is_paddle_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_paddle_and_paddlenlp_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_paddle_and_paddlenlp_objects))
else:
    # Both backends are present: register the real pipelines for lazy import.
    _import_structure.update(
        {
            "pipeline_latent_diffusion": ["LDMBertModel", "LDMBertConfig", "LDMTextToImagePipeline"],
            "pipeline_latent_diffusion_uvit": ["LDMTextToImageUViTPipeline"],
            "pipeline_latent_diffusion_largedit": ["LDMTextToImageLargeDiTPipeline"],
            "pipeline_latent_diffusion_superresolution": ["LDMSuperResolutionPipeline"],
        }
    )


if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
    # Eager path: import the real classes (or the dummy stand-ins) right away.
    try:
        if not (is_paddlenlp_available() and is_paddle_available()):
            raise OptionalDependencyNotAvailable()

    except OptionalDependencyNotAvailable:
        from ...utils.dummy_paddle_and_paddlenlp_objects import *
    else:
        from .pipeline_latent_diffusion import (
            LDMBertConfig,
            LDMBertModel,
            LDMTextToImagePipeline,
        )
        from .pipeline_latent_diffusion_largedit import LDMTextToImageLargeDiTPipeline
        from .pipeline_latent_diffusion_superresolution import (
            LDMSuperResolutionPipeline,
        )
        from .pipeline_latent_diffusion_uvit import LDMTextToImageUViTPipeline

else:
    import sys

    # Lazy path: replace this module in sys.modules with a _LazyModule built
    # from the import structure registered above.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    # Attach the dummy placeholders (if any) to the replacement module.
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import paddle
20
+ import paddle.nn as nn
21
+ from paddle.amp.auto_cast import amp_state
22
+ from paddle.distributed.fleet.utils import recompute
23
+ from paddlenlp.transformers.activations import ACT2FN
24
+ from paddlenlp.transformers.model_outputs import BaseModelOutput
25
+ from paddlenlp.utils.converter import StateDictNameMapping
26
+
27
+ from ppdiffusers.transformers import BertTokenizer, PretrainedConfig, PretrainedModel
28
+
29
+ from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
30
+ from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
31
+ from ...utils import logging
32
+ from ...utils.paddle_utils import randn_tensor
33
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
34
+
35
+
36
+ class LDMTextToImagePipeline(DiffusionPipeline):
37
+ r"""
38
+ Pipeline for text-to-image generation using latent diffusion.
39
+
40
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
41
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
42
+
43
+ Parameters:
44
+ vqvae ([`VQModel`]):
45
+ Vector-quantized (VQ) model to encode and decode images to and from latent representations.
46
+ bert ([`LDMBertModel`]):
47
+ Text-encoder model based on [`~transformers.BERT`].
48
+ tokenizer ([`~transformers.BertTokenizer`]):
49
+ A `BertTokenizer` to tokenize text.
50
+ unet ([`UNet2DConditionModel`]):
51
+ A `UNet2DConditionModel` to denoise the encoded image latents.
52
+ scheduler ([`SchedulerMixin`]):
53
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
54
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
55
+ """
56
+
57
+ model_cpu_offload_seq = "bert->unet->vqvae"
58
+
59
+ def __init__(
60
+ self,
61
+ vqvae: Union[VQModel, AutoencoderKL],
62
+ bert: PretrainedConfig,
63
+ tokenizer: BertTokenizer,
64
+ unet: Union[UNet2DModel, UNet2DConditionModel],
65
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
66
+ ):
67
+ super().__init__()
68
+ self.register_modules(vqvae=vqvae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
69
+ self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1)
70
+
71
@paddle.no_grad()
def __call__(
    self,
    prompt: Union[str, List[str]],
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: Optional[int] = 50,
    guidance_scale: Optional[float] = 1.0,
    eta: Optional[float] = 0.0,
    generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
    latents: Optional[paddle.Tensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    **kwargs,
) -> Union[Tuple, ImagePipelineOutput]:
    r"""
    Generate images from text prompts with the latent diffusion pipeline.

    Args:
        prompt (`str` or `List[str]`):
            The prompt or prompts to guide the image generation.
        height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The height in pixels of the generated image. Must be divisible by 8.
        width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The width in pixels of the generated image. Must be divisible by 8.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps handed to the scheduler.
        guidance_scale (`float`, *optional*, defaults to 1.0):
            Classifier-free guidance weight. Guidance is skipped entirely when the value is exactly `1.0`.
        eta (`float`, *optional*, defaults to 0.0):
            Forwarded to `scheduler.step` only when that method declares an `eta` parameter.
        generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
            Generator(s) used to sample the initial latents. A list must have one entry per prompt.
        latents (`paddle.Tensor`, *optional*):
            Pre-generated noisy latents. When omitted they are sampled with `generator`.
        output_type (`str`, *optional*, defaults to `"pil"`):
            `"pil"` converts the result to PIL images; any other value leaves it as a NumPy array.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.

    Example:

    ```py
    >>> from ppdiffusers import DiffusionPipeline

    >>> # load model and scheduler
    >>> ldm = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")

    >>> # run pipeline in inference (sample random noise and denoise)
    >>> prompt = "A painting of a squirrel eating a burger"
    >>> images = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images

    >>> # save images
    >>> for idx, image in enumerate(images):
    ...     image.save(f"squirrel-{idx}.png")
    ```

    Returns:
        [`~pipelines.ImagePipelineOutput`] or `tuple`:
            If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
            returned where the first element is a list with the generated images.

    Raises:
        ValueError: If `prompt` is neither `str` nor `list`, if `height`/`width` are not divisible by 8, if the
            number of generators does not match the batch size, or if `latents` has an unexpected shape.
    """
    # 0. Fall back to the UNet's native resolution (evaluated lazily, only when needed).
    if not height:
        height = self.unet.config.sample_size * self.vae_scale_factor
    if not width:
        width = self.unet.config.sample_size * self.vae_scale_factor

    # 1. Validate the inputs.
    if not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
    batch_size = 1 if isinstance(prompt, str) else len(prompt)

    if any(side % 8 != 0 for side in (height, width)):
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    def embed_text(texts):
        # Tokenize to the tokenizer's maximum length and run the text encoder.
        tokens = self.tokenizer(
            texts,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pd",
        )
        return self.bert(tokens.input_ids)[0]

    # 2. Encode the text. The empty-prompt embeddings are only needed for classifier-free guidance.
    use_guidance = guidance_scale != 1.0
    if use_guidance:
        negative_prompt_embeds = embed_text([""] * batch_size)
    prompt_embeds = embed_text(prompt)

    # 3. Prepare the initial latents unless the caller supplied them.
    latents_shape = [batch_size, self.unet.config.in_channels, height // 8, width // 8]
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is None:
        latents = randn_tensor(latents_shape, generator=generator, dtype=prompt_embeds.dtype)
    elif latents.shape != latents_shape:
        raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")

    self.scheduler.set_timesteps(num_inference_steps)

    # 4. Not every scheduler's `step` accepts `eta`, so only pass it when declared.
    step_parameters = set(inspect.signature(self.scheduler.step).parameters.keys())
    extra_kwargs = {"eta": eta} if "eta" in step_parameters else {}

    # The conditioning does not change between steps: with guidance, the unconditional and
    # text embeddings are stacked so that a single UNet call serves both branches.
    if use_guidance:
        context = paddle.concat([negative_prompt_embeds, prompt_embeds])
    else:
        context = prompt_embeds

    # 5. Denoising loop.
    for t in self.progress_bar(self.scheduler.timesteps):
        latents_input = paddle.concat([latents] * 2) if use_guidance else latents

        # predict the noise residual
        noise_pred = self.unet(latents_input, t, encoder_hidden_states=context).sample

        if use_guidance:
            noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)

        # compute the previous noisy sample x_t -> x_t-1
        latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample

    # 6. Undo the latent scaling and decode with the VQ-VAE.
    latents = 1 / self.vqvae.config.scaling_factor * latents
    image = self.vqvae.decode(latents).sample

    # 7. Map from [-1, 1] to [0, 1] and move channels last.
    image = (image / 2 + 0.5).clip(0, 1)
    image = image.astype("float32").transpose([0, 2, 3, 1]).cpu().numpy()
    if output_type == "pil":
        image = self.numpy_to_pil(image)

    if return_dict:
        return ImagePipelineOutput(images=image)
    return (image,)
228
+
229
+
230
+ ################################################################################
231
+ # Code for the text transformer model
232
+ ################################################################################
233
+ """ Paddle LDMBERT model."""
234
+
235
+
236
# Module-level logger for the LDMBert code below.
logger = logging.get_logger(__name__)

# Known LDMBert model identifiers.
# See all LDMBert models at https://huggingface.co/models?filter=ldmbert
LDMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["ldm-bert"]

# Model identifier -> URL of its configuration file.
LDMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ldm-bert": "https://huggingface.co/valhalla/ldm-bert/blob/main/config.json"}
247
+
248
+
249
+ """ LDMBERT model configuration"""
250
+
251
+
252
class LDMBertConfig(PretrainedConfig):
    """Configuration of the LDMBert text encoder.

    Every constructor argument is stored as an attribute of the same name;
    `encoder_layers` is additionally exposed as `num_hidden_layers`.
    `attribute_map` declares `num_attention_heads` and `hidden_size` as aliases
    of `encoder_attention_heads` and `d_model`.
    """

    model_type = "ldmbert"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=30522,
        max_position_embeddings=77,
        encoder_layers=32,
        encoder_ffn_dim=5120,
        encoder_attention_heads=8,
        head_dim=64,
        encoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1280,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        pad_token_id=0,
        **kwargs,
    ):
        # Embedding sizes.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        # Encoder stack geometry.
        self.encoder_layers = encoder_layers
        self.num_hidden_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.head_dim = head_dim
        self.encoder_ffn_dim = encoder_ffn_dim
        self.activation_function = activation_function

        # Regularisation and initialisation.
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.classifier_dropout = classifier_dropout
        self.encoder_layerdrop = encoder_layerdrop
        self.init_std = init_std

        self.use_cache = use_cache

        # Default `return_dict` to True unless the caller set it explicitly.
        kwargs.setdefault("return_dict", True)
        super().__init__(pad_token_id=pad_token_id, **kwargs)
297
+
298
+
299
def _expand_mask(mask: paddle.Tensor, dtype, tgt_len: Optional[int] = None):
    """
    Turn a `[bsz, seq_len]` attention mask into an additive `[bsz, 1, tgt_seq_len, src_seq_len]` mask.

    Positions where `mask` is 1 become 0 in the result; positions where `mask` is 0 become the most
    negative finite value of `dtype`. `tgt_len` defaults to the source length.
    """
    batch, source_length = mask.shape
    if tgt_len is None:
        tgt_len = source_length

    # Broadcast the per-token mask over every target position.
    broadcast = mask[:, None, None, :].expand([batch, 1, tgt_len, source_length]).cast(dtype)

    # After inversion, non-zero entries mark the positions to be suppressed.
    inverted = 1.0 - broadcast
    suppressed = inverted.cast(paddle.bool)

    return paddle.masked_fill(inverted, suppressed, paddle.finfo(dtype).min)
311
+
312
+
313
class LDMBertAttention(nn.Layer):
    """Multi-headed attention from 'Attention Is All You Need' paper.

    Queries, keys and values are projected from `embed_dim` to
    `inner_dim = num_heads * head_dim` features, and the output projection maps
    `inner_dim` back to `embed_dim`.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        head_dim: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = False,
    ):
        """
        Args:
            embed_dim: Feature size of the layer's input and output.
            num_heads: Number of attention heads.
            head_dim: Feature size of each head.
            dropout: Dropout probability applied to the attention probabilities.
            is_decoder: When True, `forward` returns the key/value states as a cache.
            bias: Passed as `bias_attr` to the query/key/value projections.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = head_dim
        self.inner_dim = head_dim * num_heads

        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, self.inner_dim, bias_attr=bias)
        self.v_proj = nn.Linear(embed_dim, self.inner_dim, bias_attr=bias)
        self.q_proj = nn.Linear(embed_dim, self.inner_dim, bias_attr=bias)
        self.out_proj = nn.Linear(self.inner_dim, embed_dim)

    def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int):
        """Reshape `[bsz, seq_len, inner_dim]` to `[bsz, num_heads, seq_len, head_dim]`."""
        return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])

    def forward(
        self,
        hidden_states: paddle.Tensor,
        key_value_states: Optional[paddle.Tensor] = None,
        past_key_value: Optional[Tuple[paddle.Tensor]] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
        """Input shape: Batch x Time x Channel

        Args:
            hidden_states: Query source of shape `[bsz, tgt_len, embed_dim]`.
            key_value_states: When given, keys and values are taken from it (cross-attention)
                instead of from `hidden_states`.
            past_key_value: Cached `(key_states, value_states)`. They are reused as-is for
                cross-attention and prepended along the sequence axis for self-attention.
            attention_mask: Additive mask of shape `[bsz, 1, tgt_len, src_len]`.
            output_attentions: Whether to also return the attention probabilities.

        Returns:
            `(attn_output, attn_weights, past_key_value)`. `attn_weights` has shape
            `[bsz, num_heads, tgt_len, src_len]` or is `None` when `output_attentions` is False.
            `past_key_value` is replaced by the current key/value states when `is_decoder` is
            True and is otherwise returned unchanged.

        Raises:
            ValueError: If the attention weights, the mask or the attention output have an
                unexpected shape.
        """
        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.shape

        # Scale the queries up front, which is equivalent to scaling the attention scores.
        query_states = self.q_proj(hidden_states) * self.scaling

        if is_cross_attention and past_key_value is not None:
            # Cross-attention keys/values do not change between calls: reuse the cache.
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        else:
            kv_source = key_value_states if is_cross_attention else hidden_states
            key_states = self._shape(self.k_proj(kv_source), -1, bsz)
            value_states = self._shape(self.v_proj(kv_source), -1, bsz)
            if past_key_value is not None:
                # Only reachable for self-attention: extend the cached states with the new ones.
                key_states = paddle.concat([past_key_value[0], key_states], axis=2)
                value_states = paddle.concat([past_key_value[1], value_states], axis=2)

        if self.is_decoder:
            # Hand the (possibly extended) key/value states back so that later calls can
            # reuse them for cross-attention or extend them for self-attention.
            # For the bi-directional encoder `past_key_value` is always `None`.
            past_key_value = (key_states, value_states)

        # Fold the head axis into the batch axis for the batched matmuls.
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape)
        key_states = key_states.reshape(proj_shape)
        value_states = value_states.reshape(proj_shape)

        src_len = key_states.shape[1]
        attn_weights = paddle.matmul(query_states, key_states, transpose_y=True)

        if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]:
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.shape}"
            )

        if attention_mask is not None:
            if attention_mask.shape != [bsz, 1, tgt_len, src_len]:
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.shape}"
                )
            # Unfold the head axis so the mask broadcasts over heads, then fold it back.
            attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attention_mask
            attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len])

        attn_weights = nn.functional.softmax(attn_weights, axis=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len])
            attn_weights = attn_weights_reshaped.reshape([bsz * self.num_heads, tgt_len, src_len])
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = paddle.matmul(attn_probs, value_states)

        if attn_output.shape != [bsz * self.num_heads, tgt_len, self.head_dim]:
            # Report the shape that is actually checked (heads folded into the batch axis).
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.shape}"
            )

        attn_output = attn_output.reshape([bsz, self.num_heads, tgt_len, self.head_dim])
        attn_output = attn_output.transpose([0, 2, 1, 3])

        # Merge the heads back into a single `inner_dim` feature axis; `out_proj` then maps
        # `inner_dim` to `embed_dim`.
        attn_output = attn_output.reshape([bsz, tgt_len, self.inner_dim])

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value
445
+
446
+
447
class LDMBertEncoderLayer(nn.Layer):
    """One pre-norm encoder block: self-attention followed by a two-layer feed-forward network."""

    def __init__(self, config: LDMBertConfig):
        super().__init__()
        self.embed_dim = config.d_model

        # Self-attention sub-block.
        self.self_attn = LDMBertAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            head_dim=config.head_dim,
            dropout=config.attention_dropout,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        # Feed-forward sub-block.
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: paddle.Tensor,
        attention_mask: paddle.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor]]:
        """
        Args:
            hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`paddle.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to also return the attention weights of the self-attention sub-block.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
        """
        # --- self-attention with residual connection ---
        attn_out, attn_weights, _ = self.self_attn(
            hidden_states=self.self_attn_layer_norm(hidden_states),
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        attn_out = nn.functional.dropout(attn_out, p=self.dropout, training=self.training)
        hidden_states = hidden_states + attn_out

        # --- feed-forward with residual connection ---
        ffn_out = self.final_layer_norm(hidden_states)
        ffn_out = self.activation_fn(self.fc1(ffn_out))
        ffn_out = nn.functional.dropout(ffn_out, p=self.activation_dropout, training=self.training)
        ffn_out = self.fc2(ffn_out)
        ffn_out = nn.functional.dropout(ffn_out, p=self.dropout, training=self.training)
        hidden_states = hidden_states + ffn_out

        # Under AMP with float16 activations, clip to just inside the finite range
        # whenever inf/nan values are detected.
        if amp_state() and hidden_states.dtype == paddle.float16:
            has_non_finite = paddle.isinf(hidden_states).any() or paddle.isnan(hidden_states).any()
            if has_non_finite:
                clamp_value = paddle.finfo(hidden_states.dtype).max - 1000
                hidden_states = paddle.clip(hidden_states, min=-clamp_value, max=clamp_value)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
512
+
513
+
514
class LDMBertPretrainedModel(PretrainedModel):
    """Base class of the LDMBert models: weight-name mappings, weight initialisation and dummy inputs."""

    config_class = LDMBertConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"]

    # Renaming table from an older parameter naming scheme to the current one.
    # NOTE(review): how "key" is consumed is decided by the base class — confirm there.
    _deprecated_dict = {
        "key": "encoder.layers",
        "name_mapping": {
            "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
            "embeddings.position_embeddings.weight": "model.embed_positions.weight",
            "final_layer_norm.": "model.layer_norm.",
            "encoder.layers.": "model.layers.",
            ".norm1.": ".self_attn_layer_norm.",
            ".norm2.": ".final_layer_norm.",
            ".linear1.": ".fc1.",
            ".linear2.": ".fc2.",
        },
    }

    @classmethod
    def _get_name_mappings(cls, config):
        """Build the list of `StateDictNameMapping` objects for converting a state dict.

        Every parameter maps to the same name on both sides. The weights of all
        sub-layers except the layer norms are flagged with the `"transpose"` action.
        When `"LDMBertModel"` is among the architectures (those of `config` plus
        this class), all names get the `model.` prefix and the `to_logits` head is
        appended.

        Args:
            config: Model configuration; `architectures` and `num_hidden_layers` are read.

        Returns:
            A list of `StateDictNameMapping`, indexed in creation order.
        """
        # `architectures` may be unset on a freshly created config.
        architectures = list(config.architectures or []) + [cls.__name__]

        model_mappings = [
            ["embed_tokens.weight", "embed_tokens.weight"],
            ["embed_positions.weight", "embed_positions.weight"],
            # final layer norm
            ["layer_norm.weight", "layer_norm.weight"],
            ["layer_norm.bias", "layer_norm.bias"],
        ]
        sublayer_names = [
            "self_attn.q_proj",
            "self_attn.k_proj",
            "self_attn.v_proj",
            "self_attn.out_proj",
            "self_attn_layer_norm",
            "final_layer_norm",
            "fc1",
            "fc2",
        ]
        for layer_index in range(config.num_hidden_layers):
            for name in sublayer_names:
                # Layer-norm weights are 1-D; every other weight here is a linear weight.
                action = None if "layer_norm" in name else "transpose"
                prefix = f"layers.{layer_index}.{name}"
                model_mappings.append([f"{prefix}.weight", f"{prefix}.weight", action])
                model_mappings.append([f"{prefix}.bias", f"{prefix}.bias"])

        if "LDMBertModel" in architectures:
            for mapping in model_mappings:
                mapping[0] = "model." + mapping[0]
                mapping[1] = "model." + mapping[1]

            # `list.extend` takes a single iterable, so the two mappings go in one list.
            model_mappings.extend(
                [
                    ["to_logits.weight", "to_logits.weight", "transpose"],
                    ["to_logits.bias", "to_logits.bias"],
                ]
            )

        return [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)]

    @paddle.no_grad()
    def _init_weights(self, module):
        """Initialise `nn.Linear` / `nn.Embedding` weights from N(0, `config.init_std`).

        Linear biases are zeroed, and so is the embedding row at `padding_idx` when the
        module defines one. Other module types are left untouched.
        """
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                module.bias.zero_()
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if hasattr(module, "padding_idx") and module.padding_idx is not None:
                module.weight[module.padding_idx] = 0

    @property
    def dummy_inputs(self):
        """A small two-sequence batch of `input_ids` with the matching `attention_mask`."""
        pad_token = self.config.pad_token_id
        input_ids = paddle.to_tensor(
            [[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]],
        )
        dummy_inputs = {
            "attention_mask": (input_ids != pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs
608
+
609
+
610
class LDMBertEncoder(LDMBertPretrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`LDMBertEncoderLayer`]. Token and learned position embeddings are summed before the first layer and a
    final layer norm is applied after the last one.

    Args:
        config: LDMBertConfig
    """

    def __init__(self, config: LDMBertConfig):
        super().__init__(config)

        self.dropout = config.dropout
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings

        hidden_dim = config.d_model
        self.embed_tokens = nn.Embedding(config.vocab_size, hidden_dim)
        self.embed_positions = nn.Embedding(config.max_position_embeddings, hidden_dim)
        self.layers = nn.LayerList([LDMBertEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layer_norm = nn.LayerNorm(hidden_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding layer."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding layer with `value`."""
        self.embed_tokens = value

    def forward(
        self,
        input_ids: paddle.Tensor = None,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        inputs_embeds: Optional[paddle.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Mutually exclusive with `inputs_embeds`.
            attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
            position_ids (`paddle.Tensor`, *optional*):
                Indices into the position embedding table. Defaults to `0 .. sequence_length - 1`.
            inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Already embedded tokens, used instead of looking up `input_ids`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attention tensors of all layers. Defaults to the config value.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. Defaults to the config value.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.BaseModelOutput`] instead of a plain tuple. Defaults to
                the config value.

        Raises:
            ValueError: If both or neither of `input_ids` and `inputs_embeds` are given.
        """
        # Resolve the output switches against the configuration defaults.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Exactly one of `input_ids` / `inputs_embeds` must be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape[:-1]

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        seq_len = input_shape[1]
        if position_ids is None:
            position_ids = paddle.arange(seq_len, dtype=paddle.int64).expand((1, -1))
        embed_pos = self.embed_positions(position_ids)

        hidden_states = nn.functional.dropout(inputs_embeds + embed_pos, p=self.dropout, training=self.training)

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        def create_custom_forward(module):
            # `recompute` only forwards positional tensors, so bind `output_attentions` here.
            def custom_forward(*inputs):
                return module(*inputs, output_attentions)

            return custom_forward

        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states += (hidden_states,)

            if self.gradient_checkpointing and not hidden_states.stop_gradient:
                layer_outputs = recompute(
                    create_custom_forward(encoder_layer),
                    hidden_states,
                    attention_mask,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)

        if output_hidden_states:
            encoder_states += (hidden_states,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
            )
        return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
758
+
759
+
760
class LDMBertModel(LDMBertPretrainedModel):
    """LDMBert encoder plus a `to_logits` linear head.

    `forward` returns the encoder output unchanged; it does not apply `to_logits`.
    """

    _no_split_modules = []

    def __init__(self, config: LDMBertConfig):
        super().__init__(config)
        self.model = LDMBertEncoder(config)
        self.to_logits = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the wrapped encoder with the given arguments and return its output as-is."""
        encoder_kwargs = dict(
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return self.model(input_ids, **encoder_kwargs)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_largedit.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ from typing import Callable, List, Optional, Tuple, Union
18
+
19
+ import paddle
20
+ from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer
21
+
22
+ from ...models import AutoencoderKL, DiTLLaMAT2IModel
23
+ from ...schedulers import DDIMScheduler, DPMSolverMultistepScheduler
24
+ from ...utils import logging
25
+ from ...utils.paddle_utils import randn_tensor
26
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
+ class LDMTextToImageLargeDiTPipeline(DiffusionPipeline):
32
+ r"""
33
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
34
+ library implements for all the pipelines (such as downloading or saving, running on a particular xxxx, etc.)
35
+
36
+ Parameters:
37
+ vae ([`AutoencoderKL`]):
38
+ text_encoder (`paddlenlp.transformers.AutoModelForCausalLM`):
39
+ tokenizer (`paddlenlp.transformers.AutoTokenizer`):
40
+ transformer ([`DiTLLaMAT2IModel`]): DiTLLaMAT2IModel(LargeDiT_T2I) architecture to denoise the encoded image latents.
41
+ scheduler ([`SchedulerMixin`]):
42
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents. Can be one of
43
+ [`DDIMScheduler`], [`DPMSolverMultistepScheduler`].
44
+ """
45
+
46
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
47
+
48
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    transformer: DiTLLaMAT2IModel,
    scheduler: Union[DDIMScheduler, DPMSolverMultistepScheduler],
):
    """Register the pipeline components and cache two derived constants."""
    super().__init__()

    components = dict(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        transformer=transformer,
        scheduler=scheduler,
    )
    self.register_modules(**components)

    # Feature size of the caption embeddings produced by the text encoder.
    self.cap_feat_dim = self.text_encoder.config.hidden_size
    # Fixed ratio between pixel and latent resolution used by `prepare_latents`.
    self.vae_scale_factor = 8
62
+
63
def _encode_prompt(self, prompt):
    r"""
    Encodes the prompt, together with the empty prompt, into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`):
            prompt to be encoded. Only the first tokenized sequence is used.

    Returns:
        `(cap_feats, tok_mask)`: the float32 decoder features of a two-row batch (row 0 is the prompt, row 1
        the empty prompt) and a boolean mask that is `True` on the real (non-padded) token positions.
    """
    sequences = (
        self.tokenizer(prompt, truncation=False, return_tensors="pd").input_ids[0],
        self.tokenizer("", truncation=False, return_tensors="pd").input_ids[0],
    )

    # Zero-pad both sequences to the longer of the two.
    padded_length = max(len(sequence) for sequence in sequences)
    tok = paddle.zeros([2, padded_length], dtype=paddle.int64)
    tok_mask = paddle.zeros_like(tok).cast("bool")

    for row, sequence in enumerate(sequences):
        tok[row, : len(sequence)] = sequence
        tok_mask[row, : len(sequence)] = True

    with paddle.no_grad():
        cap_feats = self.text_encoder.get_decoder()(input_ids=tok)[0].cast("float32")

    return cap_feats, tok_mask
88
+
89
def prepare_extra_step_kwargs(self, generator, eta):
    """Collect the optional keyword arguments that `self.scheduler.step` accepts.

    Schedulers do not share one `step` signature, so `eta` and `generator` are only
    included when `step` declares a parameter of that name.
    eta (η) corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
    and should be between [0, 1].

    Returns:
        A dict containing `eta` and/or `generator`, or an empty dict.
    """
    step_parameters = set(inspect.signature(self.scheduler.step).parameters.keys())

    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_parameters}
105
+
106
def check_inputs(
    self,
    prompt,
    height,
    width,
    callback_steps,
    prompt_embeds=None,
):
    """Validate the user-facing call arguments.

    Raises:
        ValueError: If `height`/`width` are not divisible by 8, if `callback_steps` is not a
            positive integer, if both or neither of `prompt` and `prompt_embeds` are given, or
            if `prompt` is neither a `str` nor a `list`.
    """
    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    # `None` fails the isinstance test, so it is rejected here as well.
    steps_valid = isinstance(callback_steps, int) and callback_steps > 0
    if not steps_valid:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    has_prompt = prompt is not None
    has_embeds = prompt_embeds is not None

    if has_prompt and has_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
136
+
137
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    generator,
    latents=None,
):
    """
    Return the starting latents, scaled by the scheduler's initial noise sigma.

    Args:
        batch_size: Number of latent samples.
        num_channels_latents: Number of latent channels.
        height: Image height in pixels; divided by `self.vae_scale_factor` for the latent height.
        width: Image width in pixels; divided by `self.vae_scale_factor` for the latent width.
        dtype: Data type of freshly sampled latents.
        generator: Generator (or list of generators) passed to `randn_tensor`.
        latents: Pre-generated latents; when given, no new noise is sampled.

    Returns:
        The (given or sampled) latents multiplied by `self.scheduler.init_noise_sigma`.

    Raises:
        ValueError: If `generator` is a list whose length differs from `batch_size`.
    """
    latent_hw = [side // self.vae_scale_factor for side in (height, width)]
    shape = [batch_size, num_channels_latents, *latent_hw]

    if isinstance(generator, list):
        if len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

    if latents is None:
        latents = randn_tensor(shape, generator=generator, dtype=dtype)

    # scale the initial noise by the standard deviation required by the scheduler
    return latents * self.scheduler.init_noise_sigma
165
+
166
@paddle.no_grad()
def __call__(
    self,
    prompt: Union[str, List[str]],
    height: int = 1024,
    width: int = 1024,
    num_inference_steps: int = 10,
    guidance_scale: float = 4.0,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
    latents: Optional[paddle.Tensor] = None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
    callback_steps: Optional[int] = 1,
    **kwargs,
) -> Union[Tuple, ImagePipelineOutput]:
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`):
            The prompt or prompts to guide the image generation.
        height (`int`, *optional*, defaults to 1024):
            The height in pixels of the generated image. Must be divisible by 8.
        width (`int`, *optional*, defaults to 1024):
            The width in pixels of the generated image. Must be divisible by 8.
        num_inference_steps (`int`, *optional*, defaults to 10):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        guidance_scale (`float`, *optional*, defaults to 4.0):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
            usually at the expense of lower image quality.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502.
            NOTE(review): this value is not forwarded to `self.scheduler.step` in this method.
        generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
            One or a list of paddle generator(s) used to sample the initial latents.
        latents (`paddle.Tensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. It is validated by `check_inputs`, but
            NOTE(review): the embeddings are always recomputed from `prompt` below, so a value passed here is
            overwritten.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a
            plain tuple.
        callback (`Callable`, *optional*):
            A function that will be called every `callback_steps` steps during inference. The function will be
            called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`,
            where `latents` are the current latents of the step that just finished.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function will be called.

    Returns:
        [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
        True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
    """

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        height,
        width,
        callback_steps,
        prompt_embeds,
    )
    dtype = paddle.bfloat16  # TODO: make this a parameter
    num_channels_latents = self.transformer.in_channels
    latent_channels = self.transformer.config.in_channels
    use_guidance = guidance_scale > 1

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # 3. Encode input prompt
    prompt_embeds, cap_mask = self._encode_prompt(prompt)
    prompt_embeds = prompt_embeds.cast(dtype)
    cap_mask = cap_mask.cast(dtype)

    # 4. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps)
    timesteps = self.scheduler.timesteps

    # 5. Prepare latent variables
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        generator,
        latents,
    )
    latent_model_input = paddle.concat([latents] * 2) if use_guidance else latents

    # 6. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            if use_guidance:
                # Both halves of the batch must carry the same latents; only the conditioning differs.
                half = latent_model_input[: len(latent_model_input) // 2]
                latent_model_input = paddle.concat([half, half], axis=0)
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # Build the per-sample timestep tensor under its own name so that `timesteps` (the full
            # schedule) and `dtype` are not clobbered inside the loop.
            timestep_batch = t
            if not paddle.is_tensor(timestep_batch):
                timestep_dtype = paddle.float32 if isinstance(timestep_batch, float) else paddle.int64
                timestep_batch = paddle.to_tensor([timestep_batch], dtype=timestep_dtype)
            elif len(timestep_batch.shape) == 0:
                timestep_batch = timestep_batch[None]
            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
            timestep_batch = timestep_batch.expand(
                [
                    latent_model_input.shape[0],
                ]
            )

            # predict the noise residual
            noise_pred = self.transformer(
                latent_model_input,  # [2, 4, 128, 128]
                timestep_batch,
                cap_feats=prompt_embeds,
                cap_mask=cap_mask,
                return_dict=True,
            ).sample

            # perform guidance
            if use_guidance:
                # NOTE(review): guidance is applied to the first 3 channels only, as in the original
                # code, even though `latent_channels` is available -- confirm this is intended.
                eps, rest = noise_pred[:, :3], noise_pred[:, 3:]
                cond_eps, uncond_eps = paddle.chunk(eps, 2, axis=0)
                half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
                eps = paddle.concat([half_eps, half_eps], axis=0)
                noise_pred = paddle.concat([eps, rest], axis=1)

            # learned sigma
            if self.transformer.config.out_channels // 2 == latent_channels:
                model_output, _ = paddle.split(
                    noise_pred, [latent_channels, noise_pred.shape[1] - latent_channels], axis=1
                )
            else:
                model_output = noise_pred

            # compute previous image: x_t -> x_t-1
            latent_model_input = self.scheduler.step(model_output, t, latent_model_input).prev_sample

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    # Hand the callback the latents produced by this step, not the initial noise.
                    if use_guidance:
                        current_latents = latent_model_input[: len(latent_model_input) // 2]
                    else:
                        current_latents = latent_model_input
                    callback(i, t, current_latents)

    if use_guidance:
        latents, _ = latent_model_input.chunk(2, axis=0)
    else:
        latents = latent_model_input

    # 7. Decode latents to image
    latents = 1 / self.vae.config.scaling_factor * latents
    samples = self.vae.decode(latents).sample
    samples = (samples / 2 + 0.5).clip(0, 1)
    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
    samples = samples.transpose([0, 2, 3, 1]).cast("float32").cpu().numpy()

    # 8. Convert to PIL Image
    if output_type == "pil":
        samples = self.numpy_to_pil(samples)

    if not return_dict:
        return (samples,)

    return ImagePipelineOutput(images=samples)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import paddle
20
+ import PIL.Image
21
+
22
+ from ...models import UNet2DModel, VQModel
23
+ from ...schedulers import (
24
+ DDIMScheduler,
25
+ DPMSolverMultistepScheduler,
26
+ EulerAncestralDiscreteScheduler,
27
+ EulerDiscreteScheduler,
28
+ LMSDiscreteScheduler,
29
+ PNDMScheduler,
30
+ )
31
+ from ...utils import PIL_INTERPOLATION
32
+ from ...utils.paddle_utils import randn_tensor
33
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
34
+
35
+
36
def preprocess(image):
    """
    Convert a PIL image into a batched paddle tensor for the super-resolution pipeline.

    The image is resized so that both sides are integer multiples of 32 (rounded down), converted to
    float32, divided by 255, given a leading batch axis, reordered with `transpose(0, 3, 1, 2)` and
    finally rescaled with `2 * x - 1`.

    Args:
        image: Object exposing `.size` and `.resize` (a `PIL.Image.Image`).
            Assumes the array form is height x width x channels -- TODO confirm for non-RGB inputs.

    Returns:
        `paddle.Tensor`: The preprocessed image batch containing a single image.
    """
    width, height = image.size
    # resize to integer multiple of 32
    target_size = (width - width % 32, height - height % 32)
    resized = image.resize(target_size, resample=PIL_INTERPOLATION["lanczos"])
    pixels = np.array(resized).astype(np.float32) / 255.0
    batch = pixels[None].transpose(0, 3, 1, 2)
    tensor = paddle.to_tensor(batch)
    return 2.0 * tensor - 1.0
44
+
45
+
46
class LDMSuperResolutionPipeline(DiffusionPipeline):
    r"""
    A pipeline for image super-resolution using latent diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        vqvae ([`VQModel`]):
            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
        unet ([`UNet2DModel`]):
            A `UNet2DModel` to denoise the encoded image.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`],
            [`EulerAncestralDiscreteScheduler`], [`DPMSolverMultistepScheduler`], or [`PNDMScheduler`].
    """

    def __init__(
        self,
        vqvae: VQModel,
        unet: UNet2DModel,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
            LMSDiscreteScheduler,
            EulerDiscreteScheduler,
            EulerAncestralDiscreteScheduler,
            DPMSolverMultistepScheduler,
        ],
    ):
        """Register the VQ-VAE, the U-Net and the scheduler as pipeline modules."""
        super().__init__()
        self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler)

    @paddle.no_grad()
    def __call__(
        self,
        image: Union[paddle.Tensor, PIL.Image.Image] = None,
        batch_size: Optional[int] = 1,
        num_inference_steps: Optional[int] = 100,
        eta: Optional[float] = 0.0,
        generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ) -> Union[Tuple, ImagePipelineOutput]:
        r"""
        The call function to the pipeline for generation.

        Args:
            image (`paddle.Tensor` or `PIL.Image.Image`):
                `Image` or tensor representing an image batch to be used as the starting point for the process.
            batch_size (`int`, *optional*, defaults to 1):
                Number of images to generate. The value is recomputed from `image` (1 for a PIL image, the
                leading dimension for a tensor), so the argument itself has no effect.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. It is
                forwarded only when the scheduler's `step` accepts an `eta` argument.
            generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
                A [`paddle.Generator`] used to sample the initial latents.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.

        Example:

        ```py
        >>> import requests
        >>> from PIL import Image
        >>> from io import BytesIO
        >>> from ppdiffusers import LDMSuperResolutionPipeline
        >>> import paddle

        >>> # load model and scheduler
        >>> pipeline = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")

        >>> # let's download an image
        >>> url = (
        ...     "https://user-images.githubusercontent.com/38061659/199705896-b48e17b8-b231-47cd-a270-4ffa5a93fa3e.png"
        ... )
        >>> response = requests.get(url)
        >>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
        >>> low_res_img = low_res_img.resize((128, 128))

        >>> # run pipeline in inference (sample random noise and denoise)
        >>> upscaled_image = pipeline(low_res_img, num_inference_steps=100, eta=1).images[0]
        >>> # save image
        >>> upscaled_image.save("ldm_generated_image.png")
        ```

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images

        Raises:
            ValueError: If `image` is neither a `PIL.Image.Image` nor a `paddle.Tensor`.
        """
        if isinstance(image, PIL.Image.Image):
            batch_size = 1
            image = preprocess(image)
        elif isinstance(image, paddle.Tensor):
            batch_size = image.shape[0]
        else:
            raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `paddle.Tensor` but is {type(image)}")

        height, width = image.shape[-2:]

        # in_channels should be 6: 3 for latents, 3 for low resolution image
        latent_channels = self.unet.config.in_channels // 2
        latents_shape = (batch_size, latent_channels, height, width)
        latents_dtype = self.unet.dtype

        latents = randn_tensor(latents_shape, generator=generator, dtype=latents_dtype)
        image = image.cast(dtype=latents_dtype)

        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps)
        timesteps_tensor = self.scheduler.timesteps

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma

        # Not all schedulers share the same `step` signature: eta (η, see the DDIM paper
        # https://arxiv.org/abs/2010.02502, expected in [0, 1]) is forwarded only when accepted.
        step_params = inspect.signature(self.scheduler.step).parameters
        extra_kwargs = {"eta": eta} if "eta" in step_params else {}

        for t in self.progress_bar(timesteps_tensor):
            # concat latents and low resolution image in the channel dimension.
            image = image.cast(latents.dtype)
            model_input = paddle.concat([latents, image], axis=1)
            model_input = self.scheduler.scale_model_input(model_input, t)
            # predict the noise residual
            noise_pred = self.unet(model_input, t).sample
            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample

        # decode the image latents with the VQVAE
        decoded = self.vqvae.decode(latents).sample
        decoded = paddle.clip(decoded, -1.0, 1.0)
        decoded = decoded / 2 + 0.5
        image = decoded.transpose([0, 2, 3, 1]).cast("float32").cpu().numpy()

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if return_dict:
            return ImagePipelineOutput(images=image)
        return (image,)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_uvit.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ from typing import Callable, List, Optional, Tuple, Union
18
+
19
+ import paddle
20
+
21
+ from ppdiffusers.transformers import CLIPTextModel, CLIPTokenizer
22
+
23
+ from ...models import AutoencoderKL, UViTT2IModel, VQModel
24
+ from ...schedulers import DDIMScheduler, DPMSolverMultistepScheduler
25
+ from ...utils import logging
26
+ from ...utils.paddle_utils import randn_tensor
27
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+
32
+ class LDMTextToImageUViTPipeline(DiffusionPipeline):
33
+ r"""
34
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
35
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
36
+
37
+ Parameters:
38
+ vqvae ([`VQModel`]):
39
+ Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
40
+ bert ([`LDMBertModel`]):
41
+ Text-encoder model based on [BERT](https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.bert.modeling.html#paddlenlp.transformers.bert.modeling.BertModel) architecture.
42
+ tokenizer (`paddlenlp.transformers.BertTokenizer`):
43
+ Tokenizer of class
44
+ [BertTokenizer](https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.bert.tokenizer.html#paddlenlp.transformers.bert.tokenizer.BertTokenizer).
45
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
46
+ scheduler ([`SchedulerMixin`]):
47
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
48
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`]
49
+ or [`DPMSolverMultistepScheduler`].
50
+ """
51
+
52
+ model_cpu_offload_seq = "text_encoder->unet->vqvae"
53
+
54
def __init__(
    self,
    vqvae: Union[VQModel, AutoencoderKL],
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UViTT2IModel,
    scheduler: Union[DDIMScheduler, DPMSolverMultistepScheduler],
):
    """
    Register the pipeline components.

    Args:
        vqvae: Autoencoder used to decode latents into images.
        text_encoder: Text encoder producing the prompt embeddings.
        tokenizer: Tokenizer for `text_encoder`; its `model_max_length` is capped at 77 in place.
        unet: Denoising model.
        scheduler: Scheduler used together with `unet`.
    """
    super().__init__()

    max_prompt_tokens = 77
    if tokenizer.model_max_length > max_prompt_tokens:
        tokenizer.model_max_length = max_prompt_tokens

    self.register_modules(
        vqvae=vqvae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
    )
    self.vae_scale_factor = 8  # 2 ** (len(self.vqvae.config.block_out_channels) - 1)
69
+
70
def _encode_prompt(
    self,
    prompt,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead; when neither is given, empty strings are used.
            Ignored when classifier free guidance is disabled.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.

    Returns:
        `paddle.Tensor`: The prompt embeddings repeated `num_images_per_prompt` times. With classifier free
        guidance enabled, the negative embeddings are concatenated in front of them along the batch axis.

    Raises:
        TypeError: If `prompt` is given and `negative_prompt` has a different type.
        ValueError: If `negative_prompt` is a list whose length differs from the batch size.
    """
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pd",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids

        # Warn when truncation actually dropped part of the prompt.
        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
            text_input_ids, untruncated_ids
        ):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        prompt_embeds = self.text_encoder(
            text_input_ids,
        )
        prompt_embeds = prompt_embeds[0]

    prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
    prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            # Only comparable when a prompt was passed; with `prompt_embeds` alone, `prompt` is None.
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            # Repeat the single negative prompt so it matches a batch supplied through `prompt_embeds`.
            uncond_tokens = [negative_prompt] * batch_size
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        # Pad the negative prompt to the same sequence length as the positive embeddings.
        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pd",
        )

        negative_prompt_embeds = self.text_encoder(
            uncond_input.input_ids,
        )
        negative_prompt_embeds = negative_prompt_embeds[0]

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)

        negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
        negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])

        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])

    return prompt_embeds
192
+
193
def decode_latents(self, latents):
    """
    Decode latents with `self.vqvae` into a float32 NumPy image batch.

    The latents are multiplied by `1 / 0.18215`, decoded, mapped with `x / 2 + 0.5`, clipped to
    [0, 1] and transposed with `[0, 2, 3, 1]`.

    Args:
        latents: Latents to decode.

    Returns:
        The decoded images as a float32 NumPy array.
    """
    scaled_latents = 1 / 0.18215 * latents
    decoded = self.vqvae.decode(scaled_latents).sample
    normalized = (decoded / 2 + 0.5).clip(0, 1)
    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
    return normalized.transpose([0, 2, 3, 1]).cast("float32").numpy()
200
+
201
def prepare_extra_step_kwargs(self, generator, eta):
    """
    Build the optional keyword arguments accepted by `self.scheduler.step`.

    Not every scheduler has the same `step` signature, so each optional argument is forwarded only
    when the signature declares it.

    Args:
        generator: Random generator to forward when `step` has a `generator` parameter.
        eta: η of the DDIM paper (https://arxiv.org/abs/2010.02502), expected in [0, 1]; forwarded
            when `step` has an `eta` parameter.

    Returns:
        `dict`: A subset of `{"eta": eta, "generator": generator}`.
    """
    step_params = inspect.signature(self.scheduler.step).parameters
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_params}
217
+
218
def check_inputs(
    self,
    prompt,
    height,
    width,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
):
    """
    Validate the user-facing call arguments and raise `ValueError` on the first violation.

    Args:
        prompt: Must be a `str` or a `list` when given.
        height: Must be divisible by 8.
        width: Must be divisible by 8.
        callback_steps: Must be a positive `int`.
        negative_prompt: Must not be combined with `negative_prompt_embeds`.
        prompt_embeds: Alternative to `prompt`; exactly one of the two has to be provided.
        negative_prompt_embeds: When passed together with `prompt_embeds`, both shapes must match.

    Raises:
        ValueError: If any of the constraints above is not met.
    """
    if not (height % 8 == 0 and width % 8 == 0):
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    # `None` fails the isinstance test, so it is rejected together with non-integers and values <= 0.
    callback_steps_ok = isinstance(callback_steps, int) and callback_steps > 0
    if not callback_steps_ok:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    has_prompt = prompt is not None
    has_embeds = prompt_embeds is not None
    has_negative_embeds = negative_prompt_embeds is not None

    if has_prompt and has_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if has_embeds and has_negative_embeds and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )
264
+
265
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    generator,
    latents=None,
):
    """Return the initial latents, scaled by the scheduler's `init_noise_sigma`.

    Args:
        batch_size: Effective batch size (first dimension of the latent shape).
        num_channels_latents: Number of latent channels.
        height: Output height; divided by `self.vae_scale_factor` for the latent shape.
        width: Output width; divided by `self.vae_scale_factor` for the latent shape.
        dtype: Dtype forwarded to `randn_tensor` when noise has to be sampled.
        generator: Generator (or list of generators) forwarded to `randn_tensor`.
        latents: Optional caller-provided latents; when given, no noise is sampled.

    Raises:
        ValueError: If `generator` is a list whose length differs from `batch_size`.
    """
    downscale = self.vae_scale_factor
    latent_shape = [batch_size, num_channels_latents, height // downscale, width // downscale]

    if isinstance(generator, list):
        if len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

    if latents is None:
        latents = randn_tensor(latent_shape, generator=generator, dtype=dtype)

    # The scheduler defines the standard deviation the initial noise must have.
    return latents * self.scheduler.init_noise_sigma
293
+
294
@paddle.no_grad()
def __call__(
    self,
    prompt: Union[str, List[str]],
    height: int = 256,
    width: int = 256,
    num_inference_steps: int = 50,
    guidance_scale: float = 1.0,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
    latents: Optional[paddle.Tensor] = None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
    callback_steps: Optional[int] = 1,
    **kwargs,
) -> Union[Tuple, ImagePipelineOutput]:
    r"""
    Run the text-to-image generation loop.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, `prompt_embeds` must be
            passed instead.
        height (`int`, *optional*, defaults to 256):
            The height in pixels of the generated image. Must be divisible by 8.
        width (`int`, *optional*, defaults to 256):
            The width in pixels of the generated image. Must be divisible by 8.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps handed to the scheduler.
        guidance_scale (`float`, *optional*, defaults to 1.0):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598),
            i.e. `w` of equation 2 of the [Imagen paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance is
            only performed when `guidance_scale > 1`.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. Only used when guidance is enabled and
            `negative_prompt_embeds` is not passed.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only
            forwarded to schedulers whose `step` accepts it.
        generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
            One or a list of paddle generator(s) to make generation deterministic.
        latents (`paddle.Tensor`, *optional*):
            Pre-generated noisy latents used as the starting point. If not provided, a latents tensor is
            sampled using the supplied `generator`.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. If not provided, they are generated from `prompt`.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. If not provided, they are generated from
            `negative_prompt`.
        output_type (`str`, *optional*, defaults to `"pil"`):
            With `"pil"` the decoded images are converted with `numpy_to_pil`; any other value returns the
            output of `decode_latents` unchanged.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether to return an [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
        callback (`Callable`, *optional*):
            Called as `callback(step: int, timestep: int, latents: paddle.Tensor)` on progress updates whose
            step index is a multiple of `callback_steps`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which `callback` is invoked. Must be a positive integer.
        **kwargs:
            Accepted for call-site compatibility; not read by this method.

    Returns:
        [`~pipelines.ImagePipelineOutput`] or `tuple`: an `ImagePipelineOutput` if `return_dict` is True,
        otherwise a one-element tuple holding the generated images.
    """
    # 1. Validate the arguments before doing any work.
    self.check_inputs(
        prompt,
        height,
        width,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
    )

    # 2. Derive the batch size from whichever prompt representation was supplied.
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # `guidance_scale = 1` corresponds to doing no classifier free guidance
    # (weight `w` of equation (2) of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf).
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode the prompt.
    prompt_embeds = self._encode_prompt(
        prompt,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
    )

    # 4. Configure the scheduler's timesteps.
    self.scheduler.set_timesteps(num_inference_steps)
    timesteps = self.scheduler.timesteps

    # 5. Build (or scale) the starting latents.
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        self.unet.in_channels,
        height,
        width,
        prompt_embeds.dtype,
        generator,
        latents,
    )

    # 6. Collect scheduler-specific keyword arguments for `step`.
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7. Denoising loop.
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # Duplicate the latents so both guidance branches run in one forward pass.
            if do_classifier_free_guidance:
                latent_model_input = paddle.concat([latents] * 2)
            else:
                latent_model_input = latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # Predict the noise residual.
            unet_output = self.unet(
                latent_model_input,
                t,
                prompt_embeds,
                return_dict=True,
            )
            noise_pred = unet_output.sample

            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # Compute the previous noisy sample x_t -> x_t-1.
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

            is_last_step = i == len(timesteps) - 1
            if is_last_step or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    # 8. Decode the latents; 9. optionally convert to PIL.
    image = self.decode_latents(latents)
    if output_type == "pil":
        image = self.numpy_to_pil(image)

    if return_dict:
        return ImagePipelineOutput(images=image)
    return (image,)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import TYPE_CHECKING
17
+
18
+ from ...utils import (
19
+ PPDIFFUSERS_SLOW_IMPORT,
20
+ OptionalDependencyNotAvailable,
21
+ _LazyModule,
22
+ get_objects_from_module,
23
+ is_paddle_available,
24
+ is_paddlenlp_available,
25
+ )
26
+
27
# Placeholder objects exposed when the optional dependencies are missing.
_dummy_objects = {}
# Maps submodule name -> public names; consumed by `_LazyModule` below.
_import_structure = {"pipeline_output": ["VideoPipelineOutput"]}

try:
    if not is_paddlenlp_available() or not is_paddle_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_paddle_and_paddlenlp_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_paddle_and_paddlenlp_objects))
else:
    # Both dependencies are present: register the real pipelines for lazy loading.
    _import_structure["pipeline_latent_video_diffusion_model_text2video"] = ["LVDMTextToVideoPipeline"]
    _import_structure["pipeline_latent_video_diffusion_model_uncond"] = ["LVDMUncondPipeline"]


if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
    # Eager path: import the real classes (or the dummies) right away.
    try:
        if not is_paddlenlp_available() or not is_paddle_available():
            raise OptionalDependencyNotAvailable()

    except OptionalDependencyNotAvailable:
        from ...utils.dummy_paddle_and_paddlenlp_objects import *
    else:
        from .pipeline_latent_video_diffusion_model_text2video import (
            LVDMTextToVideoPipeline,
        )
        from .pipeline_latent_video_diffusion_model_uncond import LVDMUncondPipeline
        from .pipeline_output import VideoPipelineOutput
else:
    # Lazy path: replace this module with a `_LazyModule` built from `_import_structure`.
    import sys

    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    # Attach the dummy placeholders to the lazy module.
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py ADDED
@@ -0,0 +1,704 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import os
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import paddle
21
+ from einops import rearrange
22
+
23
+ from ppdiffusers.transformers import CLIPTextModel, CLIPTokenizer
24
+
25
+ from ...configuration_utils import FrozenDict
26
+ from ...models import LVDMAutoencoderKL, LVDMUNet3DModel
27
+ from ...schedulers import KarrasDiffusionSchedulers
28
+ from ...utils import deprecate, logging, randn_tensor, replace_example_docstring
29
+ from ..pipeline_utils import DiffusionPipeline
30
+ from . import VideoPipelineOutput
31
+ from .video_save import save_results
32
+
33
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
34
+
35
# Usage example kept as a module-level string constant.
# Fix: the sample call was missing the comma after `guidance_scale=15`, which made the
# documented snippet a syntax error when copied.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import paddle
        >>> from ppdiffusers import LVDMTextToVideoPipeline
        >>> pipe = LVDMTextToVideoPipeline.from_pretrained("westfish/lvdm_text2video_orig_webvid_2m")
        >>> seed = 2013
        >>> generator = paddle.Generator().manual_seed(seed)
        >>> samples = pipe(
                prompt="cutting in kitchen",
                num_frames=16,
                height=256,
                width=256,
                num_inference_steps=50,
                generator=generator,
                guidance_scale=15,
                eta=1,
                save_dir='.',
                save_name='ddim_lvdm_text_to_video_ucf',
                encoder_type='2d',
                scale_factor=0.18215,
                shift_factor=0,
            )
        >>> prompt = "cliff diving"
        >>> image = pipe(prompt).video[0]
        ```
"""
62
+
63
+
64
def split_video_to_clips(video, clip_length, drop_left=True):
    """Cut a video along axis 2 into clips of `clip_length` and stack the clips on the batch axis.

    Args:
        video: Array-like indexed as `[b, c, t, h, w]` (per the `rearrange` pattern used below).
        clip_length: Number of frames in each clip.
        drop_left: When True and the frame count is not a multiple of `clip_length`, the
            trailing remainder frames are sliced off (and a message is printed) before
            splitting.

    Returns:
        The `rearrange` result with layout `(b nc) c cl h w`.
    """
    original_shape = video.shape
    total_frames = original_shape[2]
    clip_count = total_frames // clip_length

    has_remainder = total_frames % clip_length != 0
    if has_remainder and drop_left:
        usable_frames = clip_count * clip_length
        video = video[:, :, :usable_frames, :, :]
        print(f"[split_video_to_clips] Drop frames from {original_shape} to {video.shape}")

    return rearrange(video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=clip_count)
73
+
74
+
75
def merge_clips_to_videos(clips, bs):
    """Fold clips stacked on the batch axis back into `bs` videos.

    Args:
        clips: Array-like with layout `(b nc) c t h w`.
        bs: Number of videos; the leading dimension is integer-divided by it to obtain
            the number of clips per video.

    Returns:
        The `rearrange` result with layout `b c (nc t) h w`.
    """
    clips_per_video = clips.shape[0] // bs
    return rearrange(clips, "(b nc) c t h w -> b c (nc t) h w", nc=clips_per_video)
79
+
80
+
81
+ class LVDMTextToVideoPipeline(DiffusionPipeline):
82
+ r"""
83
+ Pipeline for text-to-video generation using Latent Video Diffusion Model.
84
+
85
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
86
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
87
+
88
+ Args:
89
+ vae ([`LVDMAutoencoderKL`]):
90
+ Autoencoder Model to encode and decode videos to and from latent representations.
91
+ text_encoder ([`CLIPTextModel`]):
92
+ Frozen text-encoder. Stable Diffusion uses the text portion of
93
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
94
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
95
+ tokenizer (`CLIPTokenizer`):
96
+ Tokenizer of class
97
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
98
+ unet ([`LVDMUNet3DModel`]): 3D conditional U-Net architecture to denoise the encoded video latents.
99
+ scheduler ([`SchedulerMixin`]):
100
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
101
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`]
102
+ or [`DPMSolverMultistepScheduler`].
103
+ """
104
+
105
def __init__(
    self,
    vae: LVDMAutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: LVDMUNet3DModel,
    scheduler: KarrasDiffusionSchedulers,
):
    """Register the pipeline components, patching outdated scheduler configs first.

    Args:
        vae: Autoencoder used to decode latents.
        text_encoder: Text encoder producing the prompt embeddings.
        tokenizer: Tokenizer paired with `text_encoder`.
        unet: Denoising network.
        scheduler: Scheduler driving the denoising loop. If its config has
            `steps_offset != 1` or `clip_sample is True`, a deprecation notice is emitted
            and the config is overridden (`steps_offset=1`, `clip_sample=False`).
    """
    super().__init__()

    def _override_scheduler_config(key, value):
        # Replace the scheduler's frozen config with a copy carrying the corrected entry.
        patched = dict(scheduler.config)
        patched[key] = value
        scheduler._internal_dict = FrozenDict(patched)

    if getattr(scheduler.config, "steps_offset", 1) != 1:
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
            f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
            "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
            " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
            " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
            " file"
        )
        deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
        _override_scheduler_config("steps_offset", 1)

    if getattr(scheduler.config, "clip_sample", False) is True:
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
            " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
            " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
            " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
            " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
        )
        deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
        _override_scheduler_config("clip_sample", False)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
    )
149
+
150
+ # self.encoder_type = '2d'
151
+ # self.scale_factor = 0.18215
152
+ # self.shift_factor = 0
153
+
154
@paddle.no_grad()
def decode(self, z, **kwargs):
    """Undo the latent scaling/shift and run the VAE decoder.

    Args:
        z: Latents to decode.
        **kwargs: Must contain `scale_factor` and `shift_factor`; other keys are ignored.

    Returns:
        The `.sample` attribute of `self.vae.decode(...)`.

    Raises:
        KeyError: If `scale_factor` or `shift_factor` is missing from `kwargs`.
    """
    scale_factor = kwargs["scale_factor"]
    shift_factor = kwargs["shift_factor"]
    rescaled = 1.0 / scale_factor * z - shift_factor
    return self.vae.decode(rescaled).sample
159
+
160
@paddle.no_grad()
def overlapped_decode(self, z, max_z_t=None, overlap_t=2, predict_cids=False, force_not_quantize=False, **kwargs):
    """Decode latents in overlapping temporal windows and stitch the decoded frames.

    The latent tensor is cut along axis 2 into windows of `max_z_t` frames that overlap
    by `overlap_t` frames. Each window is decoded separately, moved to CPU, trimmed so
    the overlapping region is kept only once, and concatenated along axis 2.

    Fixes over the previous version:
      * `self.decode` only accepts `(z, **kwargs)`; passing `predict_cids` and
        `force_not_quantize` positionally always raised `TypeError`. The windows are now
        decoded with the forwarded `**kwargs`.
      * A window starting exactly at the end of the sequence (an empty slice) is no
        longer produced.
      * The `max_z_t > overlap_t` precondition is enforced with an exception instead of
        an `assert` (which is stripped under `-O` and would leave the windowing loop
        unable to advance).

    Args:
        z: Latents, sliced here as a 5-D tensor with time on axis 2.
        max_z_t: Window length in latent frames; defaults to the full length.
        overlap_t: Number of latent frames shared by consecutive windows.
        predict_cids: Unused; kept for backward compatibility of the signature.
        force_not_quantize: Unused; kept for backward compatibility of the signature.
        **kwargs: Forwarded to `self.decode`, which reads `scale_factor` and
            `shift_factor` from them.

    Returns:
        The decoded windows concatenated along axis 2 (on CPU).

    Raises:
        ValueError: If `max_z_t` is not strictly greater than `overlap_t`.
    """
    total_t = z.shape[2]
    if max_z_t is None:
        max_z_t = total_t
    if max_z_t <= overlap_t:
        raise ValueError(f"`max_z_t` ({max_z_t}) must be greater than `overlap_t` ({overlap_t}).")

    # NOTE(review): the factor 4 presumably is the decoder's temporal upsampling rate
    # (decoded frames per latent frame) -- confirm against the VAE in use.
    max_x_t = max_z_t * 4
    drop_r = overlap_t // 2
    drop_l = overlap_t - drop_r
    drop_r_x = drop_r * 4
    drop_l_x = drop_l * 4

    # Window start offsets; `max(total_t, 1)` keeps a single (empty) window for empty input.
    stride = max_z_t - overlap_t
    windows = [z[:, :, start : start + max_z_t, :, :] for start in range(0, max(total_t, 1), stride)]

    last_index = len(windows) - 1
    decoded = []
    for i, window in enumerate(windows):
        frames = self.decode(window, **kwargs).cpu()
        if i == 0:
            # First window: only the right-hand overlap is trimmed.
            frames = frames[:, :, : max_x_t - drop_r_x, :, :]
        elif i == last_index:
            # Last window: only the left-hand overlap is trimmed.
            frames = frames[:, :, drop_l_x:, :, :]
        else:
            frames = frames[:, :, drop_l_x : max_x_t - drop_r_x, :, :]
        decoded.append(frames)
    return paddle.concat(x=decoded, axis=2)
190
+
191
@paddle.no_grad()
def decode_first_stage_2DAE_video(self, z, decode_bs=16, return_cpu=True, **kwargs):
    """Decode video latents frame by frame with a 2-D autoencoder.

    The `(b, c, t, h, w)` latents are flattened to `(b*t, c, h, w)`, decoded in chunks of
    at most `decode_bs` frames, and reshaped back to `(b, c, t, h, w)`.

    Fix over the previous version: chunking used
    `paddle.split(z, z.shape[0] // decode_bs)`, which requests 0 sections when there are
    fewer than `decode_bs` frames and requires the frame count to be evenly divisible by
    the section count. Frames are now sliced in steps of `decode_bs`, so any frame count
    works and no chunk exceeds `decode_bs`.

    Args:
        z: 5-D latents `(b, c, t, h, w)`.
        decode_bs: Maximum number of frames decoded per call, or `None` to decode all
            frames at once.
        return_cpu: When chunking, move each decoded chunk to CPU before concatenating.
            Not applied when `decode_bs` is `None`.
        **kwargs: Forwarded to `self.decode`.

    Returns:
        Decoded frames rearranged to `(b, c, t, h, w)`.
    """
    b, _, t, _, _ = z.shape
    z = rearrange(z, "b c t h w -> (b t) c h w")
    if decode_bs is None:
        results = self.decode(z, **kwargs)
    else:
        decoded_chunks = []
        for start in range(0, z.shape[0], decode_bs):
            frames = self.decode(z[start : start + decode_bs], **kwargs)
            # Offload chunk by chunk so decoded frames do not pile up on the device.
            decoded_chunks.append(frames.cpu() if return_cpu else frames)
        results = paddle.concat(x=decoded_chunks, axis=0)
    results = rearrange(results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous()
    return results
205
+
206
@paddle.no_grad()
def decode_latents(
    self,
    z,
    decode_bs=16,
    return_cpu=True,
    bs=None,
    decode_single_video_allframes=False,
    max_z_t=None,
    overlapped_length=0,
    **kwargs
):
    """Decode 5-D video latents, dispatching on `kwargs["encoder_type"]`.

    * `encoder_type == "2d"`: delegate to `decode_first_stage_2DAE_video`.
    * otherwise: optionally split the latents (per video, or along time for
      `encoder_type == "3d"` when `max_z_t` is given), decode each piece with
      `self.decode`, and concatenate the pieces again.

    Fixes over the previous version:
      * `paddle.split` returns a `list`, but only `tuple` was accepted, so every split
        path fell through to a bare `raise ValueError`. Lists and tuples are both
        handled now.
      * Removed a second `encoder_type == "2d"` branch that could never run (the 2-D
        case returns early), unused locals and commented-out legacy code.

    Args:
        z: 5-D latents `(b, c, t, h, w)`.
        decode_bs: Frame batch size for the 2-D path.
        return_cpu: Forwarded to the 2-D path. In the other paths, split pieces are
            always moved to CPU and an unsplit tensor is returned as decoded.
        bs: Unused; kept for backward compatibility of the signature.
        decode_single_video_allframes: Decode each batch element separately.
        max_z_t: Section length (in latent frames) used to derive the number of temporal
            sections for the "3d" encoder.
        overlapped_length: Unused; kept for backward compatibility of the signature.
        **kwargs: Must contain `encoder_type`; forwarded to `self.decode`.

    Returns:
        The decoded video tensor.

    Raises:
        ValueError: If `z` is not 5-D or has an unsupported type.
        KeyError: If `encoder_type` is missing from `kwargs`.
    """
    if len(z.shape) != 5:
        raise ValueError(f"`z` must be a 5-D tensor (b, c, t, h, w) but has shape {z.shape}.")

    encoder_type = kwargs["encoder_type"]
    if encoder_type == "2d":
        return self.decode_first_stage_2DAE_video(z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs)

    cat_dim = 0
    if decode_single_video_allframes:
        # One section per batch element.
        z = paddle.split(x=z, num_or_sections=z.shape[0], axis=0)
        cat_dim = 0
    elif max_z_t is not None and encoder_type == "3d":
        z = paddle.split(x=z, num_or_sections=z.shape[2] // max_z_t, axis=2)
        cat_dim = 2

    paddle.device.cuda.empty_cache()
    if isinstance(z, (list, tuple)):
        pieces = [self.decode(z_, **kwargs).cpu() for z_ in z]
        results = paddle.concat(x=pieces, axis=cat_dim)
    elif isinstance(z, paddle.Tensor):
        results = self.decode(z, **kwargs)
    else:
        raise ValueError(f"Unsupported latent container type: {type(z)}")
    return results
252
+
253
@paddle.no_grad()
def paddle_to_np(self, x):
    """Convert a channels-first tensor in [-1, 1] to a channels-last `uint8` NumPy array.

    5-D input is permuted with `[0, 2, 3, 4, 1]`, anything else with `[0, 2, 3, 1]`.
    Values are mapped with `(x + 1) * 127.5`, clipped to `[0, 255]` and cast to `uint8`.

    The previous version selected the target dtype through a chain of
    `isinstance("uint8", ...)` checks on a string literal; only the `"uint8"` branch
    could ever execute (one dead branch even referenced `"uint8".dtype`). The chain is
    replaced by the direct cast it always resolved to.

    Args:
        x: Tensor to convert.

    Returns:
        The converted array as returned by `Tensor.numpy()`.
    """
    sample = x.detach().cpu()
    if sample.dim() == 5:
        sample = sample.transpose(perm=[0, 2, 3, 4, 1])
    else:
        sample = sample.transpose(perm=[0, 2, 3, 1])

    sample = ((sample + 1) * 127.5).clip(min=0, max=255).cast("uint8")
    return sample.numpy()
273
+
274
def _encode_prompt(
    self,
    prompt,
    num_videos_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Fixes over the previous version: when `prompt` is `None` (embeddings passed directly)
    a `negative_prompt` no longer triggers a spurious type-mismatch `TypeError`, and a
    string `negative_prompt` is repeated to the batch size so the unconditional
    embeddings line up with `prompt_embeds`.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        num_videos_per_prompt (`int`):
            number of videos that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the generation. Only used when
            `do_classifier_free_guidance` is True and `negative_prompt_embeds` is not given; an empty
            string is used when it is `None`.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. If not provided, text embeddings will be generated from the
            `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. If not provided, they will be generated from the
            `negative_prompt` input argument.

    Returns:
        `paddle.Tensor`: the prompt embeddings repeated `num_videos_per_prompt` times. With classifier
        free guidance the unconditional embeddings are concatenated in front of them.

    Raises:
        TypeError: If `prompt` and `negative_prompt` are both given but have different types.
        ValueError: If a list `negative_prompt` does not match the batch size.
    """
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pd",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids

        # Warn when the tokenizer had to cut the prompt.
        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
            text_input_ids, untruncated_ids
        ):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = text_inputs.attention_mask
        else:
            attention_mask = None
        prompt_embeds = self.text_encoder(
            text_input_ids,
            attention_mask=attention_mask,
        )
        prompt_embeds = prompt_embeds[0]

    prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt
    prompt_embeds = prompt_embeds.tile([1, num_videos_per_prompt, 1])
    prompt_embeds = prompt_embeds.reshape([bs_embed * num_videos_per_prompt, seq_len, -1])

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            # Only comparable when a prompt was actually passed; with `prompt_embeds`
            # alone there is no prompt type to match against.
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            # Repeat so the batch matches `prompt_embeds` (a no-op for a single string prompt).
            uncond_tokens = [negative_prompt] * batch_size
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pd",
        )

        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = uncond_input.attention_mask
        else:
            attention_mask = None

        negative_prompt_embeds = self.text_encoder(
            uncond_input.input_ids,
            attention_mask=attention_mask,
        )
        negative_prompt_embeds = negative_prompt_embeds[0]

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype)

        negative_prompt_embeds = negative_prompt_embeds.tile([1, num_videos_per_prompt, 1])
        negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_videos_per_prompt, seq_len, -1])

        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])

    return prompt_embeds
407
+
408
def prepare_extra_step_kwargs(self, generator, eta):
    """Build the optional keyword arguments accepted by `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are only
    included when the signature declares a parameter of that name.
    eta (η) corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502) and
    should be between [0, 1].

    Args:
        generator: Value stored under "generator" when accepted.
        eta: Value stored under "eta" when accepted.

    Returns:
        dict: The subset of `{"eta": eta, "generator": generator}` that `step` accepts.
    """
    accepted = set(inspect.signature(self.scheduler.step).parameters.keys())
    candidates = (("eta", eta), ("generator", generator))
    return {key: value for key, value in candidates if key in accepted}
424
+
425
def check_inputs(
    self,
    prompt,
    height,
    width,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
):
    """Reject inconsistent or malformed call arguments before generation starts.

    Args:
        prompt: `str` or `list` prompt, or `None` if `prompt_embeds` is supplied.
        height: Output height; has to be divisible by 8.
        width: Output width; has to be divisible by 8.
        callback_steps: Positive `int` callback frequency.
        negative_prompt: Optional negative prompt (exclusive with `negative_prompt_embeds`).
        prompt_embeds: Optional pre-computed embeddings (exclusive with `prompt`).
        negative_prompt_embeds: Optional pre-computed negative embeddings; when passed
            together with `prompt_embeds` the shapes must be equal.

    Raises:
        ValueError: On any violated constraint.
    """
    if not (height % 8 == 0 and width % 8 == 0):
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    # A missing value, a non-int, or a non-positive int are all invalid.
    if not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    prompt_given = prompt is not None
    embeds_given = prompt_embeds is not None
    if prompt_given and embeds_given:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not (prompt_given or embeds_given):
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if prompt_given and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    negative_embeds_given = negative_prompt_embeds is not None
    if negative_prompt is not None and negative_embeds_given:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if embeds_given and negative_embeds_given:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )
471
+
472
def prepare_latents(
    self, batch_size, num_channels_latents, num_frames, height, width, dtype, generator, latents=None
):
    """Return the initial latents, scaled by the scheduler's `init_noise_sigma`.

    Args:
        batch_size: Effective batch size (first entry of the latent shape).
        num_channels_latents: Number of latent channels.
        num_frames: Number of video frames.
        height: Frame height in pixels; integer-divided by 8 for the latent shape.
        width: Frame width in pixels; integer-divided by 8 for the latent shape.
        dtype: Data type passed to `randn_tensor` when noise has to be sampled.
        generator: A generator, or a list holding exactly `batch_size` generators.
        latents: Optional pre-generated latents; used as-is instead of sampling new noise.

    Returns:
        The sampled or supplied latents multiplied by `self.scheduler.init_noise_sigma`.

    Raises:
        ValueError: If `generator` is a list whose length differs from `batch_size`.
    """
    latent_height, latent_width = height // 8, width // 8
    shape = [batch_size, num_channels_latents, num_frames, latent_height, latent_width]

    generator_count_mismatch = isinstance(generator, list) and len(generator) != batch_size
    if generator_count_mismatch:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    initial_noise = randn_tensor(shape, generator=generator, dtype=dtype) if latents is None else latents

    # scale the initial noise by the standard deviation required by the scheduler
    return initial_noise * self.scheduler.init_noise_sigma
488
+
489
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    height: Optional[int] = 256,
    width: Optional[int] = 256,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_videos_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
    latents: Optional[paddle.Tensor] = None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
    callback_steps: Optional[int] = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    save_dir=None,
    save_name=None,
    num_frames: Optional[int] = 16,
    encoder_type="2d",
    scale_factor=0.18215,
    shift_factor=0,
    save_fps=8,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the video generation. If not defined, one has to pass
            `prompt_embeds` instead.
        height (`int`, *optional*, defaults to 256):
            The height in pixels of the generated video frame. Has to be divisible by 8.
        width (`int`, *optional*, defaults to 256):
            The width in pixels of the generated video frame. Has to be divisible by 8.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality result at the
            expense of slower inference.
        guidance_scale (`float`, *optional*, defaults to 7.5):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate videos that are closely linked to the text `prompt`,
            usually at the expense of lower quality.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the video generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if
            `guidance_scale` is not greater than `1`).
        num_videos_per_prompt (`int`, *optional*, defaults to 1):
            The number of videos to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
            One or a list of paddle generator(s) to make generation deterministic.
        latents (`paddle.Tensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated frames. With `"pil"` every frame is passed through
            `self.numpy_to_pil`; with any other value frames are returned as `float32` NumPy arrays.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`VideoPipelineOutput`] instead of a plain tuple.
        callback (`Callable`, *optional*):
            A function that will be called every `callback_steps` steps during inference. The function will be
            called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function will be called. If not specified, the callback will be
            called at every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
            `self.processor` in ppdiffusers.cross_attention.
        save_dir (`str`, *optional*):
            Directory the generated videos are saved to (created if missing). Defaults to the current path.
        save_name (`str`, *optional*):
            Name under which the generated videos are saved. Defaults to `"defaul_video"`.
        num_frames (`int`, *optional*, defaults to 16):
            Number of frames of the video. If None, will generate 16 frames per video.
        encoder_type (`str`, *optional*, defaults to `"2d"`):
            Forwarded to `self.decode_latents`; chosen from [`2d`, `3d`].
        scale_factor (`float`, *optional*, defaults to 0.18215):
            Scale factor forwarded to `self.decode_latents`.
        shift_factor (`float`, *optional*, defaults to 0):
            Shift factor forwarded to `self.decode_latents`.
        save_fps (`int`, *optional*, defaults to 8):
            The number of frames per second to save.

    Examples:

    Returns:
        [`VideoPipelineOutput`] or `tuple`: [`VideoPipelineOutput`] if `return_dict` is True, otherwise a
        `tuple`. When returning a tuple, the first element is the list with the generated frames of every
        video and the second element is the decoded video tensor.
    """
    # 0. Honour the documented default when `num_frames=None` is passed explicitly
    if num_frames is None:
        num_frames = 16

    # 1. Check inputs. Raise error if not correct (this covers the divisible-by-8 check on height/width)
    self.check_inputs(
        prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
    )

    # 2. Define call parameters
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    prompt_embeds = self._encode_prompt(
        prompt,
        num_videos_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
    )

    # 4. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps)
    timesteps = self.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = self.unet.in_channels
    latents = self.prepare_latents(
        batch_size * num_videos_per_prompt,
        num_channels_latents,
        num_frames,
        height,
        width,
        prompt_embeds.dtype,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet(
                latent_model_input,
                timesteps=t,
                context=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
            ).sample

            # perform guidance: the first half of the batch is unconditional, the second half text-conditioned
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    # 8. Decode the latents into videos
    extra_decode_kwargs = {
        "encoder_type": encoder_type,
        "scale_factor": scale_factor,
        "shift_factor": shift_factor,
    }
    sampled_videos = self.decode_latents(latents, decode_bs=1, return_cpu=False, **extra_decode_kwargs)
    # NOTE(review): the single-element concatenate is kept from the original code; it looks like a plain
    # copy of the array and could be dropped once the return type of `paddle_to_np` is confirmed.
    all_videos = np.concatenate([self.paddle_to_np(sampled_videos)], axis=0)

    # 9. Split every video into post-processed frames
    videos_frames = []
    for idx in range(sampled_videos.shape[0]):
        video = sampled_videos[idx]
        video_frames = []
        # frames are indexed along axis 1 of each video
        for fidx in range(video.shape[1]):
            frame = video[:, fidx]
            # map values from [-1, 1] to [0, 1] and move the leading axis last
            frame = (frame / 2 + 0.5).clip(0, 1)
            frame = frame.transpose([1, 2, 0]).astype("float32").numpy()
            if output_type == "pil":
                # NOTE(review): called with a single frame - confirm whether `numpy_to_pil` returns one
                # image or a one-element list here.
                frame = self.numpy_to_pil(frame)
            video_frames.append(frame)
        videos_frames.append(video_frames)

    # 10. Save the videos; this always happens, falling back to the current directory and a default name
    if not save_name:
        # the spelling of this default file name is kept unchanged so existing output paths stay stable
        save_name = "defaul_video"
    if not save_dir:
        save_dir = "."
    os.makedirs(save_dir, exist_ok=True)
    save_results(all_videos, save_dir=save_dir, save_name=save_name, save_fps=save_fps)

    # 11. Return in the form requested by `return_dict`
    if not return_dict:
        return (videos_frames, sampled_videos)
    return VideoPipelineOutput(frames=videos_frames, samples=sampled_videos)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_output.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import List, Union
17
+
18
+ import numpy as np
19
+ import paddle
20
+
21
+ from ...utils import BaseOutput
22
+
23
+
24
@dataclass
class VideoPipelineOutput(BaseOutput):
    """
    Output class for text to video pipelines.

    Args:
        frames (`List[np.ndarray]` or `paddle.Tensor`):
            List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as
            a `paddle` tensor. The NumPy arrays represent the denoised images of the diffusion pipeline. The length of
            the list denotes the video length, i.e., the number of frames.
        samples (`paddle.Tensor`):
            The video tensor the frames were derived from.
            NOTE(review): presumably the raw output of the pipeline's latent decoding step, before any per-frame
            post-processing - confirm against the pipeline that builds this object.
    """

    # per-frame outputs, see the class docstring
    frames: Union[List[np.ndarray], paddle.Tensor]
    # tensor form of the generated videos, see the class docstring
    samples: paddle.Tensor
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/musicldm/__init__.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from ...utils import (
18
+ PPDIFFUSERS_SLOW_IMPORT,
19
+ OptionalDependencyNotAvailable,
20
+ _LazyModule,
21
+ get_objects_from_module,
22
+ is_paddle_available,
23
+ is_paddlenlp_available,
24
+ is_paddlenlp_version,
25
+ )
26
+
27
# Registries filled below and consumed at the bottom of the module:
#   `_dummy_objects`    - placeholder objects installed on the module when the optional dependencies are missing
#   `_import_structure` - submodule name -> exported names, handed to `_LazyModule`
_dummy_objects = {}
_import_structure = {}

try:
    # the MusicLDM pipeline is only exported when paddle and paddlenlp (>= 2.6.0) are both available
    if not (is_paddlenlp_available() and is_paddle_available() and is_paddlenlp_version(">=", "2.6.0")):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_paddle_and_paddlenlp_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_paddle_and_paddlenlp_objects))
else:
    _import_structure["pipeline_musicldm"] = ["MusicLDMPipeline"]


if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
    # eager path: perform the real imports right away (type checkers, or slow import explicitly requested)
    try:
        if not (is_paddlenlp_available() and is_paddle_available() and is_paddlenlp_version(">=", "2.6.0")):
            raise OptionalDependencyNotAvailable()

    except OptionalDependencyNotAvailable:
        from ...utils.dummy_paddle_and_paddlenlp_objects import *
    else:
        from .pipeline_musicldm import MusicLDMPipeline

else:
    # lazy path: replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`
    import sys

    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    # attach the dummy placeholders (if any were collected above) to the lazy module
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/musicldm/pipeline_musicldm.py ADDED
@@ -0,0 +1,590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import paddle
20
+
21
+ from ppdiffusers.transformers import (
22
+ ClapFeatureExtractor,
23
+ ClapModel,
24
+ ClapTextModelWithProjection,
25
+ RobertaTokenizer,
26
+ SpeechT5HifiGan,
27
+ )
28
+
29
+ from ...models import AutoencoderKL, UNet2DConditionModel
30
+ from ...schedulers import KarrasDiffusionSchedulers
31
+ from ...utils import is_librosa_available, logging, replace_example_docstring
32
+ from ...utils.paddle_utils import randn_tensor
33
+ from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
34
+
35
+ if is_librosa_available():
36
+ import librosa
37
+
38
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
39
+
40
+ EXAMPLE_DOC_STRING = """
41
+ Examples:
42
+ ```py
43
+ >>> from ppdiffusers import MusicLDMPipeline
44
+ >>> import paddle
45
+ >>> import scipy
46
+
47
+ >>> repo_id = "ucsd-reach/musicldm"
48
+ >>> pipe = MusicLDMPipeline.from_pretrained(repo_id, paddle_dtype=paddle.float16)
49
+
50
+ >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
51
+ >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0]
52
+
53
+ >>> # save the audio sample as a .wav file
54
+ >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
55
+ ```
56
+ """
57
+
58
+
59
+ class MusicLDMPipeline(DiffusionPipeline):
60
+ r"""
61
+ Pipeline for text-to-audio generation using MusicLDM.
62
+
63
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
64
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
65
+
66
+ Args:
67
+ vae ([`AutoencoderKL`]):
68
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
69
+ text_encoder ([`~transformers.ClapModel`]):
70
+ Frozen text-audio embedding model (`ClapTextModel`), specifically the
71
+ [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant.
72
+ tokenizer ([`PreTrainedTokenizer`]):
73
+ A [`~transformers.RobertaTokenizer`] to tokenize text.
74
+ feature_extractor ([`~transformers.ClapFeatureExtractor`]):
75
+ Feature extractor to compute mel-spectrograms from audio waveforms.
76
+ unet ([`UNet2DConditionModel`]):
77
+ A `UNet2DConditionModel` to denoise the encoded audio latents.
78
+ scheduler ([`SchedulerMixin`]):
79
+ A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
80
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
81
+ vocoder ([`~transformers.SpeechT5HifiGan`]):
82
+ Vocoder of class `SpeechT5HifiGan`.
83
+ """
84
+
85
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: Union[ClapTextModelWithProjection, ClapModel],
    tokenizer: RobertaTokenizer,
    feature_extractor: Optional[ClapFeatureExtractor],
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    vocoder: SpeechT5HifiGan,
):
    """Register the pipeline components and derive the VAE scale factor.

    Every argument is registered under its own name through `register_modules`.
    `self.vae_scale_factor` is then set to `2 ** (len(vae.config.block_out_channels) - 1)`.
    """
    super().__init__()

    components = {
        "vae": vae,
        "text_encoder": text_encoder,
        "tokenizer": tokenizer,
        "feature_extractor": feature_extractor,
        "unet": unet,
        "scheduler": scheduler,
        "vocoder": vocoder,
    }
    self.register_modules(**components)

    # read back through `self.vae`, i.e. the module as registered above
    num_block_transitions = len(self.vae.config.block_out_channels) - 1
    self.vae_scale_factor = 2**num_block_transitions
107
+
108
def _encode_prompt(
    self,
    prompt,
    num_waveforms_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        num_waveforms_per_prompt (`int`):
            number of waveforms that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the audio generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`). A single string is repeated for every item of the batch.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.

    Returns:
        `paddle.Tensor`: The prompt embeddings repeated `num_waveforms_per_prompt` times. With classifier free
        guidance the negative embeddings are concatenated in front of them along the first axis.

    Raises:
        TypeError: If `prompt` is given and `negative_prompt` is of a different type.
        ValueError: If `negative_prompt` is a list whose length differs from the batch size.
    """
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pd",
        )
        text_input_ids = text_inputs.input_ids
        attention_mask = text_inputs.attention_mask
        # tokenize once more without truncation, only to report what was cut off
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids

        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
            text_input_ids, untruncated_ids
        ):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLAP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        prompt_embeds = self.text_encoder.get_text_features(
            text_input_ids,
            attention_mask=attention_mask,
        )

    prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.text_model.dtype)

    # the embeddings are unpacked as a 2-D tensor here
    (
        bs_embed,
        seq_len,
    ) = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt
    prompt_embeds = prompt_embeds.tile([1, num_waveforms_per_prompt])
    prompt_embeds = prompt_embeds.reshape([bs_embed * num_waveforms_per_prompt, seq_len])

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            # only comparable when a text prompt was given; with `prompt_embeds` there is no prompt type
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            # one negative prompt is shared by the whole batch (identical to before when batch_size == 1)
            uncond_tokens = [negative_prompt] * batch_size
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        # NOTE(review): the tokenizer length is taken from axis 1 of the (2-D) prompt embeddings, not from
        # `tokenizer.model_max_length`; kept as-is - confirm this is intended.
        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pd",
        )

        uncond_input_ids = uncond_input.input_ids
        attention_mask = uncond_input.attention_mask

        negative_prompt_embeds = self.text_encoder.get_text_features(
            uncond_input_ids,
            attention_mask=attention_mask,
        )

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.text_model.dtype)

        negative_prompt_embeds = negative_prompt_embeds.tile([1, num_waveforms_per_prompt])
        negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_waveforms_per_prompt, seq_len])

        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])

    return prompt_embeds
237
+
238
# Adapted from ppdiffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.mel_spectrogram_to_waveform
def mel_spectrogram_to_waveform(self, mel_spectrogram):
    """Run the vocoder on a mel spectrogram and return the waveform as float32 on CPU.

    Args:
        mel_spectrogram: Spectrogram tensor; when it is 4-dimensional, axis 1 is squeezed out
            before the vocoder is called.

    Returns:
        The vocoder output after `.cast("float32").cpu()`.
    """
    vocoder_input = mel_spectrogram.squeeze(1) if mel_spectrogram.dim() == 4 else mel_spectrogram

    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
    return self.vocoder(vocoder_input).cast("float32").cpu()
247
+
248
# Adapted from ppdiffusers.pipelines.audioldm2.pipeline_audioldm2.AudioLDM2Pipeline.score_waveforms
def score_waveforms(self, text, audio, num_waveforms_per_prompt, dtype):
    """Reorder generated waveforms by their text-audio similarity score.

    Without `librosa` the waveforms cannot be resampled for scoring, so `audio` is returned
    unchanged (an info message is logged).

    Args:
        text: Prompt text(s), tokenized with `self.tokenizer`.
        audio: Generated waveforms; converted with `.numpy()` for resampling and indexed
            along axis 0 for the final selection.
        num_waveforms_per_prompt: Number of top-scoring waveforms kept per text.
        dtype: Data type the extracted input features are cast to.

    Returns:
        `audio` itself when `librosa` is unavailable, otherwise the waveforms selected along
        axis 0 in descending score order.
    """
    if not is_librosa_available():
        logger.info(
            "Automatic scoring of the generated audio waveforms against the input prompt text requires the "
            "`librosa` package to resample the generated waveforms. Returning the audios in the order they were "
            "generated. To enable automatic scoring, install `librosa` with: `pip install librosa`."
        )
        return audio

    inputs = self.tokenizer(text, return_tensors="pd", padding=True)

    # resample from the vocoder's sampling rate to the one used by the feature extractor
    extractor_rate = self.feature_extractor.sampling_rate
    resampled_audio = librosa.resample(
        audio.numpy(), orig_sr=self.vocoder.config.sampling_rate, target_sr=extractor_rate
    )
    features = self.feature_extractor(
        list(resampled_audio), return_tensors="pd", sampling_rate=self.feature_extractor.sampling_rate
    )
    inputs["input_features"] = features.input_features.cast(dtype)

    # compute the audio-text similarity score using the CLAP model
    logits_per_text = self.text_encoder(**inputs).logits_per_text

    # sort by the highest matching generations per prompt
    ranking = paddle.argsort(logits_per_text, axis=1, descending=True)
    indices = ranking[:, :num_waveforms_per_prompt]
    flat_indices = indices.reshape(
        [
            -1,
        ]
    )
    return paddle.index_select(audio, axis=0, index=flat_indices)
279
+
280
# Adapted from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
    """Select the optional arguments that `self.scheduler.step` is able to receive.

    Not all schedulers share the same `step` signature: `eta` (η of the DDIM paper,
    https://arxiv.org/abs/2010.02502, expected in [0, 1]) and `generator` are therefore only
    returned when the current scheduler's `step` declares a parameter with that name.

    Args:
        generator: Random generator(s) to forward when supported.
        eta: DDIM η value to forward when supported.

    Returns:
        dict: Subset of `{"eta": eta, "generator": generator}` accepted by the scheduler.
    """
    accepted_names = inspect.signature(self.scheduler.step).parameters
    optional_kwargs = {"eta": eta, "generator": generator}
    return {name: value for name, value in optional_kwargs.items() if name in accepted_names}
297
+
298
# Adapted from ppdiffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.check_inputs
def check_inputs(
    self,
    prompt,
    audio_length_in_s,
    vocoder_upsample_factor,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
):
    """Validate the arguments of the audio generation call.

    Args:
        prompt: Text prompt (`str` or `list`), or `None` when `prompt_embeds` is given.
        audio_length_in_s: Requested audio length; must be at least
            `vocoder_upsample_factor * self.vae_scale_factor`.
        vocoder_upsample_factor: Factor used to compute the minimum audio length.
        callback_steps: Callback frequency; must be a strictly positive `int`.
        negative_prompt: Optional negative prompt; exclusive with `negative_prompt_embeds`.
        prompt_embeds: Optional pre-computed prompt embeddings; exclusive with `prompt`.
        negative_prompt_embeds: Optional pre-computed negative embeddings; when given together
            with `prompt_embeds`, both must report the same `.shape`.

    Raises:
        ValueError: If the audio length is too short, if `self.vocoder.config.model_in_dim` is
            not divisible by `self.vae_scale_factor`, or if any argument constraint is violated.
    """
    vae_scale = self.vae_scale_factor

    min_audio_length_in_s = vocoder_upsample_factor * vae_scale
    if audio_length_in_s < min_audio_length_in_s:
        raise ValueError(
            f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but "
            f"is {audio_length_in_s}."
        )

    mel_bins = self.vocoder.config.model_in_dim
    if mel_bins % vae_scale != 0:
        raise ValueError(
            f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the "
            f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of "
            f"{self.vae_scale_factor}."
        )

    # `None`, non-integers and non-positive values all fail the same way
    if not (isinstance(callback_steps, int) and callback_steps > 0):
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    prompt_given = prompt is not None
    embeds_given = prompt_embeds is not None
    if prompt_given and embeds_given:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not (prompt_given or embeds_given):
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if prompt_given and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    negative_embeds_given = negative_prompt_embeds is not None
    if negative_prompt is not None and negative_embeds_given:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if embeds_given and negative_embeds_given:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )
356
+
357
+ # Copied from ppdiffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.prepare_latents
358
def prepare_latents(self, batch_size, num_channels_latents, height, dtype, generator, latents=None):
    """Create (or adopt) the initial latents and scale them for the scheduler.

    Args:
        batch_size: Effective batch size of the latents.
        num_channels_latents: Number of latent channels.
        height: Spectrogram height before VAE downsampling.
        dtype: Dtype of the returned latents.
        generator: A single generator or a list with one generator per batch element.
        latents: Optional pre-generated latents; cast to `dtype` instead of sampling new noise.

    Returns:
        The latents multiplied by `self.scheduler.init_noise_sigma`.

    Raises:
        ValueError: If `generator` is a list whose length differs from `batch_size`.
    """
    shape = (
        batch_size,
        num_channels_latents,
        height // self.vae_scale_factor,
        self.vocoder.config.model_in_dim // self.vae_scale_factor,
    )
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is None:
        latents = randn_tensor(shape, generator=generator, dtype=dtype)
    else:
        latents = latents.cast(dtype)

    # scale the initial noise by the standard deviation required by the scheduler
    return latents * self.scheduler.init_noise_sigma
379
+
380
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    audio_length_in_s: Optional[float] = None,
    num_inference_steps: int = 200,
    guidance_scale: float = 2.0,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_waveforms_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
    latents: Optional[paddle.Tensor] = None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
    callback_steps: Optional[int] = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    output_type: Optional[str] = "np",
):
    r"""
    The call function to the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
        audio_length_in_s (`float`, *optional*):
            The length of the generated audio sample in seconds. If `None`, it is derived as
            `unet.config.sample_size * vae_scale_factor * vocoder_upsample_factor`.
        num_inference_steps (`int`, *optional*, defaults to 200):
            The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
            expense of slower inference.
        guidance_scale (`float`, *optional*, defaults to 2.0):
            A higher guidance scale value encourages the model to generate audio that is closely linked to the text
            `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
            pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale <= 1`).
        num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
            The number of waveforms to generate per prompt. If `num_waveforms_per_prompt > 1` and `prompt` is
            given, the generated waveforms are passed through `score_waveforms` together with the input text
            (automatic scoring).
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
            to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
        generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
            A [`paddle.Generator`] to make generation deterministic.
        latents (`paddle.Tensor`, *optional*):
            Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for audio
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor is generated by sampling using the supplied random `generator`.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
            provided, text embeddings are generated from the `prompt` input argument.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
            not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
        callback (`Callable`, *optional*):
            A function that calls every `callback_steps` steps during inference. The function is called with the
            following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function is called. If not specified, the callback is called at
            every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
            [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
        output_type (`str`, *optional*, defaults to `"np"`):
            The output format of the generated audio. Choose between `"np"` to return a NumPy `np.ndarray` or
            `"pd"` to return a Paddle `paddle.Tensor` object. Set to `"latent"` to return the latent diffusion
            model (LDM) output.

    Examples:

    Returns:
        [`~pipelines.AudioPipelineOutput`] or `tuple`:
            If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
            returned where the first element is a list with the generated audio.
    """
    # 0. Convert audio input length from seconds to spectrogram height
    vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate

    if audio_length_in_s is None:
        audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor

    height = int(audio_length_in_s / vocoder_upsample_factor)

    # Remember the requested length so the (possibly padded) waveform can be trimmed at the end.
    original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate)
    if height % self.vae_scale_factor != 0:
        # Round the height up to the next multiple of the VAE scale factor.
        height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor
        logger.info(
            f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} "
            f"so that it can be handled by the model. It will be cut to {audio_length_in_s} after the "
            f"denoising process."
        )

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        audio_length_in_s,
        vocoder_upsample_factor,
        callback_steps,
        negative_prompt,
        prompt_embeds,
        negative_prompt_embeds,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    prompt_embeds = self._encode_prompt(
        prompt,
        num_waveforms_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
    )

    # 4. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps)
    timesteps = self.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    latents = self.prepare_latents(
        batch_size * num_waveforms_per_prompt,
        num_channels_latents,
        height,
        prompt_embeds.dtype,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual; the prompt embeddings are fed through `class_labels`,
            # not through cross-attention (`encoder_hidden_states` is None)
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=None,
                class_labels=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    step_idx = i // getattr(self.scheduler, "order", 1)
                    callback(step_idx, t, latents)

    # 8. Post-processing
    if output_type == "latent":
        # NOTE(review): this early return does not consult `return_dict`; kept as-is for compatibility.
        return AudioPipelineOutput(audios=latents)

    latents = 1 / self.vae.config.scaling_factor * latents
    mel_spectrogram = self.vae.decode(latents).sample

    audio = self.mel_spectrogram_to_waveform(mel_spectrogram)

    # Trim any padding introduced by rounding the spectrogram height up.
    audio = audio[:, :original_waveform_length]

    # 9. Automatic scoring
    if num_waveforms_per_prompt > 1 and prompt is not None:
        audio = self.score_waveforms(
            text=prompt,
            audio=audio,
            num_waveforms_per_prompt=num_waveforms_per_prompt,
            dtype=prompt_embeds.dtype,
        )

    if output_type == "np":
        audio = audio.numpy()

    if not return_dict:
        return (audio,)

    return AudioPipelineOutput(audios=audio)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/pndm/__init__.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from ...utils import PPDIFFUSERS_SLOW_IMPORT, _LazyModule
18
+
19
# Maps each submodule to the public names it provides; consumed by `_LazyModule` below.
_import_structure = {"pipeline_pndm": ["PNDMPipeline"]}

if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
    # Eager import for static type checkers or when slow imports are explicitly requested.
    from .pipeline_pndm import PNDMPipeline
else:
    import sys

    # Replace this package in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/repaint/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from ...utils import PPDIFFUSERS_SLOW_IMPORT, _LazyModule
18
+
19
# Maps each submodule to the public names it provides; consumed by `_LazyModule` below.
_import_structure = {"pipeline_repaint": ["RePaintPipeline"]}

if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
    # Eager import for static type checkers or when slow imports are explicitly requested.
    from .pipeline_repaint import RePaintPipeline

else:
    import sys

    # Replace this package in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import paddle
20
+ import PIL.Image
21
+
22
+ from ...models import UNet2DModel
23
+ from ...schedulers import RePaintScheduler
24
+ from ...utils import PIL_INTERPOLATION, deprecate, logging
25
+ from ...utils.paddle_utils import randn_tensor
26
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
27
+
28
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
29
+
30
+
31
+ # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
32
def _preprocess_image(image: Union[List, PIL.Image.Image, paddle.Tensor]):
    """Convert an image (or list of images) into a batched `paddle.Tensor`.

    Emits a deprecation notice via `deprecate` on every call.

    Args:
        image: A `paddle.Tensor` (returned unchanged), a single `PIL.Image.Image`, or a list of
            PIL images / tensors.

    Returns:
        For PIL input: a float32 tensor with values scaled from [0, 255] to [-1, 1], axes reordered
        with `transpose(0, 3, 1, 2)` (assumes the images carry a channel axis, e.g. RGB — TODO confirm
        for grayscale input). For a list of tensors: their concatenation along axis 0. A list whose
        first element is neither type is returned as-is.
    """
    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
    if isinstance(image, paddle.Tensor):
        return image
    elif isinstance(image, PIL.Image.Image):
        image = [image]

    if isinstance(image[0], PIL.Image.Image):
        # All images are resized to the first image's size, rounded down to a multiple of 8.
        w, h = image[0].size
        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8

        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
        image = np.concatenate(image, axis=0)
        image = image.astype(np.float32) / 255.0
        image = image.transpose(0, 3, 1, 2)
        image = 2.0 * image - 1.0
        image = paddle.to_tensor(image)
    elif isinstance(image[0], paddle.Tensor):
        image = paddle.concat(image, axis=0)
    return image
53
+
54
+
55
def _preprocess_mask(mask: Union[List, PIL.Image.Image, paddle.Tensor]):
    """Convert a mask (or list of masks) into a batched, binarized `paddle.Tensor`.

    Args:
        mask: A `paddle.Tensor` (returned unchanged), a single `PIL.Image.Image`, or a list of
            PIL images / tensors.

    Returns:
        For PIL input: a float32 tensor of the grayscale ("L") masks, stacked along axis 0 and
        thresholded so every value is exactly 0 or 1. For a list of tensors: their concatenation
        along axis 0. A list whose first element is neither type is returned as-is.
    """
    if isinstance(mask, paddle.Tensor):
        return mask
    elif isinstance(mask, PIL.Image.Image):
        mask = [mask]

    if isinstance(mask[0], PIL.Image.Image):
        # All masks are resized to the first mask's size, rounded down to a multiple of 32.
        w, h = mask[0].size
        w, h = (x - x % 32 for x in (w, h))  # resize to integer multiple of 32
        mask = [np.array(m.convert("L").resize((w, h), resample=PIL_INTERPOLATION["nearest"]))[None, :] for m in mask]
        mask = np.concatenate(mask, axis=0)
        mask = mask.astype(np.float32) / 255.0
        # Binarize at 0.5 after scaling to [0, 1].
        mask[mask < 0.5] = 0
        mask[mask >= 0.5] = 1
        mask = paddle.to_tensor(mask)
    elif isinstance(mask[0], paddle.Tensor):
        mask = paddle.concat(mask, axis=0)
    return mask
73
+
74
+
75
class RePaintPipeline(DiffusionPipeline):
    r"""
    Pipeline for image inpainting using RePaint.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        unet ([`UNet2DModel`]):
            A `UNet2DModel` to denoise the encoded image latents.
        scheduler ([`RePaintScheduler`]):
            A `RePaintScheduler` to be used in combination with `unet` to denoise the encoded image.
    """

    unet: UNet2DModel
    scheduler: RePaintScheduler
    model_cpu_offload_seq = "unet"

    def __init__(self, unet, scheduler):
        super().__init__()
        self.register_modules(unet=unet, scheduler=scheduler)

    @paddle.no_grad()
    def __call__(
        self,
        image: Union[paddle.Tensor, PIL.Image.Image],
        mask_image: Union[paddle.Tensor, PIL.Image.Image],
        num_inference_steps: int = 250,
        eta: float = 0.0,
        jump_length: int = 10,
        jump_n_sample: int = 10,
        generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
        The call function to the pipeline for generation.

        Args:
            image (`paddle.Tensor` or `PIL.Image.Image`):
                The original image to inpaint on.
            mask_image (`paddle.Tensor` or `PIL.Image.Image`):
                The mask_image where 0.0 define which part of the original image to inpaint.
            num_inference_steps (`int`, *optional*, defaults to 250):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            eta (`float`):
                The weight of the added noise in a diffusion step. Its value is between 0.0 and 1.0; 0.0 corresponds to
                DDIM and 1.0 is the DDPM scheduler.
            jump_length (`int`, *optional*, defaults to 10):
                The number of steps taken forward in time before going backward in time for a single jump ("j" in
                RePaint paper). Take a look at Figure 9 and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf).
            jump_n_sample (`int`, *optional*, defaults to 10):
                The number of times to make a forward time jump for a given chosen time sample. Take a look at Figure 9
                and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf).
            generator (`paddle.Generator`, *optional*):
                A [`paddle.Generator`] to make generation deterministic.
            output_type (`str`, `optional`, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.

        Example:

        ```py
        >>> from io import BytesIO
        >>> import paddle
        >>> import PIL
        >>> import requests
        >>> from ppdiffusers import RePaintPipeline, RePaintScheduler


        >>> def download_image(url):
        ...     response = requests.get(url)
        ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")


        >>> img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png"
        >>> mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png"

        >>> # Load the original image and the mask as PIL images
        >>> original_image = download_image(img_url).resize((256, 256))
        >>> mask_image = download_image(mask_url).resize((256, 256))

        >>> # Load the RePaint scheduler and pipeline based on a pretrained DDPM model
        >>> scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256")
        >>> pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)

        >>> generator = paddle.Generator().manual_seed(0)
        >>> output = pipe(
        ...     image=original_image,
        ...     mask_image=mask_image,
        ...     num_inference_steps=250,
        ...     eta=0.0,
        ...     jump_length=10,
        ...     jump_n_sample=10,
        ...     generator=generator,
        ... )
        >>> inpainted_image = output.images[0]
        ```

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        """

        original_image = image

        # Bring both inputs to tensors of the UNet's dtype.
        original_image = _preprocess_image(original_image)
        original_image = original_image.cast(dtype=self.unet.dtype)
        mask_image = _preprocess_mask(mask_image)
        mask_image = mask_image.cast(dtype=self.unet.dtype)

        batch_size = original_image.shape[0]

        # sample gaussian noise to begin the loop
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        image_shape = original_image.shape
        image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype)

        # set step values
        self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample)
        self.scheduler.eta = eta

        # Start above the first timestep so the first iteration takes the denoising branch.
        t_last = self.scheduler.timesteps[0] + 1
        # The scheduler calls below receive a single generator, so only the first one of a list is used.
        generator = generator[0] if isinstance(generator, list) else generator
        for t in self.progress_bar(self.scheduler.timesteps):
            if t < t_last:
                # predict the noise residual
                model_output = self.unet(image, t).sample
                # compute previous image: x_t -> x_t-1
                image = self.scheduler.step(model_output, t, image, original_image, mask_image, generator).prev_sample

            else:
                # compute the reverse: x_t-1 -> x_t
                image = self.scheduler.undo_step(image, t_last, generator)
            t_last = t

        # Map from [-1, 1] back to [0, 1] and move the channel axis last for NumPy / PIL output.
        image = (image / 2 + 0.5).clip(0, 1)
        image = image.transpose([0, 2, 3, 1]).cast("float32").cpu().numpy()
        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_3/__init__.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING
2
+
3
+ from ...utils import (
4
+ PPDIFFUSERS_SLOW_IMPORT,
5
+ OptionalDependencyNotAvailable,
6
+ _LazyModule,
7
+ get_objects_from_module,
8
+ is_paddle_available,
9
+ is_paddlenlp_available,
10
+ )
11
+
12
+
13
# Placeholder objects exported when the optional dependencies are missing.
_dummy_objects = {}
# Extra names to attach to the lazy module (empty here).
_additional_imports = {}
# Maps each submodule to the public names it provides; consumed by `_LazyModule` below.
_import_structure = {"pipeline_output": ["StableDiffusion3PipelineOutput"]}

try:
    if not (is_paddlenlp_available() and is_paddle_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_paddle_and_paddlenlp_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_paddle_and_paddlenlp_objects))
else:
    # The pipelines are only registered when both paddle and paddlenlp are available.
    _import_structure["pipeline_stable_diffusion_3"] = ["StableDiffusion3Pipeline"]
    _import_structure["pipeline_stable_diffusion_3_img2img"] = ["StableDiffusion3Img2ImgPipeline"]

if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
    # Eager import for static type checkers or when slow imports are explicitly requested.
    try:
        if not (is_paddlenlp_available() and is_paddle_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_paddle_and_paddlenlp_objects import *  # noqa F403
    else:
        from .pipeline_stable_diffusion_3 import StableDiffusion3Pipeline
        from .pipeline_stable_diffusion_3_img2img import StableDiffusion3Img2ImgPipeline

else:
    import sys

    # Replace this package in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    # Attach the dummy objects and any additional imports directly to the lazy module.
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
    for name, value in _additional_imports.items():
        setattr(sys.modules[__name__], name, value)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py ADDED
@@ -0,0 +1,899 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import paddle
19
+ import paddle.distributed as dist
20
+ import paddle.distributed.fleet as fleet
21
+
22
+ from ppdiffusers.transformers import ( # T5TokenizerFast,
23
+ CLIPTextModelWithProjection,
24
+ CLIPTokenizer,
25
+ T5EncoderModel,
26
+ T5Tokenizer,
27
+ )
28
+
29
+ from ...image_processor import VaeImageProcessor
30
+ from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
31
+ from ...models.autoencoder_kl import AutoencoderKL
32
+ from ...models.transformer_sd3 import SD3Transformer2DModel
33
+ from ...schedulers import FlowMatchEulerDiscreteScheduler
34
+ from ...utils import logging, replace_example_docstring
35
+ from ...utils.paddle_utils import randn_tensor
36
+ from ..pipeline_utils import DiffusionPipeline
37
+ from .pipeline_output import StableDiffusion3PipelineOutput
38
+
39
try:
    # paddle.incubate.jit.inference is available in paddle develop but not in paddle 3.0beta, so we add a try except.
    from paddle.incubate.jit import is_inference_mode
except ImportError:
    # Only a failed import should trigger the fallback; a bare `except` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated errors.

    def is_inference_mode(func):
        """Fallback used when paddle does not provide `is_inference_mode`: always report False."""
        return False
46
+
47
+
48
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
49
+
50
+ EXAMPLE_DOC_STRING = """
51
+ Examples:
52
+ ```py
53
+ >>> import paddle
54
+ >>> from ppdiffusers import StableDiffusion3Pipeline
55
+
56
+ >>> pipe = StableDiffusion3Pipeline.from_pretrained(
57
+ ... "stabilityai/stable-diffusion-3-medium-diffusers", paddle_dtype=paddle.float16
58
+ ... )
59
+ >>> prompt = "A cat holding a sign that says hello world"
60
+ >>> image = pipe(prompt).images[0]
61
+ >>> image.save("sd3.png")
62
+ ```
63
+ """
64
+
65
+
66
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
67
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple`: A tuple where the first element is `scheduler.timesteps` as set by the scheduler and the second
        element is the number of inference steps (the length of the schedule when custom `timesteps` or `sigmas`
        are given, otherwise `num_inference_steps` unchanged).

    Raises:
        ValueError: If both `timesteps` and `sigmas` are passed, or if the scheduler's `set_timesteps` does not
            accept the custom argument that was supplied.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    if timesteps is None and sigmas is None:
        # Default path: let the scheduler build its own schedule.
        scheduler.set_timesteps(num_inference_steps, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom schedules are only forwarded when `set_timesteps` declares the matching parameter.
    accepted_params = inspect.signature(scheduler.set_timesteps).parameters
    if timesteps is not None:
        if "timesteps" not in accepted_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, **kwargs)
    else:
        if "sigmas" not in accepted_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, **kwargs)

    timesteps = scheduler.timesteps
    return timesteps, len(timesteps)
121
+
122
+
123
+ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
124
+
125
+ r"""
126
+ Args:
127
+ transformer ([`SD3Transformer2DModel`]):
128
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
129
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
130
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
131
+ vae ([`AutoencoderKL`]):
132
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
133
+ text_encoder ([`CLIPTextModelWithProjection`]):
134
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
135
+ specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
136
+ with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
137
+ as its dimension.
138
+ text_encoder_2 ([`CLIPTextModelWithProjection`]):
139
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
140
+ specifically the
141
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
142
+ variant.
143
+ text_encoder_3 ([`T5EncoderModel`]):
144
+ Frozen text-encoder. Stable Diffusion 3 uses
145
+ [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
146
+ [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
147
+ tokenizer (`CLIPTokenizer`):
148
+ Tokenizer of class
149
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
150
+ tokenizer_2 (`CLIPTokenizer`):
151
+ Second Tokenizer of class
152
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
153
+ tokenizer_3 (`T5Tokenizer`):
154
+ Tokenizer of class
155
+ [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
156
+ """
157
+
158
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
159
+ _optional_components = []
160
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
161
+
162
def __init__(
    self,
    transformer: SD3Transformer2DModel,
    scheduler: FlowMatchEulerDiscreteScheduler,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModelWithProjection,
    tokenizer: CLIPTokenizer,
    text_encoder_2: CLIPTextModelWithProjection,
    tokenizer_2: CLIPTokenizer,
    text_encoder_3: T5EncoderModel,
    tokenizer_3: T5Tokenizer,
):
    """Register the pipeline components and derive the size-related defaults.

    After registration the following attributes are set, each falling back to a
    constant when the corresponding component is missing or `None`:
    `vae_scale_factor` (fallback 8), `tokenizer_max_length` (fallback 77) and
    `default_sample_size` (fallback 128). `image_processor` is built from
    `vae_scale_factor`.
    """
    super().__init__()

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        text_encoder_2=text_encoder_2,
        text_encoder_3=text_encoder_3,
        tokenizer=tokenizer,
        tokenizer_2=tokenizer_2,
        tokenizer_3=tokenizer_3,
        transformer=transformer,
        scheduler=scheduler,
    )

    # Spatial down-scaling of the VAE: one halving per block after the first.
    registered_vae = getattr(self, "vae", None)
    if registered_vae is not None:
        self.vae_scale_factor = 2 ** (len(registered_vae.config.block_out_channels) - 1)
    else:
        self.vae_scale_factor = 8
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    # Token budget taken from the first tokenizer.
    registered_tokenizer = getattr(self, "tokenizer", None)
    if registered_tokenizer is not None:
        self.tokenizer_max_length = registered_tokenizer.model_max_length
    else:
        self.tokenizer_max_length = 77

    # Latent sample size used when the caller does not pass height/width.
    registered_transformer = getattr(self, "transformer", None)
    if registered_transformer is not None:
        self.default_sample_size = registered_transformer.config.sample_size
    else:
        self.default_sample_size = 128
199
+
200
def _get_t5_prompt_embeds(
    self,
    prompt: Union[str, List[str]] = None,
    max_sequence_length: int = 256,
    num_images_per_prompt: int = 1,
    dtype: Optional[paddle.dtype] = None,
):
    """Encode `prompt` with the third (T5) text encoder.

    Args:
        prompt: A single prompt or a list of prompts.
        max_sequence_length: Length the prompt is padded/truncated to by `tokenizer_3`.
        num_images_per_prompt: Number of copies of each prompt's embedding to return.
        dtype: dtype of the zero placeholder returned when `text_encoder_3` is `None`;
            defaults to `self.text_encoder.dtype`. Real embeddings are always cast to
            `self.text_encoder_3.dtype`, regardless of this argument.

    Returns:
        A tensor whose first dimension is `batch_size * num_images_per_prompt`. When
        `text_encoder_3` is `None` it is an all-zero tensor of shape
        `(batch_size * num_images_per_prompt, tokenizer_max_length, joint_attention_dim)`.
    """
    dtype = dtype or self.text_encoder.dtype

    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt)

    if self.text_encoder_3 is None:
        # The placeholder needs one row per generated image (not per prompt); otherwise it
        # cannot be concatenated with the CLIP embeddings, which are already repeated
        # `num_images_per_prompt` times.
        return paddle.zeros(
            (
                batch_size * num_images_per_prompt,
                self.tokenizer_max_length,
                self.transformer.config.joint_attention_dim,
            ),
            dtype=dtype,
        )

    text_inputs = self.tokenizer_3(
        prompt,
        padding="max_length",
        max_length=max_sequence_length,
        truncation=True,
        add_special_tokens=True,
        return_tensors="pd",
    )
    text_input_ids = text_inputs.input_ids
    untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pd").input_ids

    # `paddle.equal` is element-wise (and fails on differing shapes); `paddle.equal_all`
    # yields the single truth value this check needs.
    if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not bool(
        paddle.equal_all(text_input_ids, untruncated_ids)
    ):
        # Report what was cut off by the T5 limit (`max_sequence_length`), not by the CLIP limit.
        removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
        logger.warning(
            "The following part of your input was truncated because 'max_sequence_length' is set to"
            f" {max_sequence_length} tokens: {removed_text}"
        )

    prompt_embeds = self.text_encoder_3(text_input_ids)[0]

    dtype = self.text_encoder_3.dtype
    prompt_embeds = prompt_embeds.astype(dtype=dtype)

    _, seq_len, _ = prompt_embeds.shape

    # Duplicate the text embeddings for each generation per prompt: tile along the sequence
    # axis, then fold the copies into the batch axis so copies of one prompt stay adjacent.
    prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
    prompt_embeds = prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])

    return prompt_embeds
246
+
247
def _get_clip_prompt_embeds(
    self,
    prompt: Union[str, List[str]],
    num_images_per_prompt: int = 1,
    clip_skip: Optional[int] = None,
    clip_model_index: int = 0,
):
    """Encode `prompt` with one of the two CLIP text encoders.

    Args:
        prompt: A single prompt or a list of prompts.
        num_images_per_prompt: Number of copies of each prompt's embeddings to return.
        clip_skip: When `None`, hidden state `-2` is used; otherwise hidden state
            `-(clip_skip + 2)`.
        clip_model_index: `0` selects `tokenizer`/`text_encoder`, `1` selects
            `tokenizer_2`/`text_encoder_2`.

    Returns:
        `(prompt_embeds, pooled_prompt_embeds)`, both with first dimension
        `batch_size * num_images_per_prompt` and with the copies of each prompt adjacent.
    """
    clip_tokenizers = [self.tokenizer, self.tokenizer_2]
    clip_text_encoders = [self.text_encoder, self.text_encoder_2]

    tokenizer = clip_tokenizers[clip_model_index]
    text_encoder = clip_text_encoders[clip_model_index]

    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt)

    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=self.tokenizer_max_length,
        truncation=True,
        return_tensors="pd",
    )

    text_input_ids = text_inputs.input_ids
    untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
    # `paddle.equal` is element-wise (and fails on differing shapes); `paddle.equal_all`
    # yields the single truth value this check needs.
    if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not bool(
        paddle.equal_all(text_input_ids, untruncated_ids)
    ):
        removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
        logger.warning(
            "The following part of your input was truncated because CLIP can only handle sequences up to"
            f" {self.tokenizer_max_length} tokens: {removed_text}"
        )
    prompt_embeds = text_encoder(text_input_ids, output_hidden_states=True)
    pooled_prompt_embeds = prompt_embeds[0]

    if clip_skip is None:
        prompt_embeds = prompt_embeds.hidden_states[-2]
    else:
        prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]

    # NOTE(review): the first encoder's dtype is applied even when `clip_model_index` is 1.
    prompt_embeds = prompt_embeds.astype(dtype=self.text_encoder.dtype)

    _, seq_len, _ = prompt_embeds.shape
    # Duplicate the text embeddings for each generation per prompt: tile along the sequence
    # axis, then fold the copies into the batch axis (row order p0, p0, ..., p1, p1, ...).
    prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
    prompt_embeds = prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])

    # Assumes the pooled output is 2-D (batch, dim) - TODO confirm. Tiling along the feature
    # axis and folding into the batch axis gives the same p0, p0, ..., p1, p1, ... row order
    # as `prompt_embeds`. A 3-element repeat would prepend an axis and produce
    # p0, p1, ..., p0, p1, ..., misaligning pooled and sequence embeddings whenever both
    # `batch_size` and `num_images_per_prompt` are greater than 1.
    pooled_prompt_embeds = pooled_prompt_embeds.tile([1, num_images_per_prompt])
    pooled_prompt_embeds = pooled_prompt_embeds.reshape([batch_size * num_images_per_prompt, -1])

    return prompt_embeds, pooled_prompt_embeds
299
+
300
def encode_prompt(
    self,
    prompt: Union[str, List[str]],
    prompt_2: Union[str, List[str]],
    prompt_3: Union[str, List[str]],
    num_images_per_prompt: int = 1,
    do_classifier_free_guidance: bool = True,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    negative_prompt_2: Optional[Union[str, List[str]]] = None,
    negative_prompt_3: Optional[Union[str, List[str]]] = None,
    prompt_embeds: Optional[paddle.Tensor] = None,
    negative_prompt_embeds: Optional[paddle.Tensor] = None,
    pooled_prompt_embeds: Optional[paddle.Tensor] = None,
    negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None,
    clip_skip: Optional[int] = None,
    max_sequence_length: int = 256,
):
    r"""
    Build the positive and (optionally) negative text embeddings from the two CLIP encoders and the T5 encoder.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
            used instead.
        prompt_3 (`str` or `List[str]`, *optional*):
            The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
            used instead.
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. Only used when `do_classifier_free_guidance`
            is `True` and `negative_prompt_embeds` is `None`; defaults to an empty string in that case.
        negative_prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
            `text_encoder_2`. If not defined, `negative_prompt` is used instead.
        negative_prompt_3 (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
            `text_encoder_3`. If not defined, `negative_prompt` is used instead.
        prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from the `prompt` input arguments.
        negative_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided (and guidance is enabled), they are generated from the `negative_prompt`
            input arguments.
        pooled_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated pooled text embeddings. Returned unchanged when `prompt_embeds` is provided; otherwise
            they are generated from the `prompt` input arguments.
        negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
            Pre-generated negative pooled text embeddings. Returned unchanged when `negative_prompt_embeds` is
            provided or guidance is disabled; otherwise they are generated from the `negative_prompt` input
            arguments.
        clip_skip (`int`, *optional*):
            Forwarded to `_get_clip_prompt_embeds` for the positive prompts only; the negative prompts are always
            encoded with `clip_skip=None`.
        max_sequence_length (`int`, defaults to 256):
            Forwarded to `_get_t5_prompt_embeds` for both the positive and the negative prompt.

    Returns:
        `tuple`: `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds)`.
        The two negative entries are whatever was passed in (possibly `None`) when guidance is disabled.
    """
    prompt = [prompt] if isinstance(prompt, str) else prompt
    if prompt is not None:
        batch_size = len(prompt)
    else:
        # No text prompt: the batch size comes from the pre-computed embeddings.
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        # Encoders 2 and 3 fall back to the main prompt when no dedicated prompt is given.
        prompt_2 = prompt_2 or prompt
        prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

        prompt_3 = prompt_3 or prompt
        prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3

        prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds(
            prompt=prompt,
            num_images_per_prompt=num_images_per_prompt,
            clip_skip=clip_skip,
            clip_model_index=0,
        )
        prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds(
            prompt=prompt_2,
            num_images_per_prompt=num_images_per_prompt,
            clip_skip=clip_skip,
            clip_model_index=1,
        )
        # Join the two CLIP sequence embeddings along the last axis.
        clip_prompt_embeds = paddle.concat([prompt_embed, prompt_2_embed], axis=-1)

        t5_prompt_embed = self._get_t5_prompt_embeds(
            prompt=prompt_3,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
        )

        # Pad the CLIP embeddings by the difference between the T5 and CLIP last-axis sizes so
        # the two can be concatenated along axis -2 below (presumably padding the last axis).
        clip_prompt_embeds = paddle.nn.functional.pad(
            clip_prompt_embeds,
            (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1]),
            data_format="NCL",
        )

        prompt_embeds = paddle.concat([clip_prompt_embeds, t5_prompt_embed], axis=-2)
        pooled_prompt_embeds = paddle.concat([pooled_prompt_embed, pooled_prompt_2_embed], axis=-1)
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        negative_prompt = negative_prompt or ""
        negative_prompt_2 = negative_prompt_2 or negative_prompt
        negative_prompt_3 = negative_prompt_3 or negative_prompt

        # normalize str to list
        negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
        negative_prompt_2 = (
            batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
        )
        negative_prompt_3 = (
            batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3
        )

        if prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )

        # Same pipeline as for the positive prompt, but without `clip_skip`.
        negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds(
            negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            clip_skip=None,
            clip_model_index=0,
        )
        negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds(
            negative_prompt_2,
            num_images_per_prompt=num_images_per_prompt,
            clip_skip=None,
            clip_model_index=1,
        )
        negative_clip_prompt_embeds = paddle.concat([negative_prompt_embed, negative_prompt_2_embed], axis=-1)

        t5_negative_prompt_embed = self._get_t5_prompt_embeds(
            prompt=negative_prompt_3,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
        )

        negative_clip_prompt_embeds = paddle.nn.functional.pad(
            negative_clip_prompt_embeds,
            (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]),
            data_format="NCL",
        )

        negative_prompt_embeds = paddle.concat([negative_clip_prompt_embeds, t5_negative_prompt_embed], axis=-2)
        negative_pooled_prompt_embeds = paddle.concat(
            [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], axis=-1
        )

    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
459
+
460
def check_inputs(
    self,
    prompt,
    prompt_2,
    prompt_3,
    height,
    width,
    negative_prompt=None,
    negative_prompt_2=None,
    negative_prompt_3=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    pooled_prompt_embeds=None,
    negative_pooled_prompt_embeds=None,
    callback_on_step_end_tensor_inputs=None,
    max_sequence_length=None,
):
    """Validate the arguments of a pipeline call; returns `None` when everything is consistent.

    Raises:
        ValueError: If `height`/`width` are not divisible by 8; if a requested callback tensor
            is not listed in `self._callback_tensor_inputs`; if a text prompt and the matching
            pre-computed embeddings are both given (or both missing for the main prompt); if a
            prompt is neither `str` nor `list`; if `prompt_embeds` and `negative_prompt_embeds`
            differ in shape; if embeddings are given without their pooled counterpart; or if
            `max_sequence_length` exceeds 512.
    """
    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    if callback_on_step_end_tensor_inputs is not None and not all(
        k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
    ):
        raise ValueError(
            f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
        )

    # Text prompts and pre-computed embeddings are mutually exclusive.
    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt_2 is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt_3 is not None and prompt_embeds is not None:
        # Report the offending `prompt_3` value (the message previously showed `prompt_2`).
        raise ValueError(
            f"Cannot forward both `prompt_3`: {prompt_3} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
    elif prompt_2 is not None and not isinstance(prompt_2, (str, list)):
        raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
    elif prompt_3 is not None and not isinstance(prompt_3, (str, list)):
        raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )
    elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )
    elif negative_prompt_3 is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )

    # Sequence embeddings are only usable together with their pooled counterpart.
    if prompt_embeds is not None and pooled_prompt_embeds is None:
        raise ValueError(
            "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
        )

    if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
        raise ValueError(
            "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
        )

    if max_sequence_length is not None and max_sequence_length > 512:
        raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
549
+
550
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    generator,
    latents=None,
):
    """Return the initial latents for the denoising loop.

    When `latents` is supplied it is only cast to `dtype`. Otherwise new latents
    are drawn with `randn_tensor` for the shape
    `(batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)`.

    Raises:
        ValueError: If `generator` is a list whose length differs from `batch_size`
            (only checked when new latents have to be sampled).
    """
    if latents is None:
        latent_height = int(height) // self.vae_scale_factor
        latent_width = int(width) // self.vae_scale_factor
        latent_shape = (batch_size, num_channels_latents, latent_height, latent_width)

        # A list of generators must supply exactly one generator per sample.
        if isinstance(generator, list):
            generator_count = len(generator)
            if generator_count != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {generator_count}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )

        return randn_tensor(latent_shape, generator=generator, dtype=dtype)

    return latents.cast(dtype=dtype)
578
+
579
@property
def guidance_scale(self):
    """Guidance scale stored in `self._guidance_scale` by `__call__`."""
    return self._guidance_scale

@property
def clip_skip(self):
    """`clip_skip` value stored in `self._clip_skip` by `__call__`."""
    return self._clip_skip

# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
@property
def do_classifier_free_guidance(self):
    """`True` when the stored guidance scale is greater than 1."""
    return self._guidance_scale > 1

@property
def joint_attention_kwargs(self):
    """Joint-attention kwargs stored in `self._joint_attention_kwargs` by `__call__`."""
    return self._joint_attention_kwargs

@property
def num_timesteps(self):
    """Value of `self._num_timesteps`, which `__call__` sets to `len(timesteps)`."""
    return self._num_timesteps

@property
def interrupt(self):
    """Value of `self._interrupt`, which `__call__` initialises to `False`."""
    return self._interrupt
605
+
606
+ @paddle.no_grad()
607
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
608
+ def __call__(
609
+ self,
610
+ prompt: Union[str, List[str]] = None,
611
+ prompt_2: Optional[Union[str, List[str]]] = None,
612
+ prompt_3: Optional[Union[str, List[str]]] = None,
613
+ height: Optional[int] = None,
614
+ width: Optional[int] = None,
615
+ num_inference_steps: int = 28,
616
+ timesteps: List[int] = None,
617
+ guidance_scale: float = 7.0,
618
+ negative_prompt: Optional[Union[str, List[str]]] = None,
619
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
620
+ negative_prompt_3: Optional[Union[str, List[str]]] = None,
621
+ num_images_per_prompt: Optional[int] = 1,
622
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
623
+ latents: Optional[paddle.Tensor] = None,
624
+ prompt_embeds: Optional[paddle.Tensor] = None,
625
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
626
+ pooled_prompt_embeds: Optional[paddle.Tensor] = None,
627
+ negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None,
628
+ output_type: Optional[str] = "pil",
629
+ return_dict: bool = True,
630
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
631
+ clip_skip: Optional[int] = None,
632
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
633
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
634
+ max_sequence_length: int = 256,
635
+ ):
636
+ r"""
637
+ Function invoked when calling the pipeline for generation.
638
+
639
+ Args:
640
+ prompt (`str` or `List[str]`, *optional*):
641
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
642
+ instead.
643
+ prompt_2 (`str` or `List[str]`, *optional*):
644
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
645
+ will be used instead
646
+ prompt_3 (`str` or `List[str]`, *optional*):
647
+ The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
648
+ will be used instead
649
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
650
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
651
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
652
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
653
+ num_inference_steps (`int`, *optional*, defaults to 50):
654
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
655
+ expense of slower inference.
656
+ timesteps (`List[int]`, *optional*):
657
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
658
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
659
+ passed will be used. Must be in descending order.
660
+ guidance_scale (`float`, *optional*, defaults to 5.0):
661
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
662
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
663
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
664
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
665
+ usually at the expense of lower image quality.
666
+ negative_prompt (`str` or `List[str]`, *optional*):
667
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
668
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
669
+ less than `1`).
670
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
671
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
672
+ `text_encoder_2`. If not defined, `negative_prompt` is used instead
673
+ negative_prompt_3 (`str` or `List[str]`, *optional*):
674
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
675
+ `text_encoder_3`. If not defined, `negative_prompt` is used instead
676
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
677
+ The number of images to generate per prompt.
678
+ generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
679
+ One or a list of [[paddle] generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
680
+ to make generation deterministic.
681
+ latents (`paddle.Tensor`, *optional*):
682
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
683
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
684
+ tensor will ge generated by sampling using the supplied random `generator`.
685
+ prompt_embeds (`paddle.Tensor`, *optional*):
686
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
687
+ provided, text embeddings will be generated from `prompt` input argument.
688
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
689
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
690
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
691
+ argument.
692
+ pooled_prompt_embeds (`paddle.Tensor`, *optional*):
693
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
694
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
695
+ negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
696
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
697
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
698
+ input argument.
699
+ output_type (`str`, *optional*, defaults to `"pil"`):
700
+ The output format of the generate image. Choose between
701
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
702
+ return_dict (`bool`, *optional*, defaults to `True`):
703
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
704
+ of a plain tuple.
705
+ joint_attention_kwargs (`dict`, *optional*):
706
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
707
+ `self.processor` in
708
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
709
+ callback_on_step_end (`Callable`, *optional*):
710
+ A function that calls at the end of each denoising steps during the inference. The function is called
711
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
712
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
713
+ `callback_on_step_end_tensor_inputs`.
714
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
715
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
716
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
717
+ `._callback_tensor_inputs` attribute of your pipeline class.
718
+ max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
719
+ Examples:
720
+
721
+ Returns:
722
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
723
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
724
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
725
+ """
726
+
727
+ height = height or self.default_sample_size * self.vae_scale_factor
728
+ width = width or self.default_sample_size * self.vae_scale_factor
729
+
730
+ # 1. Check inputs. Raise error if not correct
731
+ self.check_inputs(
732
+ prompt,
733
+ prompt_2,
734
+ prompt_3,
735
+ height,
736
+ width,
737
+ negative_prompt=negative_prompt,
738
+ negative_prompt_2=negative_prompt_2,
739
+ negative_prompt_3=negative_prompt_3,
740
+ prompt_embeds=prompt_embeds,
741
+ negative_prompt_embeds=negative_prompt_embeds,
742
+ pooled_prompt_embeds=pooled_prompt_embeds,
743
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
744
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
745
+ max_sequence_length=max_sequence_length,
746
+ )
747
+
748
+ self._guidance_scale = guidance_scale
749
+ self._clip_skip = clip_skip
750
+ self._joint_attention_kwargs = joint_attention_kwargs
751
+ self._interrupt = False
752
+
753
+ # 2. Define call parameters
754
+ if prompt is not None and isinstance(prompt, str):
755
+ batch_size = 1
756
+ elif prompt is not None and isinstance(prompt, list):
757
+ batch_size = len(prompt)
758
+ else:
759
+ batch_size = prompt_embeds.shape[0]
760
+
761
+ (
762
+ prompt_embeds,
763
+ negative_prompt_embeds,
764
+ pooled_prompt_embeds,
765
+ negative_pooled_prompt_embeds,
766
+ ) = self.encode_prompt(
767
+ prompt=prompt,
768
+ prompt_2=prompt_2,
769
+ prompt_3=prompt_3,
770
+ negative_prompt=negative_prompt,
771
+ negative_prompt_2=negative_prompt_2,
772
+ negative_prompt_3=negative_prompt_3,
773
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
774
+ prompt_embeds=prompt_embeds,
775
+ negative_prompt_embeds=negative_prompt_embeds,
776
+ pooled_prompt_embeds=pooled_prompt_embeds,
777
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
778
+ clip_skip=self.clip_skip,
779
+ num_images_per_prompt=num_images_per_prompt,
780
+ max_sequence_length=max_sequence_length,
781
+ )
782
+
783
+ if self.do_classifier_free_guidance:
784
+ prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds], axis=0)
785
+ pooled_prompt_embeds = paddle.concat([negative_pooled_prompt_embeds, pooled_prompt_embeds], axis=0)
786
+
787
+ # 4. Prepare timesteps
788
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, timesteps)
789
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
790
+ self._num_timesteps = len(timesteps)
791
+
792
+ # 5. Prepare latent variables
793
+ num_channels_latents = self.transformer.config.in_channels
794
+ latents = self.prepare_latents(
795
+ batch_size * num_images_per_prompt,
796
+ num_channels_latents,
797
+ height,
798
+ width,
799
+ prompt_embeds.dtype,
800
+ generator,
801
+ latents,
802
+ )
803
+
804
+ # 6. Denoising loop
805
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
806
+ for i, t in enumerate(timesteps):
807
+ if self.interrupt:
808
+ continue
809
+
810
+ # expand the latents if we are doing classifier free guidance
811
+ latent_model_input = paddle.concat([latents] * 2) if self.do_classifier_free_guidance else latents
812
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
813
+ timestep = t.expand(latent_model_input.shape[0])
814
+
815
+ enabled_cfg_dp = False
816
+ if self.transformer.inference_dp_size > 1:
817
+ enabled_cfg_dp = True
818
+ assert self.do_classifier_free_guidance, "do_classifier_free_guidance must be true"
819
+
820
+ if enabled_cfg_dp:
821
+ dp_id = self.transformer.dp_id
822
+ latent_input = paddle.split(latent_model_input, 2, axis=0)[dp_id]
823
+ timestep_input = paddle.split(timestep, 2, axis=0)[dp_id]
824
+ prompt_embeds_input = paddle.split(prompt_embeds, 2, axis=0)[dp_id]
825
+ pooled_prompt_embeds_input = paddle.split(pooled_prompt_embeds, 2, axis=0)[dp_id]
826
+
827
+ else:
828
+ latent_input = latent_model_input
829
+ timestep_input = timestep
830
+ prompt_embeds_input = prompt_embeds
831
+ pooled_prompt_embeds_input = pooled_prompt_embeds
832
+
833
+ model_output = self.transformer(
834
+ hidden_states=latent_input,
835
+ timestep=timestep_input,
836
+ encoder_hidden_states=prompt_embeds_input,
837
+ pooled_projections=pooled_prompt_embeds_input,
838
+ joint_attention_kwargs=self.joint_attention_kwargs,
839
+ return_dict=False,
840
+ )
841
+ if is_inference_mode(self.transformer):
842
+ # NOTE:(changwenbin,zhoukangkang)
843
+ # This is for paddle inference mode
844
+ output = model_output
845
+ else:
846
+ output = model_output[0]
847
+
848
+ if enabled_cfg_dp:
849
+ tmp_shape = output.shape
850
+ tmp_shape[0] *= 2
851
+ noise_pred = paddle.zeros(tmp_shape, dtype=output.dtype)
852
+ dist.all_gather(
853
+ noise_pred, output, group=fleet.get_hybrid_communicate_group().get_data_parallel_group()
854
+ )
855
+ else:
856
+ noise_pred = output
857
+
858
+ # perform guidance
859
+ if self.do_classifier_free_guidance:
860
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
861
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
862
+
863
+ # compute the previous noisy sample x_t -> x_t-1
864
+ latents_dtype = latents.dtype
865
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
866
+
867
+ if callback_on_step_end is not None:
868
+ callback_kwargs = {}
869
+ for k in callback_on_step_end_tensor_inputs:
870
+ callback_kwargs[k] = locals()[k]
871
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
872
+
873
+ latents = callback_outputs.pop("latents", latents)
874
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
875
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
876
+ negative_pooled_prompt_embeds = callback_outputs.pop(
877
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
878
+ )
879
+
880
+ # advance the progress bar on the last step, or after warmup on every `scheduler.order`-th step
881
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
882
+ progress_bar.update()
883
+
884
+ if output_type == "latent":
885
+ image = latents
886
+
887
+ else:
888
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
889
+
890
+ image = self.vae.decode(latents, return_dict=False)[0]
891
+ image = self.image_processor.postprocess(image, output_type=output_type)
892
+
893
+ # Offload all models
894
+ self.maybe_free_model_hooks()
895
+
896
+ if not return_dict:
897
+ return (image,)
898
+
899
+ return StableDiffusion3PipelineOutput(images=image)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/__init__.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import TYPE_CHECKING
16
+
17
+ from ...utils import (
18
+ PPDIFFUSERS_SLOW_IMPORT,
19
+ OptionalDependencyNotAvailable,
20
+ _LazyModule,
21
+ get_objects_from_module,
22
+ is_fastdeploy_available,
23
+ is_paddle_available,
24
+ is_paddlenlp_available,
25
+ )
26
+
27
# Registries consumed by the lazy-module setup at the bottom of this file.
_dummy_objects = {}
_additional_imports = {}
_import_structure = {"pipeline_output": ["StableDiffusionXLPipelineOutput"]}

# Pipelines that require both paddle and paddlenlp. When either is missing,
# the dummy placeholder objects are collected instead.
try:
    if not (is_paddlenlp_available() and is_paddle_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_paddle_and_paddlenlp_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_paddle_and_paddlenlp_objects))
else:
    _import_structure.update(
        {
            "pipeline_stable_diffusion_xl": ["StableDiffusionXLPipeline"],
            "pipeline_stable_diffusion_xl_img2img": ["StableDiffusionXLImg2ImgPipeline"],
            "pipeline_stable_diffusion_xl_inpaint": ["StableDiffusionXLInpaintPipeline"],
            "pipeline_stable_diffusion_xl_instruct_pix2pix": ["StableDiffusionXLInstructPix2PixPipeline"],
            # paddleinfer
            "pipeline_paddleinfer_stable_diffusion_xl": ["PaddleInferStableDiffusionXLPipeline"],
            "pipeline_paddleinfer_stable_diffusion_xl_img2img": ["PaddleInferStableDiffusionXLImg2ImgPipeline"],
            "pipeline_paddleinfer_stable_diffusion_xl_inpaint": ["PaddleInferStableDiffusionXLInpaintPipeline"],
            # NOTE(review): this entry has no matching import in the
            # TYPE_CHECKING branch below -- confirm the module exists.
            "pipeline_paddleinfer_stable_diffusion_xl_pix2pix": ["PaddleInferStableDiffusionXLPix2PixPipeline"],
            "pipeline_paddleinfer_stable_diffusion_xl_mega": ["PaddleInferStableDiffusionXLMegaPipeline"],
            "pipeline_paddleinfer_stable_diffusion_xl_instruct_pix2pix": [
                "PaddleInferStableDiffusionXLInstructPix2PixPipeline"
            ],
        }
    )


# Pipelines that additionally require fastdeploy.
try:
    if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_fastdeploy_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_fastdeploy_objects))
else:
    _import_structure.update(
        {
            "pipeline_fastdeploy_stable_diffusion_xl": ["FastDeployStableDiffusionXLPipeline"],
            "pipeline_fastdeploy_stable_diffusion_xl_img2img": ["FastDeployStableDiffusionXLImg2ImgPipeline"],
            "pipeline_fastdeploy_stable_diffusion_xl_inpaint": ["FastDeployStableDiffusionXLInpaintPipeline"],
            "pipeline_fastdeploy_stable_diffusion_xl_mega": ["FastDeployStableDiffusionXLMegaPipeline"],
            "pipeline_fastdeploy_stable_diffusion_xl_instruct_pix2pix": [
                "FastDeployStableDiffusionXLInstructPix2PixPipeline"
            ],
        }
    )

if TYPE_CHECKING or PPDIFFUSERS_SLOW_IMPORT:
    # Eager path: import every public name directly, using the same
    # availability checks as the registry above.
    try:
        if not (is_paddlenlp_available() and is_paddle_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_paddle_and_paddlenlp_objects import *  # noqa F403
    else:
        from .pipeline_output import StableDiffusionXLPipelineOutput
        from .pipeline_paddleinfer_stable_diffusion_xl import (
            PaddleInferStableDiffusionXLPipeline,
        )
        from .pipeline_paddleinfer_stable_diffusion_xl_img2img import (
            PaddleInferStableDiffusionXLImg2ImgPipeline,
        )
        from .pipeline_paddleinfer_stable_diffusion_xl_inpaint import (
            PaddleInferStableDiffusionXLInpaintPipeline,
        )
        from .pipeline_paddleinfer_stable_diffusion_xl_instruct_pix2pix import (
            PaddleInferStableDiffusionXLInstructPix2PixPipeline,
        )
        from .pipeline_paddleinfer_stable_diffusion_xl_mega import (
            PaddleInferStableDiffusionXLMegaPipeline,
        )
        from .pipeline_stable_diffusion_xl import StableDiffusionXLPipeline
        from .pipeline_stable_diffusion_xl_img2img import (
            StableDiffusionXLImg2ImgPipeline,
        )
        from .pipeline_stable_diffusion_xl_inpaint import (
            StableDiffusionXLInpaintPipeline,
        )
        from .pipeline_stable_diffusion_xl_instruct_pix2pix import (
            StableDiffusionXLInstructPix2PixPipeline,
        )

    try:
        if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_fastdeploy_objects import *  # noqa F403
    else:
        from .pipeline_fastdeploy_stable_diffusion_xl import (
            FastDeployStableDiffusionXLPipeline,
        )
        from .pipeline_fastdeploy_stable_diffusion_xl_img2img import (
            FastDeployStableDiffusionXLImg2ImgPipeline,
        )
        from .pipeline_fastdeploy_stable_diffusion_xl_inpaint import (
            FastDeployStableDiffusionXLInpaintPipeline,
        )
        from .pipeline_fastdeploy_stable_diffusion_xl_instruct_pix2pix import (
            FastDeployStableDiffusionXLInstructPix2PixPipeline,
        )
        from .pipeline_fastdeploy_stable_diffusion_xl_mega import (
            FastDeployStableDiffusionXLMegaPipeline,
        )

else:
    import sys

    # Lazy path: replace this module with a `_LazyModule` built from the registry.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    # Attach the dummy objects first, then the additional imports
    # (same order as two separate loops).
    for registry in (_dummy_objects, _additional_imports):
        for name, value in registry.items():
            setattr(sys.modules[__name__], name, value)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_fastdeploy_stable_diffusion_xl_img2img.py ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
16
+
17
+ import numpy as np
18
+ import paddle
19
+ import PIL.Image
20
+
21
+ from ppdiffusers.transformers import CLIPImageProcessor, CLIPTokenizer
22
+
23
+ from ...image_processor import PipelineImageInput
24
+ from ...loaders import IPAdapterMixin
25
+ from ...schedulers import KarrasDiffusionSchedulers
26
+ from ...utils import logging, replace_example_docstring
27
+ from ..fastdeploy_utils import FastDeployRuntimeModel
28
+ from ..fastdeployxl_utils import FastDeployDiffusionXLPipelineMixin
29
+ from ..pipeline_utils import DiffusionPipeline
30
+ from . import StableDiffusionXLPipelineOutput
31
+
32
+ logger = logging.get_logger(__name__)
33
+ EXAMPLE_DOC_STRING = """
34
+ Examples:
35
+ ```py
36
+ >>> import paddle
37
+ >>> from ppdiffusers import StableDiffusionXLImg2ImgPipeline
38
+ >>> from ppdiffusers.utils import load_image
39
+
40
+ >>> pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
41
+ ... "stabilityai/stable-diffusion-xl-refiner-1.0", paddle_dtype=paddle.float16
42
+ ... )
43
+ >>> url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
44
+
45
+ >>> init_image = load_image(url).convert("RGB")
46
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
47
+ >>> image = pipe(prompt, image=init_image).images[0]
48
+ ```
49
+ """
50
+
51
+
52
+ class FastDeployStableDiffusionXLImg2ImgPipeline(
53
+ DiffusionPipeline, FastDeployDiffusionXLPipelineMixin, IPAdapterMixin
54
+ ):
55
+ """
56
+ Pipeline for image-to-image generation using Stable Diffusion XL.
57
+
58
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
59
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
60
+
61
+ In addition the pipeline inherits the following loading methods:
62
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
63
+ - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
64
+ - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
65
+
66
+ as well as the following saving methods:
67
+ - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
68
+
69
+ Args:
70
+ vae ([`AutoencoderKL`]):
71
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
72
+ text_encoder ([`CLIPTextModel`]):
73
+ Frozen text-encoder. Stable Diffusion XL uses the text portion of
74
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
75
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
76
+ text_encoder_2 ([` CLIPTextModelWithProjection`]):
77
+ Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
78
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
79
+ specifically the
80
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
81
+ variant.
82
+ tokenizer (`CLIPTokenizer`):
83
+ Tokenizer of class
84
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
85
+ tokenizer_2 (`CLIPTokenizer`):
86
+ Second Tokenizer of class
87
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
88
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
89
+ scheduler ([`SchedulerMixin`]):
90
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
91
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
92
+ """
93
+
94
+ _optional_components = ["image_encoder", "tokenizer", "text_encoder"]
95
+
96
def __init__(
    self,
    vae_encoder: FastDeployRuntimeModel,
    vae_decoder: FastDeployRuntimeModel,
    text_encoder: FastDeployRuntimeModel,
    text_encoder_2: FastDeployRuntimeModel,
    tokenizer: CLIPTokenizer,
    tokenizer_2: CLIPTokenizer,
    unet: FastDeployRuntimeModel,
    image_encoder: FastDeployRuntimeModel,
    feature_extractor: CLIPImageProcessor,
    scheduler: KarrasDiffusionSchedulers,
    force_zeros_for_empty_prompt: bool = True,
    requires_aesthetics_score: bool = False,
):
    """
    Register the pipeline components and configuration flags.

    All runtime models, both tokenizers, the feature extractor and the scheduler are passed to
    `register_modules`. `force_zeros_for_empty_prompt` and `requires_aesthetics_score` are stored through
    `register_to_config`. Finally `post_init` is called with a VAE scaling factor of 0.13025.
    """
    super().__init__()

    # Collect the components first so they are registered in a single call,
    # in the same keyword order as the constructor signature.
    components = dict(
        vae_encoder=vae_encoder,
        vae_decoder=vae_decoder,
        text_encoder=text_encoder,
        text_encoder_2=text_encoder_2,
        tokenizer=tokenizer,
        tokenizer_2=tokenizer_2,
        unet=unet,
        image_encoder=image_encoder,
        feature_extractor=feature_extractor,
        scheduler=scheduler,
    )
    self.register_modules(**components)

    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
    self.post_init(vae_scaling_factor=0.13025)
127
+
128
def check_inputs(
    self,
    prompt,
    prompt_2,
    strength,
    num_inference_steps,
    callback_steps,
    negative_prompt=None,
    negative_prompt_2=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
):
    """
    Validate the user-facing arguments of the pipeline call.

    Returns `None` when every check passes and raises `ValueError` on the first violated rule:

    - `strength` must not be below 0 or above 1.
    - `num_inference_steps` and `callback_steps` must be positive integers.
    - `prompt` / `prompt_2` and `prompt_embeds` are mutually exclusive, and one of `prompt` or
      `prompt_embeds` must be given. Text prompts must be `str` or `list`.
    - `negative_prompt` / `negative_prompt_2` and `negative_prompt_embeds` are mutually exclusive.
    - When both embedding tensors are given, their `.shape` must match.
    """
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

    if num_inference_steps is None:
        raise ValueError("`num_inference_steps` cannot be None.")
    if not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
        raise ValueError(
            f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type {type(num_inference_steps)}."
        )

    # `None` is not an `int`, so this single test also rejects a missing value.
    if not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}."
        )

    has_prompt_embeds = prompt_embeds is not None
    has_negative_embeds = negative_prompt_embeds is not None

    # Positive prompt: text and pre-computed embeddings are mutually exclusive.
    if prompt is not None and has_prompt_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two."
        )
    if prompt_2 is not None and has_prompt_embeds:
        raise ValueError(
            f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two."
        )
    if prompt is None and not has_prompt_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
    if prompt_2 is not None and not isinstance(prompt_2, (str, list)):
        raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

    # Negative prompt: same exclusivity rule.
    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )
    if negative_prompt_2 is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if has_prompt_embeds and has_negative_embeds and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds` {negative_prompt_embeds.shape}."
        )
185
+
186
+ @paddle.no_grad()
187
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
188
+ def __call__(
189
+ self,
190
+ prompt: Union[str, List[str]] = None,
191
+ prompt_2: Optional[Union[str, List[str]]] = None,
192
+ image: Union[
193
+ paddle.Tensor, PIL.Image.Image, np.ndarray, List[paddle.Tensor], List[PIL.Image.Image], List[np.ndarray]
194
+ ] = None,
195
+ strength: float = 0.3,
196
+ height: Optional[int] = None,
197
+ width: Optional[int] = None,
198
+ num_inference_steps: int = 50,
199
+ denoising_start: Optional[float] = None,
200
+ denoising_end: Optional[float] = None,
201
+ guidance_scale: float = 5.0,
202
+ negative_prompt: Optional[Union[str, List[str]]] = None,
203
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
204
+ num_images_per_prompt: Optional[int] = 1,
205
+ timesteps: List[int] = None,
206
+ eta: float = 0.0,
207
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
208
+ latents: Optional[paddle.Tensor] = None,
209
+ prompt_embeds: Optional[paddle.Tensor] = None,
210
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
211
+ pooled_prompt_embeds: Optional[paddle.Tensor] = None,
212
+ negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None,
213
+ ip_adapter_image: Optional[PipelineImageInput] = None,
214
+ output_type: Optional[str] = "pil",
215
+ return_dict: bool = True,
216
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
217
+ callback_steps: int = 1,
218
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
219
+ guidance_rescale: float = 0.0,
220
+ original_size: Tuple[int, int] = None,
221
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
222
+ target_size: Tuple[int, int] = None,
223
+ aesthetic_score: float = 6.0,
224
+ negative_aesthetic_score: float = 2.5,
225
+ infer_op_dict: Dict[str, str] = None,
226
+ ):
227
+ """
228
+ Function invoked when calling the pipeline for generation.
229
+
230
+ Args:
231
+ prompt (`str` or `List[str]`, *optional*):
232
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
233
+ instead.
234
+ prompt_2 (`str` or `List[str]`, *optional*):
235
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
236
+ used in both text-encoders
237
+ image (`paddle.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[paddle.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
238
+ The image(s) to modify with the pipeline.
239
+ strength (`float`, *optional*, defaults to 0.3):
240
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
241
+ will be used as a starting point, adding more noise to it the larger the `strength`. The number of
242
+ denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
243
+ be maximum and the denoising process will run for the full number of iterations specified in
244
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. Note that in the case of
245
+ `denoising_start` being declared as an integer, the value of `strength` will be ignored.
246
+ height (`int`, *optional*, defaults to None):
247
+ The height in pixels of the generated image.
248
+ width (`int`, *optional*, defaults to None):
249
+ The width in pixels of the generated image.
250
+ num_inference_steps (`int`, *optional*, defaults to 50):
251
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
252
+ expense of slower inference.
253
+ denoising_start (`float`, *optional*):
254
+ When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
255
+ bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
256
+ it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
257
+ strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
258
+ is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
259
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
260
+ denoising_end (`float`, *optional*):
261
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
262
+ completed before it is intentionally prematurely terminated. As a result, the returned sample will
263
+ still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
264
+ denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
265
+ final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
266
+ forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
267
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
268
+ guidance_scale (`float`, *optional*, defaults to 5.0):
269
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
270
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
271
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
272
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
273
+ usually at the expense of lower image quality.
274
+ negative_prompt (`str` or `List[str]`, *optional*):
275
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
276
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
277
+ less than `1`).
278
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
279
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
280
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
281
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
282
+ The number of images to generate per prompt.
283
+ eta (`float`, *optional*, defaults to 0.0):
284
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
285
+ [`schedulers.DDIMScheduler`], will be ignored for others.
286
+ generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
287
+ One or a list of paddle generator(s)
288
+ to make generation deterministic.
289
+ latents (`paddle.Tensor`, *optional*):
290
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
291
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
292
+ tensor will be generated by sampling using the supplied random `generator`.
293
+ prompt_embeds (`paddle.Tensor`, *optional*):
294
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
295
+ provided, text embeddings will be generated from `prompt` input argument.
296
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
297
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
298
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
299
+ argument.
300
+ pooled_prompt_embeds (`paddle.Tensor`, *optional*):
301
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
302
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
303
+ negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
304
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
305
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
306
+ input argument.
307
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
308
+ output_type (`str`, *optional*, defaults to `"pil"`):
309
+ The output format of the generate image. Choose between
310
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
311
+ return_dict (`bool`, *optional*, defaults to `True`):
312
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
313
+ plain tuple.
314
+ callback (`Callable`, *optional*):
315
+ A function that will be called every `callback_steps` steps during inference. The function will be
316
+ called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
317
+ callback_steps (`int`, *optional*, defaults to 1):
318
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
319
+ called at every step.
320
+ cross_attention_kwargs (`dict`, *optional*):
321
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
322
+ `self.processor` in ppdiffusers.cross_attention.
323
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
324
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
325
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
326
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
327
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
328
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
329
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
330
+ `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
331
+ explained in section 2.2 of
332
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
333
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
334
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
335
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
336
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
337
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
338
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
339
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
340
+ not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
341
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
342
+ aesthetic_score (`float`, *optional*, defaults to 6.0):
343
+ Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
344
+ Part of SDXL's micro-conditioning as explained in section 2.2 of
345
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
346
+ negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
347
+ Part of SDXL's micro-conditioning as explained in section 2.2 of
348
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
349
+ simulate an aesthetic score of the generated image by influencing the negative text condition.
350
+
351
+ Examples:
352
+
353
+ Returns:
354
+ [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
355
+ [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
356
+ `tuple. When returning a tuple, the first element is a list with the generated images.
357
+ """
358
+ # 1. Check inputs. Raise error if not correct
359
+ infer_op_dict = self.prepare_infer_op_dict(infer_op_dict)
360
+ self.check_inputs(
361
+ prompt,
362
+ prompt_2,
363
+ strength,
364
+ num_inference_steps,
365
+ callback_steps,
366
+ negative_prompt,
367
+ negative_prompt_2,
368
+ prompt_embeds,
369
+ negative_prompt_embeds,
370
+ )
371
+
372
+ # 2. Define call parameters
373
+ if prompt is not None and isinstance(prompt, str):
374
+ batch_size = 1
375
+ elif prompt is not None and isinstance(prompt, list):
376
+ batch_size = len(prompt)
377
+ else:
378
+ batch_size = prompt_embeds.shape[0]
379
+
380
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
381
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
382
+ # corresponds to doing no classifier free guidance.
383
+ do_classifier_free_guidance = guidance_scale > 1.0
384
+
385
+ # 3. Encode input prompt
386
+ text_encoder_lora_scale = (
387
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
388
+ )
389
+ (
390
+ prompt_embeds,
391
+ negative_prompt_embeds,
392
+ pooled_prompt_embeds,
393
+ negative_pooled_prompt_embeds,
394
+ ) = self.encode_prompt(
395
+ prompt=prompt,
396
+ prompt_2=prompt_2,
397
+ num_images_per_prompt=num_images_per_prompt,
398
+ do_classifier_free_guidance=do_classifier_free_guidance,
399
+ negative_prompt=negative_prompt,
400
+ negative_prompt_2=negative_prompt_2,
401
+ prompt_embeds=prompt_embeds,
402
+ negative_prompt_embeds=negative_prompt_embeds,
403
+ pooled_prompt_embeds=pooled_prompt_embeds,
404
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
405
+ lora_scale=text_encoder_lora_scale,
406
+ infer_op=infer_op_dict.get("text_encoder", None),
407
+ )
408
+
409
+ if ip_adapter_image is not None:
410
+ image_embeds, negative_image_embeds = self.encode_image(
411
+ ip_adapter_image, num_images_per_prompt, infer_op=infer_op_dict.get("image_encoder", None)
412
+ )
413
+ if do_classifier_free_guidance:
414
+ image_embeds = paddle.concat([negative_image_embeds, image_embeds])
415
+
416
+ # 4. Preprocess image
417
+ image = self.image_processor.preprocess(image, height=height, width=width)
418
+ height, width = image.shape[-2:]
419
+
420
+ # 5. Prepare timesteps
421
+
422
def denoising_value_valid(dnv):
    """
    Tell whether `dnv` is usable as a fractional denoising boundary.

    Args:
        dnv: Candidate value, e.g. `denoising_start` or `denoising_end`.

    Returns:
        `True` only when `dnv` is a `float` strictly between 0 and 1, otherwise `False`.
    """
    # Inspect the argument itself rather than a variable from the enclosing scope, so the same
    # predicate gives a correct answer for both `denoising_start` and `denoising_end`.
    # The isinstance check runs first, so `None` never reaches the numeric comparison.
    return isinstance(dnv, float) and 0 < dnv < 1
424
+
425
+ self.scheduler.set_timesteps(num_inference_steps)
426
+ timesteps, num_inference_steps = self.get_timesteps(
427
+ num_inference_steps, strength, denoising_start=denoising_start if denoising_value_valid else None
428
+ )
429
+ latent_timestep = timesteps[:1].tile(repeat_times=[batch_size * num_images_per_prompt])
430
+ # add_noise = True if denoising_start is None else False
431
+
432
+ # 6. Prepare latent variables
433
+ # latents = self.prepare_latents(
434
+ # image=image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, generator, add_noise
435
+ # )
436
+ is_strength_max = strength == 1.0
437
+ latents = self.prepare_latents(
438
+ batch_size * num_images_per_prompt,
439
+ height,
440
+ width,
441
+ generator,
442
+ latents,
443
+ image=image,
444
+ timestep=latent_timestep,
445
+ is_strength_max=is_strength_max,
446
+ infer_op=infer_op_dict.get("vae_encoder", None),
447
+ )
448
+ # 7. Prepare extra step kwargs.
449
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
450
+ height, width = latents.shape[-2:]
451
+ height = height * self.vae_scale_factor
452
+ width = width * self.vae_scale_factor
453
+ original_size = original_size or (height, width)
454
+ target_size = target_size or (height, width)
455
+
456
+ # 8. Prepare added time ids & embeddings
457
+ add_text_embeds = pooled_prompt_embeds
458
+ add_time_ids, add_neg_time_ids = self._get_add_time_ids_2(
459
+ original_size,
460
+ crops_coords_top_left,
461
+ target_size,
462
+ aesthetic_score,
463
+ negative_aesthetic_score,
464
+ dtype=prompt_embeds.dtype,
465
+ )
466
+ add_time_ids = add_time_ids.tile(repeat_times=[batch_size * num_images_per_prompt, 1])
467
+ if do_classifier_free_guidance:
468
+ prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds], axis=0)
469
+ add_text_embeds = paddle.concat(x=[negative_pooled_prompt_embeds, add_text_embeds], axis=0)
470
+ add_neg_time_ids = add_neg_time_ids.tile(repeat_times=[batch_size * num_images_per_prompt, 1])
471
+ add_time_ids = paddle.concat(x=[add_neg_time_ids, add_time_ids], axis=0)
472
+
473
+ # 9. Denoising loop
474
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
475
+
476
+ # 9.1 Apply denoising_end
477
+ if (
478
+ denoising_end is not None
479
+ and denoising_start is not None
480
+ and denoising_value_valid(denoising_end)
481
+ and denoising_value_valid(denoising_start)
482
+ and denoising_start >= denoising_end
483
+ ):
484
+ raise ValueError(
485
+ f"`denoising_start`: {denoising_start} cannot be larger than or equal to `denoising_end`: "
486
+ + f" {denoising_end} when using type float."
487
+ )
488
+ elif denoising_end is not None and denoising_value_valid(denoising_end):
489
+ discrete_timestep_cutoff = int(
490
+ round(
491
+ self.scheduler.config.num_train_timesteps
492
+ - denoising_end * self.scheduler.config.num_train_timesteps
493
+ )
494
+ )
495
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
496
+ timesteps = timesteps[:num_inference_steps]
497
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
498
+ for i, t in enumerate(timesteps):
499
+ # expand the latents if we are doing classifier free guidance
500
+ latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents
501
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
502
+
503
+ unet_inputs = dict(
504
+ sample=latent_model_input,
505
+ timestep=t,
506
+ encoder_hidden_states=prompt_embeds,
507
+ text_embeds=add_text_embeds,
508
+ time_ids=add_time_ids,
509
+ infer_op=infer_op_dict.get("unet", None),
510
+ output_shape=latent_model_input.shape,
511
+ )
512
+ # Add image embeds for IP-Adapter
513
+ # added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
514
+ if ip_adapter_image:
515
+ unet_inputs["image_embeds"] = image_embeds
516
+
517
+ # predict the noise residual
518
+ noise_pred = self.unet(**unet_inputs)[0]
519
+
520
+ # perform guidance
521
+ if do_classifier_free_guidance:
522
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
523
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
524
+
525
+ if guidance_rescale > 0.0:
526
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
527
+ noise_pred = self.rescale_noise_cfg(
528
+ noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
529
+ )
530
+
531
+ # compute the previous noisy sample x_t -> x_t-1
532
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
533
+
534
+ # call the callback, if provided
535
+ if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
536
+ progress_bar.update()
537
+ if callback is not None and i % callback_steps == 0:
538
+ callback(i, t, latents)
539
+
540
+ if not output_type == "latent":
541
+ image = self._decode_vae_latents(
542
+ latents / self.vae_scaling_factor, infer_op=infer_op_dict.get("vae_decoder", None)
543
+ )
544
+ else:
545
+ image = latents
546
+ return StableDiffusionXLPipelineOutput(images=image)
547
+
548
+ image = self.image_processor.postprocess(image, output_type=output_type)
549
+ if not return_dict:
550
+ return (image,)
551
+
552
+ return StableDiffusionXLPipelineOutput(images=image)
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_output.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import List, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+
21
+ from ...utils import BaseOutput
22
+
23
+
24
@dataclass
class StableDiffusionXLPipelineOutput(BaseOutput):
    """
    Container for the images produced by a Stable Diffusion XL pipeline call.

    Args:
        images (`List[PIL.Image.Image]` or `np.ndarray`)
            The denoised results of the diffusion pipeline: either a list holding `batch_size` PIL images, or a
            single numpy array of shape `(batch_size, height, width, num_channels)`.
    """

    images: Union[List[PIL.Image.Image], np.ndarray]
VLMEvalKit_old/PaddleMIX/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_paddleinfer_stable_diffusion_xl_img2img.py ADDED
@@ -0,0 +1,548 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
16
+
17
+ import numpy as np
18
+ import paddle
19
+ import PIL.Image
20
+
21
+ from ppdiffusers.transformers import CLIPImageProcessor, CLIPTokenizer
22
+
23
+ from ...image_processor import PipelineImageInput
24
+ from ...loaders import IPAdapterMixin
25
+ from ...models.paddleinfer_runtime import PaddleInferRuntimeModel
26
+ from ...schedulers import KarrasDiffusionSchedulers
27
+ from ...utils import logging, replace_example_docstring
28
+ from ..paddleinfer_xl_utils import PaddleInferDiffusionXLPipelineMixin
29
+ from ..pipeline_utils import DiffusionPipeline
30
+ from . import StableDiffusionXLPipelineOutput
31
+
32
+ logger = logging.get_logger(__name__)
33
+ EXAMPLE_DOC_STRING = """
34
+ Examples:
35
+ ```py
36
+ >>> import paddle
37
+ >>> from ppdiffusers import StableDiffusionXLImg2ImgPipeline
38
+ >>> from ppdiffusers.utils import load_image
39
+
40
+ >>> pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
41
+ ... "stabilityai/stable-diffusion-xl-refiner-1.0", paddle_dtype=paddle.float16
42
+ ... )
43
+ >>> url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
44
+
45
+ >>> init_image = load_image(url).convert("RGB")
46
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
47
+ >>> image = pipe(prompt, image=init_image).images[0]
48
+ ```
49
+ """
50
+
51
+
52
+ class PaddleInferStableDiffusionXLImg2ImgPipeline(
53
+ DiffusionPipeline, PaddleInferDiffusionXLPipelineMixin, IPAdapterMixin
54
+ ):
55
+ """
56
+ Pipeline for text-to-image generation using Stable Diffusion XL.
57
+
58
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
59
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
60
+
61
+ In addition the pipeline inherits the following loading methods:
62
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
63
+ - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
64
+ - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
65
+
66
+ as well as the following saving methods:
67
+ - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
68
+
69
+ Args:
70
+ vae ([`AutoencoderKL`]):
71
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
72
+ text_encoder ([`CLIPTextModel`]):
73
+ Frozen text-encoder. Stable Diffusion XL uses the text portion of
74
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
75
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
76
+ text_encoder_2 ([` CLIPTextModelWithProjection`]):
77
+ Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
78
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
79
+ specifically the
80
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
81
+ variant.
82
+ tokenizer (`CLIPTokenizer`):
83
+ Tokenizer of class
84
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
85
+ tokenizer_2 (`CLIPTokenizer`):
86
+ Second Tokenizer of class
87
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
88
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
89
+ scheduler ([`SchedulerMixin`]):
90
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
91
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
92
+ """
93
+
94
+ _optional_components = ["image_encoder", "tokenizer", "text_encoder"]
95
+
96
def __init__(
    self,
    vae_encoder: PaddleInferRuntimeModel,
    vae_decoder: PaddleInferRuntimeModel,
    text_encoder: PaddleInferRuntimeModel,
    text_encoder_2: PaddleInferRuntimeModel,
    tokenizer: CLIPTokenizer,
    tokenizer_2: CLIPTokenizer,
    unet: PaddleInferRuntimeModel,
    image_encoder: PaddleInferRuntimeModel,
    feature_extractor: CLIPImageProcessor,
    scheduler: KarrasDiffusionSchedulers,
    force_zeros_for_empty_prompt: bool = True,
    requires_aesthetics_score: bool = False,
):
    """
    Register every sub-model on the pipeline and record the two configuration flags.

    Args:
        vae_encoder (`PaddleInferRuntimeModel`): Runtime model registered as `vae_encoder`.
        vae_decoder (`PaddleInferRuntimeModel`): Runtime model registered as `vae_decoder`.
        text_encoder (`PaddleInferRuntimeModel`): Runtime model registered as `text_encoder`.
        text_encoder_2 (`PaddleInferRuntimeModel`): Runtime model registered as `text_encoder_2`.
        tokenizer (`CLIPTokenizer`): Tokenizer registered as `tokenizer`.
        tokenizer_2 (`CLIPTokenizer`): Tokenizer registered as `tokenizer_2`.
        unet (`PaddleInferRuntimeModel`): Runtime model registered as `unet`.
        image_encoder (`PaddleInferRuntimeModel`): Runtime model registered as `image_encoder`.
        feature_extractor (`CLIPImageProcessor`): Image processor registered as `feature_extractor`.
        scheduler (`KarrasDiffusionSchedulers`): Scheduler registered as `scheduler`.
        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `True`):
            Stored in the pipeline config under the same name.
        requires_aesthetics_score (`bool`, *optional*, defaults to `False`):
            Stored in the pipeline config under the same name.
    """
    super().__init__()

    # Each component is handed to `register_modules` under the same keyword as its constructor parameter.
    components = dict(
        vae_encoder=vae_encoder,
        vae_decoder=vae_decoder,
        text_encoder=text_encoder,
        text_encoder_2=text_encoder_2,
        tokenizer=tokenizer,
        tokenizer_2=tokenizer_2,
        unet=unet,
        image_encoder=image_encoder,
        feature_extractor=feature_extractor,
        scheduler=scheduler,
    )
    self.register_modules(**components)

    # The two flags are registered with separate calls, one config entry each.
    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)

    # NOTE(review): 0.13025 is presumably the SDXL VAE latent scaling factor; `post_init` is defined
    # outside this file, so confirm how it consumes the value.
    self.post_init(vae_scaling_factor=0.13025)
127
+
128
def check_inputs(
    self,
    prompt,
    prompt_2,
    strength,
    num_inference_steps,
    callback_steps,
    negative_prompt=None,
    negative_prompt_2=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
):
    """
    Validate the user-facing arguments of a pipeline call.

    The checks run in a fixed order and the first violated rule raises; nothing is returned on success.

    Args:
        prompt: Primary prompt; must be a `str` or `list` when given.
        prompt_2: Secondary prompt; must be a `str` or `list` when given.
        strength: Must lie in `[0, 1]`.
        num_inference_steps: Must be a positive `int`.
        callback_steps: Must be a positive `int`.
        negative_prompt: Primary negative prompt; mutually exclusive with `negative_prompt_embeds`.
        negative_prompt_2: Secondary negative prompt; mutually exclusive with `negative_prompt_embeds`.
        prompt_embeds: Pre-computed prompt embeddings; mutually exclusive with `prompt` and `prompt_2`.
        negative_prompt_embeds: Pre-computed negative embeddings; must match the `.shape` of
            `prompt_embeds` when both are given.

    Raises:
        ValueError: If any of the rules above is violated.
    """
    # --- scalar arguments ---
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

    if num_inference_steps is None:
        raise ValueError("`num_inference_steps` cannot be None.")
    if not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
        raise ValueError(
            f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type {type(num_inference_steps)}."
        )

    # `None` fails the isinstance test, so it is rejected here together with non-ints and values <= 0.
    callback_steps_ok = isinstance(callback_steps, int) and callback_steps > 0
    if not callback_steps_ok:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}."
        )

    # --- positive prompt versus pre-computed embeddings ---
    has_prompt = prompt is not None
    has_prompt_2 = prompt_2 is not None
    has_embeds = prompt_embeds is not None

    if has_prompt and has_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two."
        )
    if has_prompt_2 and has_embeds:
        raise ValueError(
            f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two."
        )
    if not has_prompt and not has_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
    if has_prompt_2 and not isinstance(prompt_2, (str, list)):
        raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

    # --- negative prompt versus pre-computed negative embeddings ---
    has_negative_embeds = negative_prompt_embeds is not None

    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )
    if negative_prompt_2 is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    # --- the two embeddings must agree in shape when both are supplied ---
    if has_embeds and has_negative_embeds and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds` {negative_prompt_embeds.shape}."
        )
185
+
186
+ @paddle.no_grad()
187
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
188
+ def __call__(
189
+ self,
190
+ prompt: Union[str, List[str]] = None,
191
+ prompt_2: Optional[Union[str, List[str]]] = None,
192
+ image: Union[
193
+ paddle.Tensor, PIL.Image.Image, np.ndarray, List[paddle.Tensor], List[PIL.Image.Image], List[np.ndarray]
194
+ ] = None,
195
+ strength: float = 0.3,
196
+ height: Optional[int] = None,
197
+ width: Optional[int] = None,
198
+ num_inference_steps: int = 50,
199
+ denoising_start: Optional[float] = None,
200
+ denoising_end: Optional[float] = None,
201
+ guidance_scale: float = 5.0,
202
+ negative_prompt: Optional[Union[str, List[str]]] = None,
203
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
204
+ num_images_per_prompt: Optional[int] = 1,
205
+ timesteps: List[int] = None,
206
+ eta: float = 0.0,
207
+ generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
208
+ latents: Optional[paddle.Tensor] = None,
209
+ prompt_embeds: Optional[paddle.Tensor] = None,
210
+ negative_prompt_embeds: Optional[paddle.Tensor] = None,
211
+ pooled_prompt_embeds: Optional[paddle.Tensor] = None,
212
+ negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None,
213
+ ip_adapter_image: Optional[PipelineImageInput] = None,
214
+ output_type: Optional[str] = "pil",
215
+ return_dict: bool = True,
216
+ callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
217
+ callback_steps: int = 1,
218
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
219
+ guidance_rescale: float = 0.0,
220
+ original_size: Tuple[int, int] = None,
221
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
222
+ target_size: Tuple[int, int] = None,
223
+ aesthetic_score: float = 6.0,
224
+ negative_aesthetic_score: float = 2.5,
225
+ ):
226
+ """
227
+ Function invoked when calling the pipeline for generation.
228
+
229
+ Args:
230
+ prompt (`str` or `List[str]`, *optional*):
231
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
232
+ instead.
233
+ prompt_2 (`str` or `List[str]`, *optional*):
234
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
235
+ used in both text-encoders
236
+ image (`paddle.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[paddle.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
237
+ The image(s) to modify with the pipeline.
238
+ strength (`float`, *optional*, defaults to 0.3):
239
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
240
+ will be used as a starting point, adding more noise to it the larger the `strength`. The number of
241
+ denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
242
+ be maximum and the denoising process will run for the full number of iterations specified in
243
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. Note that in the case of
244
+ `denoising_start` being declared as an integer, the value of `strength` will be ignored.
245
+ height (`int`, *optional*, defaults to None):
246
+ The height in pixels of the generated image.
247
+ width (`int`, *optional*, defaults to None):
248
+ The width in pixels of the generated image.
249
+ num_inference_steps (`int`, *optional*, defaults to 50):
250
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
251
+ expense of slower inference.
252
+ denoising_start (`float`, *optional*):
253
+ When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
254
+ bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
255
+ it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
256
+ strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
257
+ is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
258
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
259
+ denoising_end (`float`, *optional*):
260
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
261
+ completed before it is intentionally prematurely terminated. As a result, the returned sample will
262
+ still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
263
+ denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
264
+ final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
265
+ forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
266
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
267
+ guidance_scale (`float`, *optional*, defaults to 7.5):
268
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
269
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
270
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
271
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
272
+ usually at the expense of lower image quality.
273
+ negative_prompt (`str` or `List[str]`, *optional*):
274
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
275
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
276
+ less than `1`).
277
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
278
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
279
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
280
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
281
+ The number of images to generate per prompt.
282
+ eta (`float`, *optional*, defaults to 0.0):
283
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
284
+ [`schedulers.DDIMScheduler`], will be ignored for others.
285
+ generator (`paddle.Generator` or `List[paddlee.Generator]`, *optional*):
286
+ One or a list of paddle generator(s).
287
+ to make generation deterministic.
288
+ latents (`paddle.Tensor`, *optional*):
289
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
290
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
291
+ tensor will ge generated by sampling using the supplied random `generator`.
292
+ prompt_embeds (`paddle.Tensor`, *optional*):
293
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
294
+ provided, text embeddings will be generated from `prompt` input argument.
295
+ negative_prompt_embeds (`paddle.Tensor`, *optional*):
296
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
297
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
298
+ argument.
299
+ pooled_prompt_embeds (`paddle.Tensor`, *optional*):
300
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
301
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
302
+ negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*):
303
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
304
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
305
+ input argument.
306
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
307
+ output_type (`str`, *optional*, defaults to `"pil"`):
308
+ The output format of the generate image. Choose between
309
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
310
+ return_dict (`bool`, *optional*, defaults to `True`):
311
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
312
+ plain tuple.
313
+ callback (`Callable`, *optional*):
314
+ A function that will be called every `callback_steps` steps during inference. The function will be
315
+ called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
316
+ callback_steps (`int`, *optional*, defaults to 1):
317
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
318
+ called at every step.
319
+ cross_attention_kwargs (`dict`, *optional*):
320
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
321
+ `self.processor` in ppdiffusers.cross_attention.
322
+ guidance_rescale (`float`, *optional*, defaults to 0.7):
323
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
324
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
325
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
326
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
327
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
328
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
329
+ `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
330
+ explained in section 2.2 of
331
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
332
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
333
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
334
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
335
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
336
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
337
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
338
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
339
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
340
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
341
+ aesthetic_score (`float`, *optional*, defaults to 6.0):
342
+ Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
343
+ Part of SDXL's micro-conditioning as explained in section 2.2 of
344
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
345
+ negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
346
+ Part of SDXL's micro-conditioning as explained in section 2.2 of
347
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
348
+ simulate an aesthetic score of the generated image by influencing the negative text condition.
349
+
350
+ Examples:
351
+
352
+ Returns:
353
+ [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
354
+ [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
355
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
356
+ """
357
+ # 1. Check inputs. Raise error if not correct
358
+ self.check_inputs(
359
+ prompt,
360
+ prompt_2,
361
+ strength,
362
+ num_inference_steps,
363
+ callback_steps,
364
+ negative_prompt,
365
+ negative_prompt_2,
366
+ prompt_embeds,
367
+ negative_prompt_embeds,
368
+ )
369
+
370
+ # 2. Define call parameters
371
+ if prompt is not None and isinstance(prompt, str):
372
+ batch_size = 1
373
+ elif prompt is not None and isinstance(prompt, list):
374
+ batch_size = len(prompt)
375
+ else:
376
+ batch_size = prompt_embeds.shape[0]
377
+
378
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
379
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
380
+ # corresponds to doing no classifier free guidance.
381
+ do_classifier_free_guidance = guidance_scale > 1.0
382
+
383
+ # 3. Encode input prompt
384
+ text_encoder_lora_scale = (
385
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
386
+ )
387
+ (
388
+ prompt_embeds,
389
+ negative_prompt_embeds,
390
+ pooled_prompt_embeds,
391
+ negative_pooled_prompt_embeds,
392
+ ) = self.encode_prompt(
393
+ prompt=prompt,
394
+ prompt_2=prompt_2,
395
+ num_images_per_prompt=num_images_per_prompt,
396
+ do_classifier_free_guidance=do_classifier_free_guidance,
397
+ negative_prompt=negative_prompt,
398
+ negative_prompt_2=negative_prompt_2,
399
+ prompt_embeds=prompt_embeds,
400
+ negative_prompt_embeds=negative_prompt_embeds,
401
+ pooled_prompt_embeds=pooled_prompt_embeds,
402
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
403
+ lora_scale=text_encoder_lora_scale,
404
+ )
405
+
406
+ if ip_adapter_image is not None:
407
+ image_embeds, negative_image_embeds = self.encode_image(
408
+ ip_adapter_image,
409
+ num_images_per_prompt,
410
+ )
411
+ if do_classifier_free_guidance:
412
+ image_embeds = paddle.concat([negative_image_embeds, image_embeds])
413
+
414
+ # 4. Preprocess image
415
+ image = self.image_processor.preprocess(image, height=height, width=width)
416
+ height, width = image.shape[-2:]
417
+
418
+ # 5. Prepare timesteps
419
+
420
def denoising_value_valid(dnv):
    """Return True when `dnv` is a float strictly between 0 and 1.

    Used to decide whether a `denoising_start` / `denoising_end` value should
    be interpreted as a fraction of the denoising schedule.

    Args:
        dnv: The candidate value to validate (may be `None` or a non-float).

    Returns:
        bool: `True` only if `dnv` is a `float` and `0 < dnv < 1`.
    """
    # Check the argument itself rather than a closed-over variable, and use
    # isinstance so the range comparison is never attempted on a non-float
    # such as None.
    return isinstance(dnv, float) and 0 < dnv < 1
422
+
423
+ self.scheduler.set_timesteps(num_inference_steps)
424
+ timesteps, num_inference_steps = self.get_timesteps(
425
+ num_inference_steps, strength, denoising_start=denoising_start if denoising_value_valid else None
426
+ )
427
+ latent_timestep = timesteps[:1].tile(repeat_times=[batch_size * num_images_per_prompt])
428
+ # add_noise = True if denoising_start is None else False
429
+
430
+ # 6. Prepare latent variables
431
+ # latents = self.prepare_latents(
432
+ # image=image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, generator, add_noise
433
+ # )
434
+ is_strength_max = strength == 1.0
435
+ latents = self.prepare_latents(
436
+ batch_size * num_images_per_prompt,
437
+ height,
438
+ width,
439
+ generator,
440
+ latents,
441
+ image=image,
442
+ timestep=latent_timestep,
443
+ is_strength_max=is_strength_max,
444
+ )
445
+ # 7. Prepare extra step kwargs.
446
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
447
+ height, width = latents.shape[-2:]
448
+ height = height * self.vae_scale_factor
449
+ width = width * self.vae_scale_factor
450
+ original_size = original_size or (height, width)
451
+ target_size = target_size or (height, width)
452
+
453
+ # 8. Prepare added time ids & embeddings
454
+ add_text_embeds = pooled_prompt_embeds
455
+ add_time_ids, add_neg_time_ids = self._get_add_time_ids_2(
456
+ original_size,
457
+ crops_coords_top_left,
458
+ target_size,
459
+ aesthetic_score,
460
+ negative_aesthetic_score,
461
+ dtype=prompt_embeds.dtype,
462
+ )
463
+ add_time_ids = add_time_ids.tile(repeat_times=[batch_size * num_images_per_prompt, 1])
464
+ if do_classifier_free_guidance:
465
+ prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds], axis=0)
466
+ add_text_embeds = paddle.concat(x=[negative_pooled_prompt_embeds, add_text_embeds], axis=0)
467
+ add_neg_time_ids = add_neg_time_ids.tile(repeat_times=[batch_size * num_images_per_prompt, 1])
468
+ add_time_ids = paddle.concat(x=[add_neg_time_ids, add_time_ids], axis=0)
469
+
470
+ # 9. Denoising loop
471
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
472
+
473
+ # 9.1 Apply denoising_end
474
+ if (
475
+ denoising_end is not None
476
+ and denoising_start is not None
477
+ and denoising_value_valid(denoising_end)
478
+ and denoising_value_valid(denoising_start)
479
+ and denoising_start >= denoising_end
480
+ ):
481
+ raise ValueError(
482
+ f"`denoising_start`: {denoising_start} cannot be larger than or equal to `denoising_end`: "
483
+ + f" {denoising_end} when using type float."
484
+ )
485
+ elif denoising_end is not None and denoising_value_valid(denoising_end):
486
+ discrete_timestep_cutoff = int(
487
+ round(
488
+ self.scheduler.config.num_train_timesteps
489
+ - denoising_end * self.scheduler.config.num_train_timesteps
490
+ )
491
+ )
492
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
493
+ timesteps = timesteps[:num_inference_steps]
494
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
495
+ for i, t in enumerate(timesteps):
496
+ # expand the latents if we are doing classifier free guidance
497
+ latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents
498
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
499
+
500
+ unet_inputs = dict(
501
+ sample=latent_model_input,
502
+ timestep=t,
503
+ encoder_hidden_states=prompt_embeds,
504
+ text_embeds=add_text_embeds,
505
+ time_ids=add_time_ids,
506
+ output_shape=latent_model_input.shape,
507
+ )
508
+ # Add image embeds for IP-Adapter
509
+ # added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
510
+ if ip_adapter_image:
511
+ unet_inputs["image_embeds"] = image_embeds
512
+
513
+ # predict the noise residual
514
+ noise_pred = self.unet(**unet_inputs)[0]
515
+
516
+ # perform guidance
517
+ if do_classifier_free_guidance:
518
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
519
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
520
+
521
+ if guidance_rescale > 0.0:
522
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
523
+ noise_pred = self.rescale_noise_cfg(
524
+ noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
525
+ )
526
+
527
+ # compute the previous noisy sample x_t -> x_t-1
528
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
529
+
530
+ # call the callback, if provided
531
+ if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
532
+ progress_bar.update()
533
+ if callback is not None and i % callback_steps == 0:
534
+ callback(i, t, latents)
535
+
536
+ if not output_type == "latent":
537
+ image = self._decode_vae_latents(
538
+ latents / self.vae_scaling_factor,
539
+ )
540
+ else:
541
+ image = latents
542
+ return StableDiffusionXLPipelineOutput(images=image)
543
+
544
+ image = self.image_processor.postprocess(image, output_type=output_type)
545
+ if not return_dict:
546
+ return (image,)
547
+
548
+ return StableDiffusionXLPipelineOutput(images=image)