Nymbo committed on
Commit
a06d504
1 Parent(s): 13de8f8

Delete custom_pipeline.py

Files changed (1)
  1. custom_pipeline.py +0 -930
custom_pipeline.py DELETED
@@ -1,930 +0,0 @@
# Copyright 2024 Harutatsu Akiyama and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import PIL.Image
import torch
from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer

from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
from diffusers.loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.attention_processor import (
    AttnProcessor2_0,
    FusedAttnProcessor2_0,
    LoRAAttnProcessor2_0,
    LoRAXFormersAttnProcessor,
    XFormersAttnProcessor,
)
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
    USE_PEFT_BACKEND,
    deprecate,
    is_invisible_watermark_available,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
    scale_lora_layers,
)
from diffusers.utils.torch_utils import randn_tensor
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput

if is_invisible_watermark_available():
    from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import StableDiffusionXLInstructPix2PixPipeline
        >>> from diffusers.utils import load_image

        >>> resolution = 768
        >>> image = load_image(
        ...     "https://hf.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
        ... ).resize((resolution, resolution))
        >>> edit_instruction = "Turn sky into a cloudy one"

        >>> pipe = StableDiffusionXLInstructPix2PixPipeline.from_pretrained(
        ...     "diffusers/sdxl-instructpix2pix-768", torch_dtype=torch.float16
        ... ).to("cuda")

        >>> edited_image = pipe(
        ...     prompt=edit_instruction,
        ...     image=image,
        ...     height=resolution,
        ...     width=resolution,
        ...     guidance_scale=3.0,
        ...     image_guidance_scale=1.5,
        ...     num_inference_steps=30,
        ... ).images[0]
        >>> edited_image
        ```
"""

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
    """
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    # rescale the results from guidance (fixes overexposure)
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
    return noise_cfg


class CosStableDiffusionXLInstructPix2PixPipeline(
    DiffusionPipeline,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    FromSingleFileMixin,
    StableDiffusionXLLoraLoaderMixin,
):
    r"""
    Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion XL.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder. Stable Diffusion XL uses the text portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        text_encoder_2 ([`CLIPTextModelWithProjection`]):
            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
            specifically the
            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
            variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        tokenizer_2 (`CLIPTokenizer`):
            Second Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`):
            Whether the `unet` requires an `aesthetic_score` condition to be passed during inference. Also see the
            config of `stabilityai/stable-diffusion-xl-refiner-1-0`.
        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
            `stabilityai/stable-diffusion-xl-base-1-0`.
        add_watermarker (`bool`, *optional*):
            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
            watermarker will be used.
    """

    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
    _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        text_encoder_2: CLIPTextModelWithProjection,
        tokenizer: CLIPTokenizer,
        tokenizer_2: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        force_zeros_for_empty_prompt: bool = True,
        add_watermarker: Optional[bool] = None,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            tokenizer=tokenizer,
            tokenizer_2=tokenizer_2,
            unet=unet,
            scheduler=scheduler,
        )
        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.default_sample_size = self.unet.config.sample_size

        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()

        if add_watermarker:
            self.watermark = StableDiffusionXLWatermarker()
        else:
            self.watermark = None

    def encode_prompt(
        self,
        prompt: str,
        prompt_2: Optional[str] = None,
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        do_classifier_free_guidance: bool = True,
        negative_prompt: Optional[str] = None,
        negative_prompt_2: Optional[str] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        """
        device = device or self._execution_device

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if self.text_encoder is not None:
                if not USE_PEFT_BACKEND:
                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
                else:
                    scale_lora_layers(self.text_encoder, lora_scale)

            if self.text_encoder_2 is not None:
                if not USE_PEFT_BACKEND:
                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
                else:
                    scale_lora_layers(self.text_encoder_2, lora_scale)

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        # Define tokenizers and text encoders
        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
        text_encoders = (
            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
        )

        if prompt_embeds is None:
            prompt_2 = prompt_2 or prompt
            # textual inversion: process multi-vector tokens if necessary
            prompt_embeds_list = []
            prompts = [prompt, prompt_2]
            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
                if isinstance(self, TextualInversionLoaderMixin):
                    prompt = self.maybe_convert_prompt(prompt, tokenizer)

                text_inputs = tokenizer(
                    prompt,
                    padding="max_length",
                    max_length=tokenizer.model_max_length,
                    truncation=True,
                    return_tensors="pt",
                )

                text_input_ids = text_inputs.input_ids
                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                    text_input_ids, untruncated_ids
                ):
                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                    logger.warning(
                        "The following part of your input was truncated because CLIP can only handle sequences up to"
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )

                prompt_embeds = text_encoder(
                    text_input_ids.to(device),
                    output_hidden_states=True,
                )

                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
                prompt_embeds = prompt_embeds.hidden_states[-2]

                prompt_embeds_list.append(prompt_embeds)

            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)

        # get unconditional embeddings for classifier free guidance
        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
        elif do_classifier_free_guidance and negative_prompt_embeds is None:
            negative_prompt = negative_prompt or ""
            negative_prompt_2 = negative_prompt_2 or negative_prompt

            uncond_tokens: List[str]
            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt, negative_prompt_2]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = [negative_prompt, negative_prompt_2]

            negative_prompt_embeds_list = []
            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
                if isinstance(self, TextualInversionLoaderMixin):
                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)

                max_length = prompt_embeds.shape[1]
                uncond_input = tokenizer(
                    negative_prompt,
                    padding="max_length",
                    max_length=max_length,
                    truncation=True,
                    return_tensors="pt",
                )

                negative_prompt_embeds = text_encoder(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                negative_prompt_embeds_list.append(negative_prompt_embeds)

            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)

        prompt_embeds_dtype = self.text_encoder_2.dtype if self.text_encoder_2 is not None else self.unet.dtype
        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]
            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )
        if do_classifier_free_guidance:
            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
                bs_embed * num_images_per_prompt, -1
            )

        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.check_inputs
    def check_inputs(
        self,
        prompt,
        callback_steps,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        callback_on_step_end_tensor_inputs=None,
    ):
        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def prepare_image_latents(
        self, image, batch_size, num_images_per_prompt, dtype, device, do_classifier_free_guidance, generator=None
    ):
        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
            raise ValueError(
                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
            )

        image = image.to(device=device, dtype=dtype)

        batch_size = batch_size * num_images_per_prompt

        if image.shape[1] == 4:
            image_latents = image
        else:
            # make sure the VAE is in float32 mode, as it overflows in float16
            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
            if needs_upcasting:
                self.upcast_vae()
                image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

            image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax")

            # cast back to fp16 if needed
            if needs_upcasting:
                self.vae.to(dtype=torch.float16)

        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
            # expand image_latents for batch_size
            deprecation_message = (
                f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
                " your script to pass as many initial images as text prompts to suppress this warning."
            )
            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
            additional_image_per_prompt = batch_size // image_latents.shape[0]
            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
            )
        else:
            image_latents = torch.cat([image_latents], dim=0)

        if do_classifier_free_guidance:
            uncond_image_latents = torch.zeros_like(image_latents)
            image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0)

        if image_latents.dtype != self.vae.dtype:
            image_latents = image_latents.to(dtype=self.vae.dtype)

        return image_latents

    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
    def _get_add_time_ids(
        self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
    ):
        add_time_ids = list(original_size + crops_coords_top_left + target_size)

        passed_add_embed_dim = (
            self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
        )
        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

        if expected_add_embed_dim != passed_add_embed_dim:
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
            )

        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
        return add_time_ids

    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
    def upcast_vae(self):
        dtype = self.vae.dtype
        self.vae.to(dtype=torch.float32)
        use_torch_2_0_or_xformers = isinstance(
            self.vae.decoder.mid_block.attentions[0].processor,
            (
                AttnProcessor2_0,
                XFormersAttnProcessor,
                LoRAXFormersAttnProcessor,
                LoRAAttnProcessor2_0,
                FusedAttnProcessor2_0,
            ),
        )
        # if xformers or torch_2_0 is used attention block does not need
        # to be in float32 which can save lots of memory
        if use_torch_2_0_or_xformers:
            self.vae.post_quant_conv.to(dtype)
            self.vae.decoder.conv_in.to(dtype)
            self.vae.decoder.mid_block.to(dtype)

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        prompt_2: Optional[Union[str, List[str]]] = None,
        image: PipelineImageInput = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 100,
        denoising_end: Optional[float] = None,
        guidance_scale: float = 5.0,
        image_guidance_scale: float = 1.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        negative_prompt_2: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
        original_size: Tuple[int, int] = None,
        crops_coords_top_left: Tuple[int, int] = (0, 0),
        target_size: Tuple[int, int] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
            image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
                The image(s) to modify with the pipeline.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            denoising_end (`float`, *optional*):
                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
                completed before it is intentionally prematurely terminated. As a result, the returned sample will
                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
            guidance_scale (`float`, *optional*, defaults to 5.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            image_guidance_scale (`float`, *optional*, defaults to 1.5):
                Image guidance scale is to push the generated image towards the initial image `image`. Image guidance
                scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to
                generate images that are closely linked to the source image `image`, usually at the expense of lower
                image quality. This pipeline requires a value of at least `1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
                Guidance rescale factor should fix overexposure when using zero terminal SNR.
            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
                explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                For most cases, `target_size` should be set to the desired height and width of the generated image. If
                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            aesthetic_score (`float`, *optional*, defaults to 6.0):
                Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
                Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
                Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
                simulate an aesthetic score of the generated image by influencing the negative text condition.

        Examples:

        Returns:
            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        """
        # 0. Default height and width to unet
        height = height or self.default_sample_size * self.vae_scale_factor
        width = width or self.default_sample_size * self.vae_scale_factor

        original_size = original_size or (height, width)
        target_size = target_size or (height, width)

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)

        if image is None:
            raise ValueError("`image` input cannot be undefined.")

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0 and image_guidance_scale >= 1.0

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
        )
        (
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        ) = self.encode_prompt(
            prompt=prompt,
            prompt_2=prompt_2,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            negative_prompt=negative_prompt,
            negative_prompt_2=negative_prompt_2,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
        )

        # 4. Preprocess image
        image = self.image_processor.preprocess(image, height=height, width=width).to(device)

        # 5. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 6. Prepare Image latents
        image_latents = self.prepare_image_latents(
            image,
            batch_size,
            num_images_per_prompt,
            prompt_embeds.dtype,
            device,
            do_classifier_free_guidance,
        )

        # 7. Prepare latent variables
        num_channels_latents = self.vae.config.latent_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 8. Check that shapes of latents and image match the UNet channels
        num_channels_image = image_latents.shape[1]
        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
            raise ValueError(
                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                f" `num_channels_image`: {num_channels_image} "
                f" = {num_channels_latents + num_channels_image}. Please verify the config of"
                " `pipeline.unet` or your `image` input."
            )

        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 10. Prepare added time ids & embeddings
        add_text_embeds = pooled_prompt_embeds
        if self.text_encoder_2 is None:
            text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
        else:
            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim

        add_time_ids = self._get_add_time_ids(
            original_size,
            crops_coords_top_left,
            target_size,
            dtype=prompt_embeds.dtype,
            text_encoder_projection_dim=text_encoder_projection_dim,
        )
        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

        if do_classifier_free_guidance:
            # The extra concat similar to how it's done in SD InstructPix2Pix.
            prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds], dim=0)
            add_text_embeds = torch.cat(
                [add_text_embeds, negative_pooled_prompt_embeds, negative_pooled_prompt_embeds], dim=0
            )
            add_time_ids = torch.cat([add_time_ids, add_time_ids, add_time_ids], dim=0)

        prompt_embeds = prompt_embeds.to(device)
        add_text_embeds = add_text_embeds.to(device)

        # 11. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 3) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
                noise_pred = self.unet(
                    torch.cat([latent_model_input, image_latents], dim=1),
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_text, noise_pred_image, noise_pred_uncond = noise_pred.chunk(3)
                    noise_pred = (
                        noise_pred_uncond
                        + guidance_scale * (noise_pred_text - noise_pred_uncond)
                        + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
                    )

                if do_classifier_free_guidance and guidance_rescale > 0.0:
                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        else:
            return StableDiffusionXLPipelineOutput(images=latents)

        # apply watermark if available
        if self.watermark is not None:
            image = self.watermark.apply_watermark(image)

        image = self.image_processor.postprocess(image, output_type=output_type)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return StableDiffusionXLPipelineOutput(images=image)