junnyu committed
Commit a4a6c30 · 1 Parent(s): 212ca9f

Update pipeline.py

Files changed (1)
  1. pipeline.py +277 -150
pipeline.py CHANGED
@@ -15,10 +15,9 @@
 #
 # modified from https://github.com/AUTOMATIC1111/stable-diffusion-webui
 # Here is the AGPL-3.0 license https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/master/LICENSE.txt
-from ppdiffusers.utils import check_min_version
-check_min_version("0.14.1")
 
 import inspect
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import paddle
@@ -39,9 +38,102 @@ from ppdiffusers.utils import (
     logging,
     randn_tensor,
     safetensors_load,
+    smart_load,
     torch_load,
 )
 
+
+@paddle.no_grad()
+def load_lora(
+    pipeline,
+    state_dict: dict,
+    LORA_PREFIX_UNET: str = "lora_unet",
+    LORA_PREFIX_TEXT_ENCODER: str = "lora_te",
+    ratio: float = 1.0,
+):
+    ratio = float(ratio)
+    visited = []
+    for key in state_dict:
+        if ".alpha" in key or ".lora_up" in key or key in visited:
+            continue
+
+        if "text" in key:
+            tmp_layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
+            hf_to_ppnlp = {
+                "encoder": "transformer",
+                "fc1": "linear1",
+                "fc2": "linear2",
+            }
+            layer_infos = []
+            for layer_info in tmp_layer_infos:
+                if layer_info == "mlp":
+                    continue
+                layer_infos.append(hf_to_ppnlp.get(layer_info, layer_info))
+            curr_layer: paddle.nn.Linear = pipeline.text_encoder
+        else:
+            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
+            curr_layer: paddle.nn.Linear = pipeline.unet
+
+        temp_name = layer_infos.pop(0)
+        while len(layer_infos) > -1:
+            try:
+                if temp_name == "to":
+                    raise ValueError()
+                curr_layer = curr_layer.__getattr__(temp_name)
+                if len(layer_infos) > 0:
+                    temp_name = layer_infos.pop(0)
+                elif len(layer_infos) == 0:
+                    break
+            except Exception:
+                if len(temp_name) > 0:
+                    temp_name += "_" + layer_infos.pop(0)
+                else:
+                    temp_name = layer_infos.pop(0)
+
+        triplet_keys = [key, key.replace("lora_down", "lora_up"), key.replace("lora_down.weight", "alpha")]
+        dtype: paddle.dtype = curr_layer.weight.dtype
+        weight_down: paddle.Tensor = state_dict[triplet_keys[0]].cast(dtype)
+        weight_up: paddle.Tensor = state_dict[triplet_keys[1]].cast(dtype)
+        rank: float = float(weight_down.shape[0])
+        if triplet_keys[2] in state_dict:
+            alpha: float = state_dict[triplet_keys[2]].cast(dtype).item()
+            scale: float = alpha / rank
+        else:
+            scale = 1.0
+
+        if not hasattr(curr_layer, "backup_weights"):
+            curr_layer.backup_weights = curr_layer.weight.clone()
+
+        if len(weight_down.shape) == 4:
+            if weight_down.shape[2:4] == [1, 1]:
+                # conv2d 1x1
+                curr_layer.weight.copy_(
+                    curr_layer.weight
+                    + ratio
+                    * paddle.matmul(weight_up.squeeze([-1, -2]), weight_down.squeeze([-1, -2])).unsqueeze([-1, -2])
+                    * scale,
+                    True,
+                )
+            else:
+                # conv2d 3x3
+                curr_layer.weight.copy_(
+                    curr_layer.weight
+                    + ratio
+                    * paddle.nn.functional.conv2d(weight_down.transpose([1, 0, 2, 3]), weight_up).transpose(
+                        [1, 0, 2, 3]
+                    )
+                    * scale,
+                    True,
+                )
+        else:
+            # linear
+            curr_layer.weight.copy_(curr_layer.weight + ratio * paddle.matmul(weight_up, weight_down).T * scale, True)
+
+        # update visited list
+        visited.extend(triplet_keys)
+    return pipeline
+
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -138,6 +230,7 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
             "kdpm2-ancestral",
             "kdpm2",
         ]
+        self.weights_has_changed = False
 
     def add_ti_embedding_dir(self, embeddings_dir):
         self.sj.embedding_db.add_embedding_dir(embeddings_dir)
@@ -147,6 +240,9 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
         self.sj.embedding_db.clear_embedding_dirs()
         self.sj.embedding_db.load_textual_inversion_embeddings(True)
 
+    def change_scheduler(self, scheduler_type="ddim"):
+        self.switch_scheduler(scheduler_type)
+
     def switch_scheduler(self, scheduler_type="ddim"):
         scheduler_type = scheduler_type.lower()
         from ppdiffusers import (
@@ -409,8 +505,9 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
         callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
         callback_steps: Optional[int] = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        clip_skip: int = 0,
+        clip_skip: int = 1,
         controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        lora_dir: str = "./loras",
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -468,12 +565,14 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
                 A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
                 `self.processor` in
                 [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
-            clip_skip (`int`, *optional*, defaults to 0):
-                CLIP_stop_at_last_layers, if clip_skip < 1, we will use the last_hidden_state from text_encoder.
+            clip_skip (`int`, *optional*, defaults to 1):
+                CLIP_stop_at_last_layers, if clip_skip <= 1, we will use the last_hidden_state from text_encoder.
             controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
                 The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
                 to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
                 corresponding scale as a list.
+            lora_dir (`str`, *optional*):
+                Path to lora which we want to load.
         Examples:
 
         Returns:
@@ -483,172 +582,200 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        # 0. Default height and width to unet
-        height, width = self._default_height_width(height, width, image)
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt,
-            image,
-            height,
-            width,
-            callback_steps,
-            negative_prompt,
-            controlnet_conditioning_scale,
-        )
-
-        batch_size = 1
-
-        image = self.prepare_image(
-            image=image,
-            width=width,
-            height=height,
-            dtype=self.controlnet.dtype,
-        )
+        try:
+            # 0. Default height and width to unet
+            height, width = self._default_height_width(height, width, image)
+
+            # 1. Check inputs. Raise error if not correct
+            self.check_inputs(
+                prompt,
+                image,
+                height,
+                width,
+                callback_steps,
+                negative_prompt,
+                controlnet_conditioning_scale,
+            )
 
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
+            batch_size = 1
 
-        prompts, extra_network_data = parse_prompts([prompt])
+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                dtype=self.controlnet.dtype,
+            )
 
-        self.sj.clip.CLIP_stop_at_last_layers = clip_skip
-        # 3. Encode input prompt
-        prompt_embeds, negative_prompt_embeds = self._encode_prompt(
-            prompts,
-            do_classifier_free_guidance,
-            negative_prompt,
-            num_inference_steps=num_inference_steps,
-        )
+            # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+            # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+            # corresponds to doing no classifier free guidance.
+            do_classifier_free_guidance = guidance_scale > 1.0
+
+            prompts, extra_network_data = parse_prompts([prompt])
+
+            if lora_dir is not None and os.path.exists(lora_dir):
+                lora_mapping = {p.stem: p.absolute() for p in Path(lora_dir).glob("*.safetensors")}
+                for params in extra_network_data["lora"]:
+                    assert len(params.items) > 0
+                    name = params.items[0]
+                    if name in lora_mapping:
+                        ratio = float(params.items[1]) if len(params.items) > 1 else 1.0
+                        lora_state_dict = smart_load(lora_mapping[name], map_location=paddle.get_device())
+                        self.weights_has_changed = True
+                        load_lora(self, state_dict=lora_state_dict, ratio=ratio)
+                        del lora_state_dict
+                    else:
+                        print(f"We can't find lora weight: {name}! Please make sure that exists!")
+
+            self.sj.clip.CLIP_stop_at_last_layers = clip_skip
+            # 3. Encode input prompt
+            prompt_embeds, negative_prompt_embeds = self._encode_prompt(
+                prompts,
+                do_classifier_free_guidance,
+                negative_prompt,
+                num_inference_steps=num_inference_steps,
+            )
 
-        # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-        timesteps = self.scheduler.timesteps
-
-        # 5. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
-        latents = self.prepare_latents(
-            batch_size,
-            num_channels_latents,
-            height,
-            width,
-            self.unet.dtype,
-            generator,
-            latents,
-        )
+            # 4. Prepare timesteps
+            self.scheduler.set_timesteps(num_inference_steps)
+            timesteps = self.scheduler.timesteps
+
+            # 5. Prepare latent variables
+            num_channels_latents = self.unet.in_channels
+            latents = self.prepare_latents(
+                batch_size,
+                num_channels_latents,
+                height,
+                width,
+                self.unet.dtype,
+                generator,
+                latents,
+            )
 
-        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+            # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+            # 7. Denoising loop
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+            with self.progress_bar(total=num_inference_steps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    step = i // self.scheduler.order
+                    do_batch = False
+                    conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step)
+                    try:
+                        weight = conds_list[0][0][1]
+                    except Exception:
+                        weight = 1.0
+                    if do_classifier_free_guidance:
+                        uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step)
+                        do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1]
 
-        # 7. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                step = i // self.scheduler.order
-                do_batch = False
-                conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step)
-                try:
-                    weight = conds_list[0][0][1]
-                except Exception:
-                    weight = 1.0
-                if do_classifier_free_guidance:
-                    uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step)
-                    do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1]
-
-                # expand the latents if we are doing classifier free guidance
-                latent_model_input = paddle.concat([latents] * 2) if do_batch else latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-                if do_batch:
-                    encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor])
-                    down_block_res_samples, mid_block_res_sample = self.controlnet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=encoder_hidden_states,
-                        controlnet_cond=paddle.concat([image, image]),
-                        conditioning_scale=controlnet_conditioning_scale,
-                        return_dict=False,
-                    )
-                    noise_pred = self.unet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=encoder_hidden_states,
-                        cross_attention_kwargs=cross_attention_kwargs,
-                        down_block_additional_residuals=down_block_res_samples,
-                        mid_block_additional_residual=mid_block_res_sample,
-                    ).sample
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred_text - noise_pred_uncond)
-                else:
-                    down_block_res_samples, mid_block_res_sample = self.controlnet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=cond_tensor,
-                        controlnet_cond=image,
-                        conditioning_scale=controlnet_conditioning_scale,
-                        return_dict=False,
-                    )
-                    noise_pred = self.unet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=cond_tensor,
-                        cross_attention_kwargs=cross_attention_kwargs,
-                        down_block_additional_residuals=down_block_res_samples,
-                        mid_block_additional_residual=mid_block_res_sample,
-                    ).sample
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = paddle.concat([latents] * 2) if do_batch else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
-                    if do_classifier_free_guidance:
+                    if do_batch:
+                        encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor])
+                        down_block_res_samples, mid_block_res_sample = self.controlnet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=encoder_hidden_states,
+                            controlnet_cond=paddle.concat([image, image]),
+                            conditioning_scale=controlnet_conditioning_scale,
+                            return_dict=False,
+                        )
+                        noise_pred = self.unet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=encoder_hidden_states,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            down_block_additional_residuals=down_block_res_samples,
+                            mid_block_additional_residual=mid_block_res_sample,
+                        ).sample
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + weight * guidance_scale * (
+                            noise_pred_text - noise_pred_uncond
+                        )
+                    else:
                         down_block_res_samples, mid_block_res_sample = self.controlnet(
                             latent_model_input,
                             t,
-                            encoder_hidden_states=uncond_tensor,
+                            encoder_hidden_states=cond_tensor,
                             controlnet_cond=image,
                             conditioning_scale=controlnet_conditioning_scale,
                             return_dict=False,
                         )
-                        noise_pred_uncond = self.unet(
+                        noise_pred = self.unet(
                             latent_model_input,
                             t,
-                            encoder_hidden_states=uncond_tensor,
+                            encoder_hidden_states=cond_tensor,
                             cross_attention_kwargs=cross_attention_kwargs,
                             down_block_additional_residuals=down_block_res_samples,
                             mid_block_additional_residual=mid_block_res_sample,
                         ).sample
-                        noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond)
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
 
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if callback is not None and i % callback_steps == 0:
-                        callback(i, t, latents)
-
-        if output_type == "latent":
-            image = latents
-            has_nsfw_concept = None
-        elif output_type == "pil":
-            # 8. Post-processing
-            image = self.decode_latents(latents)
-
-            # 9. Run safety checker
-            image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
-
-            # 10. Convert to PIL
-            image = self.numpy_to_pil(image)
-        else:
-            # 8. Post-processing
-            image = self.decode_latents(latents)
-
-            # 9. Run safety checker
-            image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
-
-        if not return_dict:
-            return (image, has_nsfw_concept)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+                        if do_classifier_free_guidance:
+                            down_block_res_samples, mid_block_res_sample = self.controlnet(
+                                latent_model_input,
+                                t,
+                                encoder_hidden_states=uncond_tensor,
+                                controlnet_cond=image,
+                                conditioning_scale=controlnet_conditioning_scale,
+                                return_dict=False,
+                            )
+                            noise_pred_uncond = self.unet(
+                                latent_model_input,
+                                t,
+                                encoder_hidden_states=uncond_tensor,
+                                cross_attention_kwargs=cross_attention_kwargs,
+                                down_block_additional_residuals=down_block_res_samples,
+                                mid_block_additional_residual=mid_block_res_sample,
+                            ).sample
+                            noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        if callback is not None and i % callback_steps == 0:
+                            callback(i, t, latents)
+
+            if output_type == "latent":
+                image = latents
+                has_nsfw_concept = None
+            elif output_type == "pil":
+                # 8. Post-processing
+                image = self.decode_latents(latents)
+
+                # 9. Run safety checker
+                image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
+
+                # 10. Convert to PIL
+                image = self.numpy_to_pil(image)
+            else:
+                # 8. Post-processing
+                image = self.decode_latents(latents)
+
+                # 9. Run safety checker
+                image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
+
+            if not return_dict:
+                return (image, has_nsfw_concept)
+
+            return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+        except Exception as e:
+            raise ValueError(e)
+        finally:
+            if self.weights_has_changed:
+                for sub_layer in self.text_encoder.sublayers(include_self=True):
+                    if hasattr(sub_layer, "backup_weights"):
+                        sub_layer.weight.copy_(sub_layer.backup_weights, True)
+                for sub_layer in self.unet.sublayers(include_self=True):
+                    if hasattr(sub_layer, "backup_weights"):
+                        sub_layer.weight.copy_(sub_layer.backup_weights, True)
+                self.weights_has_changed = False
 
 
  # clip.py
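
Below is a minimal usage sketch of what this commit enables: the new `clip_skip` and `lora_dir` arguments, webui-style `<lora:name:ratio>` prompt tokens resolved against `*.safetensors` files found in `lora_dir`, and the new `change_scheduler` alias. It is illustrative only and not part of the diff: `pipe` is assumed to be an already-constructed `WebUIStableDiffusionControlNetPipeline`, `control_image` a prepared ControlNet conditioning image, and `./loras/my_style.safetensors` a hypothetical LoRA file referenced by its stem.

# Illustrative sketch only -- `pipe`, `control_image`, and the LoRA file are assumptions, not part of the commit.
pipe.change_scheduler("kdpm2")  # alias for switch_scheduler introduced in this commit

result = pipe(
    prompt="masterpiece, a photo of a castle <lora:my_style:0.8>",  # webui-style LoRA token; name and ratio are hypothetical
    image=control_image,
    num_inference_steps=30,
    guidance_scale=7.5,
    clip_skip=2,  # CLIP_stop_at_last_layers; values <= 1 keep the text encoder's last hidden state
    controlnet_conditioning_scale=1.0,
    lora_dir="./loras",  # *.safetensors files in this directory can be referenced by stem in the prompt
)
result.images[0].save("output.png")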