Update pipeline.py
pipeline.py CHANGED (+277 -150)
@@ -15,10 +15,9 @@
 #
 # modified from https://github.com/AUTOMATIC1111/stable-diffusion-webui
 # Here is the AGPL-3.0 license https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/master/LICENSE.txt
-from ppdiffusers.utils import check_min_version
-check_min_version("0.14.1")

 import inspect
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union

 import paddle
@@ -39,9 +38,102 @@ from ppdiffusers.utils import (
     logging,
     randn_tensor,
     safetensors_load,
+    smart_load,
     torch_load,
 )

+
+@paddle.no_grad()
+def load_lora(
+    pipeline,
+    state_dict: dict,
+    LORA_PREFIX_UNET: str = "lora_unet",
+    LORA_PREFIX_TEXT_ENCODER: str = "lora_te",
+    ratio: float = 1.0,
+):
+    ratio = float(ratio)
+    visited = []
+    for key in state_dict:
+        if ".alpha" in key or ".lora_up" in key or key in visited:
+            continue
+
+        if "text" in key:
+            tmp_layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
+            hf_to_ppnlp = {
+                "encoder": "transformer",
+                "fc1": "linear1",
+                "fc2": "linear2",
+            }
+            layer_infos = []
+            for layer_info in tmp_layer_infos:
+                if layer_info == "mlp":
+                    continue
+                layer_infos.append(hf_to_ppnlp.get(layer_info, layer_info))
+            curr_layer: paddle.nn.Linear = pipeline.text_encoder
+        else:
+            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
+            curr_layer: paddle.nn.Linear = pipeline.unet
+
+        temp_name = layer_infos.pop(0)
+        while len(layer_infos) > -1:
+            try:
+                if temp_name == "to":
+                    raise ValueError()
+                curr_layer = curr_layer.__getattr__(temp_name)
+                if len(layer_infos) > 0:
+                    temp_name = layer_infos.pop(0)
+                elif len(layer_infos) == 0:
+                    break
+            except Exception:
+                if len(temp_name) > 0:
+                    temp_name += "_" + layer_infos.pop(0)
+                else:
+                    temp_name = layer_infos.pop(0)
+
+        triplet_keys = [key, key.replace("lora_down", "lora_up"), key.replace("lora_down.weight", "alpha")]
+        dtype: paddle.dtype = curr_layer.weight.dtype
+        weight_down: paddle.Tensor = state_dict[triplet_keys[0]].cast(dtype)
+        weight_up: paddle.Tensor = state_dict[triplet_keys[1]].cast(dtype)
+        rank: float = float(weight_down.shape[0])
+        if triplet_keys[2] in state_dict:
+            alpha: float = state_dict[triplet_keys[2]].cast(dtype).item()
+            scale: float = alpha / rank
+        else:
+            scale = 1.0
+
+        if not hasattr(curr_layer, "backup_weights"):
+            curr_layer.backup_weights = curr_layer.weight.clone()
+
+        if len(weight_down.shape) == 4:
+            if weight_down.shape[2:4] == [1, 1]:
+                # conv2d 1x1
+                curr_layer.weight.copy_(
+                    curr_layer.weight
+                    + ratio
+                    * paddle.matmul(weight_up.squeeze([-1, -2]), weight_down.squeeze([-1, -2])).unsqueeze([-1, -2])
+                    * scale,
+                    True,
+                )
+            else:
+                # conv2d 3x3
+                curr_layer.weight.copy_(
+                    curr_layer.weight
+                    + ratio
+                    * paddle.nn.functional.conv2d(weight_down.transpose([1, 0, 2, 3]), weight_up).transpose(
+                        [1, 0, 2, 3]
+                    )
+                    * scale,
+                    True,
+                )
+        else:
+            # linear
+            curr_layer.weight.copy_(curr_layer.weight + ratio * paddle.matmul(weight_up, weight_down).T * scale, True)
+
+        # update visited list
+        visited.extend(triplet_keys)
+    return pipeline
+
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@@ -138,6 +230,7 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
             "kdpm2-ancestral",
             "kdpm2",
         ]
+        self.weights_has_changed = False

     def add_ti_embedding_dir(self, embeddings_dir):
         self.sj.embedding_db.add_embedding_dir(embeddings_dir)
@@ -147,6 +240,9 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
         self.sj.embedding_db.clear_embedding_dirs()
         self.sj.embedding_db.load_textual_inversion_embeddings(True)

+    def change_scheduler(self, scheduler_type="ddim"):
+        self.switch_scheduler(scheduler_type)
+
     def switch_scheduler(self, scheduler_type="ddim"):
         scheduler_type = scheduler_type.lower()
         from ppdiffusers import (
@@ -409,8 +505,9 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
         callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
         callback_steps: Optional[int] = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        clip_skip: int =
+        clip_skip: int = 1,
         controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        lora_dir: str = "./loras",
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -468,12 +565,14 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
                 A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
                 `self.processor` in
                 [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
-            clip_skip (`int`, *optional*, defaults to
-                CLIP_stop_at_last_layers, if clip_skip
+            clip_skip (`int`, *optional*, defaults to 1):
+                CLIP_stop_at_last_layers; if clip_skip <= 1, we will use the last_hidden_state from the text_encoder.
             controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
                 The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
                 to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
                 corresponding scale as a list.
+            lora_dir (`str`, *optional*):
+                Path to the LoRA weights which we want to load.
             Examples:

         Returns:
@@ -483,172 +582,200 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        batch_size = 1
-        image = self.prepare_image(
-            image=image,
-            width=width,
-            height=height,
-            dtype=self.controlnet.dtype,
-        )
+        try:
+            # 0. Default height and width to unet
+            height, width = self._default_height_width(height, width, image)
+
+            # 1. Check inputs. Raise error if not correct
+            self.check_inputs(
+                prompt,
+                image,
+                height,
+                width,
+                callback_steps,
+                negative_prompt,
+                controlnet_conditioning_scale,
+            )

-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
+            batch_size = 1

+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                dtype=self.controlnet.dtype,
+            )

+            # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+            # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+            # corresponds to doing no classifier free guidance.
+            do_classifier_free_guidance = guidance_scale > 1.0
+
+            prompts, extra_network_data = parse_prompts([prompt])
+
+            if lora_dir is not None and os.path.exists(lora_dir):
+                lora_mapping = {p.stem: p.absolute() for p in Path(lora_dir).glob("*.safetensors")}
+                for params in extra_network_data["lora"]:
+                    assert len(params.items) > 0
+                    name = params.items[0]
+                    if name in lora_mapping:
+                        ratio = float(params.items[1]) if len(params.items) > 1 else 1.0
+                        lora_state_dict = smart_load(lora_mapping[name], map_location=paddle.get_device())
+                        self.weights_has_changed = True
+                        load_lora(self, state_dict=lora_state_dict, ratio=ratio)
+                        del lora_state_dict
+                    else:
+                        print(f"We can't find lora weight: {name}! Please make sure that exists!")
+
+            self.sj.clip.CLIP_stop_at_last_layers = clip_skip
+            # 3. Encode input prompt
+            prompt_embeds, negative_prompt_embeds = self._encode_prompt(
+                prompts,
+                do_classifier_free_guidance,
+                negative_prompt,
+                num_inference_steps=num_inference_steps,
+            )

+            # 4. Prepare timesteps
+            self.scheduler.set_timesteps(num_inference_steps)
+            timesteps = self.scheduler.timesteps
+
+            # 5. Prepare latent variables
+            num_channels_latents = self.unet.in_channels
+            latents = self.prepare_latents(
+                batch_size,
+                num_channels_latents,
+                height,
+                width,
+                self.unet.dtype,
+                generator,
+                latents,
+            )

+            # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+            # 7. Denoising loop
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+            with self.progress_bar(total=num_inference_steps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    step = i // self.scheduler.order
+                    do_batch = False
+                    conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step)
+                    try:
+                        weight = conds_list[0][0][1]
+                    except Exception:
+                        weight = 1.0
+                    if do_classifier_free_guidance:
+                        uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step)
+                        do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1]

-        for i, t in enumerate(timesteps):
-            step = i // self.scheduler.order
-            do_batch = False
-            conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step)
-            try:
-                weight = conds_list[0][0][1]
-            except Exception:
-                weight = 1.0
-            if do_classifier_free_guidance:
-                uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step)
-                do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1]
-
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = paddle.concat([latents] * 2) if do_batch else latents
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-            if do_batch:
-                encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor])
-                down_block_res_samples, mid_block_res_sample = self.controlnet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=encoder_hidden_states,
-                    controlnet_cond=paddle.concat([image, image]),
-                    conditioning_scale=controlnet_conditioning_scale,
-                    return_dict=False,
-                )
-                noise_pred = self.unet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=encoder_hidden_states,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    down_block_additional_residuals=down_block_res_samples,
-                    mid_block_additional_residual=mid_block_res_sample,
-                ).sample
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred_text - noise_pred_uncond)
-            else:
-                down_block_res_samples, mid_block_res_sample = self.controlnet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=cond_tensor,
-                    controlnet_cond=image,
-                    conditioning_scale=controlnet_conditioning_scale,
-                    return_dict=False,
-                )
-                noise_pred = self.unet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=cond_tensor,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    down_block_additional_residuals=down_block_res_samples,
-                    mid_block_additional_residual=mid_block_res_sample,
-                ).sample
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = paddle.concat([latents] * 2) if do_batch else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

-                    if
+                    if do_batch:
+                        encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor])
+                        down_block_res_samples, mid_block_res_sample = self.controlnet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=encoder_hidden_states,
+                            controlnet_cond=paddle.concat([image, image]),
+                            conditioning_scale=controlnet_conditioning_scale,
+                            return_dict=False,
+                        )
+                        noise_pred = self.unet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=encoder_hidden_states,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            down_block_additional_residuals=down_block_res_samples,
+                            mid_block_additional_residual=mid_block_res_sample,
+                        ).sample
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + weight * guidance_scale * (
+                            noise_pred_text - noise_pred_uncond
+                        )
+                    else:
                         down_block_res_samples, mid_block_res_sample = self.controlnet(
                             latent_model_input,
                             t,
-                            encoder_hidden_states=
+                            encoder_hidden_states=cond_tensor,
                             controlnet_cond=image,
                             conditioning_scale=controlnet_conditioning_scale,
                             return_dict=False,
                         )
+                        noise_pred = self.unet(
                             latent_model_input,
                             t,
-                            encoder_hidden_states=
+                            encoder_hidden_states=cond_tensor,
                             cross_attention_kwargs=cross_attention_kwargs,
                             down_block_additional_residuals=down_block_res_samples,
                             mid_block_additional_residual=mid_block_res_sample,
                         ).sample
-                            noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond)
-
-                    # compute the previous noisy sample x_t -> x_t-1
-                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

+                        if do_classifier_free_guidance:
+                            down_block_res_samples, mid_block_res_sample = self.controlnet(
+                                latent_model_input,
+                                t,
+                                encoder_hidden_states=uncond_tensor,
+                                controlnet_cond=image,
+                                conditioning_scale=controlnet_conditioning_scale,
+                                return_dict=False,
+                            )
+                            noise_pred_uncond = self.unet(
+                                latent_model_input,
+                                t,
+                                encoder_hidden_states=uncond_tensor,
+                                cross_attention_kwargs=cross_attention_kwargs,
+                                down_block_additional_residuals=down_block_res_samples,
+                                mid_block_additional_residual=mid_block_res_sample,
+                            ).sample
+                            noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        if callback is not None and i % callback_steps == 0:
+                            callback(i, t, latents)
+
+            if output_type == "latent":
+                image = latents
+                has_nsfw_concept = None
+            elif output_type == "pil":
+                # 8. Post-processing
+                image = self.decode_latents(latents)
+
+                # 9. Run safety checker
+                image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
+
+                # 10. Convert to PIL
+                image = self.numpy_to_pil(image)
+            else:
+                # 8. Post-processing
+                image = self.decode_latents(latents)
+
+                # 9. Run safety checker
+                image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
+
+            if not return_dict:
+                return (image, has_nsfw_concept)
+
+            return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+        except Exception as e:
+            raise ValueError(e)
+        finally:
+            if self.weights_has_changed:
+                for sub_layer in self.text_encoder.sublayers(include_self=True):
+                    if hasattr(sub_layer, "backup_weights"):
+                        sub_layer.weight.copy_(sub_layer.backup_weights, True)
+                for sub_layer in self.unet.sublayers(include_self=True):
+                    if hasattr(sub_layer, "backup_weights"):
+                        sub_layer.weight.copy_(sub_layer.backup_weights, True)
+                self.weights_has_changed = False


 # clip.py
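The linear-layer branch of the new load_lora function merges each LoRA pair into the base weight as W + ratio * (up @ down).T * (alpha / rank). Below is a minimal sketch of that arithmetic with made-up shapes and values; only the formula mirrors the code in the diff, everything else is illustrative.

import paddle

# Toy shapes; in the real code the pair comes from the loaded safetensors state dict.
rank, in_features, out_features = 4, 8, 8
weight = paddle.zeros([in_features, out_features])  # paddle.nn.Linear stores weight as [in, out]
weight_down = paddle.rand([rank, in_features])      # "lora_down.weight"
weight_up = paddle.rand([out_features, rank])       # "lora_up.weight"
alpha, ratio = 4.0, 0.8                             # alpha stored beside the pair; ratio parsed from the prompt
scale = alpha / rank

# Same update as the linear branch of load_lora: W <- W + ratio * (up @ down).T * scale
merged = weight + ratio * paddle.matmul(weight_up, weight_down).T * scale
print(merged.shape)  # [8, 8]: the shape is unchanged, so the layer can be patched in place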
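For completeness, a hypothetical call against the updated signature. The model names, the control image, the LoRA file name and the <lora:name:ratio> prompt tag are assumptions for illustration; clip_skip, lora_dir, change_scheduler and the automatic weight restore are the parts introduced by this commit.

import paddle
from PIL import Image
from ppdiffusers import ControlNetModel

# Assumed import path for the file shown above.
from pipeline import WebUIStableDiffusionControlNetPipeline

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = WebUIStableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet
)
pipe.change_scheduler("ddim")  # new alias for switch_scheduler

control_image = Image.new("RGB", (512, 512))  # stand-in; real use passes e.g. a Canny edge map

result = pipe(
    prompt="a photo of a cat <lora:myLora:0.8>",  # assumed tag syntax consumed by parse_prompts
    image=control_image,
    num_inference_steps=30,
    clip_skip=2,         # stop CLIP at the second-to-last layer
    lora_dir="./loras",  # folder scanned for myLora.safetensors
)
result.images[0].save("cat.png")
# The finally block restores the backed-up UNet/text-encoder weights afterwards,
# so the merged LoRA does not leak into the next call.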