Update pipeline.py
pipeline.py CHANGED (+277 -150)
@@ -15,10 +15,9 @@
 #
 # modified from https://github.com/AUTOMATIC1111/stable-diffusion-webui
 # Here is the AGPL-3.0 license https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/master/LICENSE.txt
-from ppdiffusers.utils import check_min_version
-check_min_version("0.14.1")

 import inspect
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union

 import paddle
@@ -39,9 +38,102 @@ from ppdiffusers.utils import (
     logging,
     randn_tensor,
     safetensors_load,
+    smart_load,
     torch_load,
 )

+
+@paddle.no_grad()
+def load_lora(
+    pipeline,
+    state_dict: dict,
+    LORA_PREFIX_UNET: str = "lora_unet",
+    LORA_PREFIX_TEXT_ENCODER: str = "lora_te",
+    ratio: float = 1.0,
+):
+    ratio = float(ratio)
+    visited = []
+    for key in state_dict:
+        if ".alpha" in key or ".lora_up" in key or key in visited:
+            continue
+
+        if "text" in key:
+            tmp_layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
+            hf_to_ppnlp = {
+                "encoder": "transformer",
+                "fc1": "linear1",
+                "fc2": "linear2",
+            }
+            layer_infos = []
+            for layer_info in tmp_layer_infos:
+                if layer_info == "mlp":
+                    continue
+                layer_infos.append(hf_to_ppnlp.get(layer_info, layer_info))
+            curr_layer: paddle.nn.Linear = pipeline.text_encoder
+        else:
+            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
+            curr_layer: paddle.nn.Linear = pipeline.unet
+
+        temp_name = layer_infos.pop(0)
+        while len(layer_infos) > -1:
+            try:
+                if temp_name == "to":
+                    raise ValueError()
+                curr_layer = curr_layer.__getattr__(temp_name)
+                if len(layer_infos) > 0:
+                    temp_name = layer_infos.pop(0)
+                elif len(layer_infos) == 0:
+                    break
+            except Exception:
+                if len(temp_name) > 0:
+                    temp_name += "_" + layer_infos.pop(0)
+                else:
+                    temp_name = layer_infos.pop(0)
+
+        triplet_keys = [key, key.replace("lora_down", "lora_up"), key.replace("lora_down.weight", "alpha")]
+        dtype: paddle.dtype = curr_layer.weight.dtype
+        weight_down: paddle.Tensor = state_dict[triplet_keys[0]].cast(dtype)
+        weight_up: paddle.Tensor = state_dict[triplet_keys[1]].cast(dtype)
+        rank: float = float(weight_down.shape[0])
+        if triplet_keys[2] in state_dict:
+            alpha: float = state_dict[triplet_keys[2]].cast(dtype).item()
+            scale: float = alpha / rank
+        else:
+            scale = 1.0
+
+        if not hasattr(curr_layer, "backup_weights"):
+            curr_layer.backup_weights = curr_layer.weight.clone()
+
+        if len(weight_down.shape) == 4:
+            if weight_down.shape[2:4] == [1, 1]:
+                # conv2d 1x1
+                curr_layer.weight.copy_(
+                    curr_layer.weight
+                    + ratio
+                    * paddle.matmul(weight_up.squeeze([-1, -2]), weight_down.squeeze([-1, -2])).unsqueeze([-1, -2])
+                    * scale,
+                    True,
+                )
+            else:
+                # conv2d 3x3
+                curr_layer.weight.copy_(
+                    curr_layer.weight
+                    + ratio
+                    * paddle.nn.functional.conv2d(weight_down.transpose([1, 0, 2, 3]), weight_up).transpose(
+                        [1, 0, 2, 3]
+                    )
+                    * scale,
+                    True,
+                )
+        else:
+            # linear
+            curr_layer.weight.copy_(curr_layer.weight + ratio * paddle.matmul(weight_up, weight_down).T * scale, True)
+
+        # update visited list
+        visited.extend(triplet_keys)
+    return pipeline
+
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@@ -138,6 +230,7 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
             "kdpm2-ancestral",
             "kdpm2",
         ]
+        self.weights_has_changed = False

     def add_ti_embedding_dir(self, embeddings_dir):
         self.sj.embedding_db.add_embedding_dir(embeddings_dir)
@@ -147,6 +240,9 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
         self.sj.embedding_db.clear_embedding_dirs()
         self.sj.embedding_db.load_textual_inversion_embeddings(True)

+    def change_scheduler(self, scheduler_type="ddim"):
+        self.switch_scheduler(scheduler_type)
+
     def switch_scheduler(self, scheduler_type="ddim"):
         scheduler_type = scheduler_type.lower()
         from ppdiffusers import (
@@ -409,8 +505,9 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
         callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
         callback_steps: Optional[int] = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        clip_skip: int =
+        clip_skip: int = 1,
         controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        lora_dir: str = "./loras",
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -468,12 +565,14 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
                 A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
                 `self.processor` in
                 [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
-            clip_skip (`int`, *optional*, defaults to
-                CLIP_stop_at_last_layers, if clip_skip
+            clip_skip (`int`, *optional*, defaults to 1):
+                CLIP_stop_at_last_layers; if clip_skip <= 1, we will use the last_hidden_state from the text_encoder.
             controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
                 The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
                 to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
                 corresponding scale as a list.
+            lora_dir (`str`, *optional*):
+                Path to the LoRA weights which we want to load.
             Examples:

         Returns:
@@ -483,172 +582,200 @@ class WebUIStableDiffusionControlNetPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        batch_size = 1
-        image = self.prepare_image(
-            image=image,
-            width=width,
-            height=height,
-            dtype=self.controlnet.dtype,
-        )
+        try:
+            # 0. Default height and width to unet
+            height, width = self._default_height_width(height, width, image)
+
+            # 1. Check inputs. Raise error if not correct
+            self.check_inputs(
+                prompt,
+                image,
+                height,
+                width,
+                callback_steps,
+                negative_prompt,
+                controlnet_conditioning_scale,
+            )

-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
+            batch_size = 1

+            image = self.prepare_image(
+                image=image,
+                width=width,
+                height=height,
+                dtype=self.controlnet.dtype,
+            )

+            # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+            # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+            # corresponds to doing no classifier free guidance.
+            do_classifier_free_guidance = guidance_scale > 1.0
+
+            prompts, extra_network_data = parse_prompts([prompt])
+
+            if lora_dir is not None and os.path.exists(lora_dir):
+                lora_mapping = {p.stem: p.absolute() for p in Path(lora_dir).glob("*.safetensors")}
+                for params in extra_network_data["lora"]:
+                    assert len(params.items) > 0
+                    name = params.items[0]
+                    if name in lora_mapping:
+                        ratio = float(params.items[1]) if len(params.items) > 1 else 1.0
+                        lora_state_dict = smart_load(lora_mapping[name], map_location=paddle.get_device())
+                        self.weights_has_changed = True
+                        load_lora(self, state_dict=lora_state_dict, ratio=ratio)
+                        del lora_state_dict
+                    else:
+                        print(f"We can't find lora weight: {name}! Please make sure that exists!")
+
+            self.sj.clip.CLIP_stop_at_last_layers = clip_skip
+            # 3. Encode input prompt
+            prompt_embeds, negative_prompt_embeds = self._encode_prompt(
+                prompts,
+                do_classifier_free_guidance,
+                negative_prompt,
+                num_inference_steps=num_inference_steps,
+            )

+            # 4. Prepare timesteps
+            self.scheduler.set_timesteps(num_inference_steps)
+            timesteps = self.scheduler.timesteps
+
+            # 5. Prepare latent variables
+            num_channels_latents = self.unet.in_channels
+            latents = self.prepare_latents(
+                batch_size,
+                num_channels_latents,
+                height,
+                width,
+                self.unet.dtype,
+                generator,
+                latents,
+            )

+            # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+            # 7. Denoising loop
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+            with self.progress_bar(total=num_inference_steps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    step = i // self.scheduler.order
+                    do_batch = False
+                    conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step)
+                    try:
+                        weight = conds_list[0][0][1]
+                    except Exception:
+                        weight = 1.0
+                    if do_classifier_free_guidance:
+                        uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step)
+                        do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1]

-        for i, t in enumerate(timesteps):
-            step = i // self.scheduler.order
-            do_batch = False
-            conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step)
-            try:
-                weight = conds_list[0][0][1]
-            except Exception:
-                weight = 1.0
-            if do_classifier_free_guidance:
-                uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step)
-                do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1]
-
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = paddle.concat([latents] * 2) if do_batch else latents
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-            if do_batch:
-                encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor])
-                down_block_res_samples, mid_block_res_sample = self.controlnet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=encoder_hidden_states,
-                    controlnet_cond=paddle.concat([image, image]),
-                    conditioning_scale=controlnet_conditioning_scale,
-                    return_dict=False,
-                )
-                noise_pred = self.unet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=encoder_hidden_states,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    down_block_additional_residuals=down_block_res_samples,
-                    mid_block_additional_residual=mid_block_res_sample,
-                ).sample
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred_text - noise_pred_uncond)
-            else:
-                down_block_res_samples, mid_block_res_sample = self.controlnet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=cond_tensor,
-                    controlnet_cond=image,
-                    conditioning_scale=controlnet_conditioning_scale,
-                    return_dict=False,
-                )
-                noise_pred = self.unet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=cond_tensor,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    down_block_additional_residuals=down_block_res_samples,
-                    mid_block_additional_residual=mid_block_res_sample,
-                ).sample
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = paddle.concat([latents] * 2) if do_batch else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

-                    if
+                    if do_batch:
+                        encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor])
+                        down_block_res_samples, mid_block_res_sample = self.controlnet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=encoder_hidden_states,
+                            controlnet_cond=paddle.concat([image, image]),
+                            conditioning_scale=controlnet_conditioning_scale,
+                            return_dict=False,
+                        )
+                        noise_pred = self.unet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=encoder_hidden_states,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            down_block_additional_residuals=down_block_res_samples,
+                            mid_block_additional_residual=mid_block_res_sample,
+                        ).sample
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + weight * guidance_scale * (
+                            noise_pred_text - noise_pred_uncond
+                        )
+                    else:
                         down_block_res_samples, mid_block_res_sample = self.controlnet(
                             latent_model_input,
                             t,
-                            encoder_hidden_states=
+                            encoder_hidden_states=cond_tensor,
                             controlnet_cond=image,
                             conditioning_scale=controlnet_conditioning_scale,
                             return_dict=False,
                         )
+                        noise_pred = self.unet(
                             latent_model_input,
                             t,
-                            encoder_hidden_states=
+                            encoder_hidden_states=cond_tensor,
                             cross_attention_kwargs=cross_attention_kwargs,
                             down_block_additional_residuals=down_block_res_samples,
                             mid_block_additional_residual=mid_block_res_sample,
                         ).sample
-                            noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond)
-
-                    # compute the previous noisy sample x_t -> x_t-1
-                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

+                        if do_classifier_free_guidance:
+                            down_block_res_samples, mid_block_res_sample = self.controlnet(
+                                latent_model_input,
+                                t,
+                                encoder_hidden_states=uncond_tensor,
+                                controlnet_cond=image,
+                                conditioning_scale=controlnet_conditioning_scale,
+                                return_dict=False,
+                            )
+                            noise_pred_uncond = self.unet(
+                                latent_model_input,
+                                t,
+                                encoder_hidden_states=uncond_tensor,
+                                cross_attention_kwargs=cross_attention_kwargs,
+                                down_block_additional_residuals=down_block_res_samples,
+                                mid_block_additional_residual=mid_block_res_sample,
+                            ).sample
+                            noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        if callback is not None and i % callback_steps == 0:
+                            callback(i, t, latents)
+
+            if output_type == "latent":
+                image = latents
+                has_nsfw_concept = None
+            elif output_type == "pil":
+                # 8. Post-processing
+                image = self.decode_latents(latents)
+
+                # 9. Run safety checker
+                image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
+
+                # 10. Convert to PIL
+                image = self.numpy_to_pil(image)
+            else:
+                # 8. Post-processing
+                image = self.decode_latents(latents)
+
+                # 9. Run safety checker
+                image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype)
+
+            if not return_dict:
+                return (image, has_nsfw_concept)
+
+            return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+        except Exception as e:
+            raise ValueError(e)
+        finally:
+            if self.weights_has_changed:
+                for sub_layer in self.text_encoder.sublayers(include_self=True):
+                    if hasattr(sub_layer, "backup_weights"):
+                        sub_layer.weight.copy_(sub_layer.backup_weights, True)
+                for sub_layer in self.unet.sublayers(include_self=True):
+                    if hasattr(sub_layer, "backup_weights"):
+                        sub_layer.weight.copy_(sub_layer.backup_weights, True)
+                self.weights_has_changed = False


 # clip.py
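The linear-layer branch of the new load_lora function merges each LoRA pair into the base weight as W + ratio * (up @ down).T * (alpha / rank). Below is a minimal sketch of that arithmetic with made-up shapes and values; only the formula mirrors the code in the diff, everything else is illustrative.

import paddle

# Toy shapes; in the real code the pair comes from the loaded safetensors state dict.
rank, in_features, out_features = 4, 8, 8
weight = paddle.zeros([in_features, out_features])  # paddle.nn.Linear stores weight as [in, out]
weight_down = paddle.rand([rank, in_features])      # "lora_down.weight"
weight_up = paddle.rand([out_features, rank])       # "lora_up.weight"
alpha, ratio = 4.0, 0.8                             # alpha stored beside the pair; ratio parsed from the prompt
scale = alpha / rank

# Same update as the linear branch of load_lora: W <- W + ratio * (up @ down).T * scale
merged = weight + ratio * paddle.matmul(weight_up, weight_down).T * scale
print(merged.shape)  # [8, 8]: the shape is unchanged, so the layer can be patched in place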
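For completeness, a hypothetical call against the updated signature. The model names, the control image, the LoRA file name and the <lora:name:ratio> prompt tag are assumptions for illustration; clip_skip, lora_dir, change_scheduler and the automatic weight restore are the parts introduced by this commit.

import paddle
from PIL import Image
from ppdiffusers import ControlNetModel

# Assumed import path for the file shown above.
from pipeline import WebUIStableDiffusionControlNetPipeline

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = WebUIStableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet
)
pipe.change_scheduler("ddim")  # new alias for switch_scheduler

control_image = Image.new("RGB", (512, 512))  # stand-in; real use passes e.g. a Canny edge map

result = pipe(
    prompt="a photo of a cat <lora:myLora:0.8>",  # assumed tag syntax consumed by parse_prompts
    image=control_image,
    num_inference_steps=30,
    clip_skip=2,         # stop CLIP at the second-to-last layer
    lora_dir="./loras",  # folder scanned for myLora.safetensors
)
result.images[0].save("cat.png")
# The finally block restores the backed-up UNet/text-encoder weights afterwards,
# so the merged LoRA does not leak into the next call.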