Spaces:

Kunbyte
/

DRA-Ctrl

Running on Zero

App Files Files Community

caohy666 committed 1 day ago

Commit

f4b19f4

1 Parent(s): cb59ffa

<fix> remove pipe_lock

Browse files

Files changed (1) hide show

app.py +190 -193

app.py CHANGED Viewed

@@ -47,8 +47,6 @@ there's no need to manually input edge maps, depth maps, or other condition imag
 The corresponding condition images will be automatically extracted.
 """
-pipe_lock = threading.Lock()
 def init_basemodel():
     global transformer, scheduler, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2, image_processor, pipe, current_task
@@ -105,201 +103,200 @@ def init_basemodel():
 @spaces.GPU
 def process_image_and_text(condition_image, target_prompt, condition_image_prompt, task, random_seed, num_steps, inpainting, fill_x1, fill_x2, fill_y1, fill_y2):
     # set up the model
-    with pipe_lock:
-        global pipe, current_task, transformer
-        if current_task != task:
-            if current_task is None:
-                # insert LoRA
-                lora_config = LoraConfig(
-                    r=16,
-                    lora_alpha=16,
-                    init_lora_weights="gaussian",
-                    target_modules=[
-                        'attn.to_k', 'attn.to_q', 'attn.to_v', 'attn.to_out.0',
-                        'attn.add_k_proj', 'attn.add_q_proj', 'attn.add_v_proj', 'attn.to_add_out',
-                        'ff.net.0.proj', 'ff.net.2',
-                        'ff_context.net.0.proj', 'ff_context.net.2',
-                        'norm1_context.linear', 'norm1.linear',
-                        'norm.linear', 'proj_mlp', 'proj_out',
-                    ]
-                )
-                transformer.add_adapter(lora_config)
-            else:
-                def restore_forward(module):
-                    def restored_forward(self, x, *args, **kwargs):
-                        return module.original_forward(x, *args, **kwargs)
-                    return restored_forward.__get__(module, type(module))
-                for n, m in transformer.named_modules():
-                    if isinstance(m, peft.tuners.lora.layer.Linear):
-                        m.forward = restore_forward(m)
-            current_task = task
-            # hack LoRA forward
-            def create_hacked_forward(module):
-                if not hasattr(module, 'original_forward'):
-                    module.original_forward = module.forward
-                lora_forward = module.forward
-                non_lora_forward = module.base_layer.forward
-                img_sequence_length = int((512 / 8 / 2) ** 2)
-                encoder_sequence_length = 144 + 252 # encoder sequence: 144 img 252 txt
-                num_imgs = 4
-                num_generated_imgs = 3
-                num_encoder_sequences = 2 if task in ['subject_driven', 'style_transfer'] else 1
-                def hacked_lora_forward(self, x, *args, **kwargs):
-                    if x.shape[1] == img_sequence_length * num_imgs and len(x.shape) > 2:
-                        return torch.cat((
-                            lora_forward(x[:, :-img_sequence_length*num_generated_imgs], *args, **kwargs),
-                            non_lora_forward(x[:, -img_sequence_length*num_generated_imgs:], *args, **kwargs)
-                        ), dim=1)
-                    elif x.shape[1] == encoder_sequence_length * num_encoder_sequences or x.shape[1] == encoder_sequence_length:
-                        return lora_forward(x, *args, **kwargs)
-                    elif x.shape[1] == img_sequence_length * num_imgs + encoder_sequence_length * num_encoder_sequences:
-                        return torch.cat((
-                            lora_forward(x[:, :(num_imgs - num_generated_imgs)*img_sequence_length], *args, **kwargs),
-                            non_lora_forward(x[:, (num_imgs - num_generated_imgs)*img_sequence_length:-num_encoder_sequences*encoder_sequence_length], *args, **kwargs),
-                            lora_forward(x[:, -num_encoder_sequences*encoder_sequence_length:], *args, **kwargs)
-                        ), dim=1)
-                    elif x.shape[1] == 3072:
-                        return non_lora_forward(x, *args, **kwargs)
-                    else:
-                        raise ValueError(
-                            f"hacked_lora_forward receives unexpected sequence length: {x.shape[1]}, input shape: {x.shape}!"
-                        )
-                return hacked_lora_forward.__get__(module, type(module))
             for n, m in transformer.named_modules():
                 if isinstance(m, peft.tuners.lora.layer.Linear):
-                    m.forward = create_hacked_forward(m)
-            # load LoRA weights
-            model_root = hf_hub_download(
-                repo_id="Kunbyte/DRA-Ctrl",
-                filename=f"{task}.safetensors",
-                resume_download=True)
-            try:
-                with safe_open(model_root, framework="pt") as f:
-                    lora_weights = {}
-                    for k in f.keys():
-                        param = f.get_tensor(k)
-                        if k.endswith(".weight"):
-                            k = k.replace('.weight', '.default.weight')
-                        lora_weights[k] = param
-                    transformer.load_state_dict(lora_weights, strict=False)
-            except Exception as e:
-                raise ValueError(f'{e}')
-            transformer.requires_grad_(False)
-        # start generation
-        c_txt = None if condition_image_prompt == "" else condition_image_prompt
-        c_img = condition_image.resize((512, 512))
-        t_txt = target_prompt
-        if task not in ['subject_driven', 'style_transfer']:
-            if task == "canny":
-                def get_canny_edge(img):
-                    img_np = np.array(img)
-                    img_gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
-                    edges = cv2.Canny(img_gray, 100, 200)
-                    edges_tmp = Image.fromarray(edges).convert("RGB")
-                    edges[edges == 0] = 128
-                    return Image.fromarray(edges).convert("RGB")
-                c_img = get_canny_edge(c_img)
-            elif task == "coloring":
-                c_img = (
-                    c_img.resize((512, 512))
-                    .convert("L")
-                    .convert("RGB")
-                )
-            elif task == "deblurring":
-                blur_radius = 10
-                c_img = (
-                    c_img.convert("RGB")
-                    .filter(ImageFilter.GaussianBlur(blur_radius))
-                    .resize((512, 512))
-                    .convert("RGB")
-                )
-            elif task == "depth":
-                def get_depth_map(img):
-                    from transformers import pipeline
-                    depth_pipe = pipeline(
-                        task="depth-estimation",
-                        model="LiheYoung/depth-anything-small-hf",
-                        device="cpu",
                     )
-                    return depth_pipe(img)["depth"].convert("RGB").resize((512, 512))
-                c_img = get_depth_map(c_img)
-                k = (255 - 128) / 255
-                b = 128
-                c_img = c_img.point(lambda x: k * x + b)
-            elif task == "depth_pred":
-                c_img = c_img
-            elif task == "fill":
-                c_img = c_img.resize((512, 512)).convert("RGB")
-                x1, x2 = fill_x1, fill_x2
-                y1, y2 = fill_y1, fill_y2
-                mask = Image.new("L", (512, 512), 0)
-                draw = ImageDraw.Draw(mask)
-                draw.rectangle((x1, y1, x2, y2), fill=255)
-                if inpainting:
-                    mask = Image.eval(mask, lambda a: 255 - a)
-                c_img = Image.composite(
-                    c_img,
-                    Image.new("RGB", (512, 512), (255, 255, 255)),
-                    mask
-                )
-                c_img = Image.composite(
-                    c_img,
-                    Image.new("RGB", (512, 512), (128, 128, 128)),
-                    mask
                 )
-            elif task == "sr":
-                c_img = c_img.resize((int(512 / 4), int(512 / 4))).convert("RGB")
-                c_img = c_img.resize((512, 512))
-        gen_img = pipe(
-            image=c_img,
-            prompt=[t_txt.strip()],
-            prompt_condition=[c_txt.strip()] if c_txt is not None else None,
-            prompt_2=[t_txt],
-            height=512,
-            width=512,
-            num_frames=5,
-            num_inference_steps=num_steps,
-            guidance_scale=6.0,
-            num_videos_per_prompt=1,
-            generator=torch.Generator(device=pipe.transformer.device).manual_seed(random_seed),
-            output_type='pt',
-            image_embed_interleave=4,
-            frame_gap=48,
-            mixup=True,
-            mixup_num_imgs=2,
-            enhance_tp=task in ['subject_driven'],
-        ).frames
-        output_images = []
-        for i in range(10):
-            out = gen_img[:, i:i+1, :, :, :]
-            out = out.squeeze(0).squeeze(0).cpu().to(torch.float32).numpy()
-            out = np.transpose(out, (1, 2, 0))
-            out = (out * 255).astype(np.uint8)
-            out = Image.fromarray(out)
-            output_images.append(out)
-        # video = [np.array(img.convert('RGB')) for img in output_images[1:] + [output_images[0]]]
-        # video = np.stack(video, axis=0)
-        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
-            video_path = f.name
-        imageio.mimsave(video_path, output_images[1:]+[output_images[0]], fps=5)
-        return output_images[0], video_path
 def get_samples():
     sample_list = [

 The corresponding condition images will be automatically extracted.
 """
 def init_basemodel():
     global transformer, scheduler, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2, image_processor, pipe, current_task
 @spaces.GPU
 def process_image_and_text(condition_image, target_prompt, condition_image_prompt, task, random_seed, num_steps, inpainting, fill_x1, fill_x2, fill_y1, fill_y2):
     # set up the model
+    global pipe, current_task, transformer
+    if current_task != task:
+        if current_task is None:
+            # insert LoRA
+            lora_config = LoraConfig(
+                r=16,
+                lora_alpha=16,
+                init_lora_weights="gaussian",
+                target_modules=[
+                    'attn.to_k', 'attn.to_q', 'attn.to_v', 'attn.to_out.0',
+                    'attn.add_k_proj', 'attn.add_q_proj', 'attn.add_v_proj', 'attn.to_add_out',
+                    'ff.net.0.proj', 'ff.net.2',
+                    'ff_context.net.0.proj', 'ff_context.net.2',
+                    'norm1_context.linear', 'norm1.linear',
+                    'norm.linear', 'proj_mlp', 'proj_out',
+                ]
+            )
+            transformer.add_adapter(lora_config)
+        else:
+            def restore_forward(module):
+                def restored_forward(self, x, *args, **kwargs):
+                    return module.original_forward(x, *args, **kwargs)
+                return restored_forward.__get__(module, type(module))
             for n, m in transformer.named_modules():
                 if isinstance(m, peft.tuners.lora.layer.Linear):
+                    m.forward = restore_forward(m)
+        current_task = task
+        # hack LoRA forward
+        def create_hacked_forward(module):
+            if not hasattr(module, 'original_forward'):
+                module.original_forward = module.forward
+            lora_forward = module.forward
+            non_lora_forward = module.base_layer.forward
+            img_sequence_length = int((512 / 8 / 2) ** 2)
+            encoder_sequence_length = 144 + 252 # encoder sequence: 144 img 252 txt
+            num_imgs = 4
+            num_generated_imgs = 3
+            num_encoder_sequences = 2 if task in ['subject_driven', 'style_transfer'] else 1
+            def hacked_lora_forward(self, x, *args, **kwargs):
+                if x.shape[1] == img_sequence_length * num_imgs and len(x.shape) > 2:
+                    return torch.cat((
+                        lora_forward(x[:, :-img_sequence_length*num_generated_imgs], *args, **kwargs),
+                        non_lora_forward(x[:, -img_sequence_length*num_generated_imgs:], *args, **kwargs)
+                    ), dim=1)
+                elif x.shape[1] == encoder_sequence_length * num_encoder_sequences or x.shape[1] == encoder_sequence_length:
+                    return lora_forward(x, *args, **kwargs)
+                elif x.shape[1] == img_sequence_length * num_imgs + encoder_sequence_length * num_encoder_sequences:
+                    return torch.cat((
+                        lora_forward(x[:, :(num_imgs - num_generated_imgs)*img_sequence_length], *args, **kwargs),
+                        non_lora_forward(x[:, (num_imgs - num_generated_imgs)*img_sequence_length:-num_encoder_sequences*encoder_sequence_length], *args, **kwargs),
+                        lora_forward(x[:, -num_encoder_sequences*encoder_sequence_length:], *args, **kwargs)
+                    ), dim=1)
+                elif x.shape[1] == 3072:
+                    return non_lora_forward(x, *args, **kwargs)
+                else:
+                    raise ValueError(
+                        f"hacked_lora_forward receives unexpected sequence length: {x.shape[1]}, input shape: {x.shape}!"
                     )
+            return hacked_lora_forward.__get__(module, type(module))
+        for n, m in transformer.named_modules():
+            if isinstance(m, peft.tuners.lora.layer.Linear):
+                m.forward = create_hacked_forward(m)
+        # load LoRA weights
+        model_root = hf_hub_download(
+            repo_id="Kunbyte/DRA-Ctrl",
+            filename=f"{task}.safetensors",
+            resume_download=True)
+        try:
+            with safe_open(model_root, framework="pt") as f:
+                lora_weights = {}
+                for k in f.keys():
+                    param = f.get_tensor(k)
+                    if k.endswith(".weight"):
+                        k = k.replace('.weight', '.default.weight')
+                    lora_weights[k] = param
+                transformer.load_state_dict(lora_weights, strict=False)
+        except Exception as e:
+            raise ValueError(f'{e}')
+        transformer.requires_grad_(False)
+    # start generation
+    c_txt = None if condition_image_prompt == "" else condition_image_prompt
+    c_img = condition_image.resize((512, 512))
+    t_txt = target_prompt
+    if task not in ['subject_driven', 'style_transfer']:
+        if task == "canny":
+            def get_canny_edge(img):
+                img_np = np.array(img)
+                img_gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
+                edges = cv2.Canny(img_gray, 100, 200)
+                edges_tmp = Image.fromarray(edges).convert("RGB")
+                edges[edges == 0] = 128
+                return Image.fromarray(edges).convert("RGB")
+            c_img = get_canny_edge(c_img)
+        elif task == "coloring":
+            c_img = (
+                c_img.resize((512, 512))
+                .convert("L")
+                .convert("RGB")
+            )
+        elif task == "deblurring":
+            blur_radius = 10
+            c_img = (
+                c_img.convert("RGB")
+                .filter(ImageFilter.GaussianBlur(blur_radius))
+                .resize((512, 512))
+                .convert("RGB")
+            )
+        elif task == "depth":
+            def get_depth_map(img):
+                from transformers import pipeline
+                depth_pipe = pipeline(
+                    task="depth-estimation",
+                    model="LiheYoung/depth-anything-small-hf",
+                    device="cpu",
                 )
+                return depth_pipe(img)["depth"].convert("RGB").resize((512, 512))
+            c_img = get_depth_map(c_img)
+            k = (255 - 128) / 255
+            b = 128
+            c_img = c_img.point(lambda x: k * x + b)
+        elif task == "depth_pred":
+            c_img = c_img
+        elif task == "fill":
+            c_img = c_img.resize((512, 512)).convert("RGB")
+            x1, x2 = fill_x1, fill_x2
+            y1, y2 = fill_y1, fill_y2
+            mask = Image.new("L", (512, 512), 0)
+            draw = ImageDraw.Draw(mask)
+            draw.rectangle((x1, y1, x2, y2), fill=255)
+            if inpainting:
+                mask = Image.eval(mask, lambda a: 255 - a)
+            c_img = Image.composite(
+                c_img,
+                Image.new("RGB", (512, 512), (255, 255, 255)),
+                mask
+            )
+            c_img = Image.composite(
+                c_img,
+                Image.new("RGB", (512, 512), (128, 128, 128)),
+                mask
+            )
+        elif task == "sr":
+            c_img = c_img.resize((int(512 / 4), int(512 / 4))).convert("RGB")
+            c_img = c_img.resize((512, 512))
+    gen_img = pipe(
+        image=c_img,
+        prompt=[t_txt.strip()],
+        prompt_condition=[c_txt.strip()] if c_txt is not None else None,
+        prompt_2=[t_txt],
+        height=512,
+        width=512,
+        num_frames=5,
+        num_inference_steps=num_steps,
+        guidance_scale=6.0,
+        num_videos_per_prompt=1,
+        generator=torch.Generator(device=pipe.transformer.device).manual_seed(random_seed),
+        output_type='pt',
+        image_embed_interleave=4,
+        frame_gap=48,
+        mixup=True,
+        mixup_num_imgs=2,
+        enhance_tp=task in ['subject_driven'],
+    ).frames
+    output_images = []
+    for i in range(10):
+        out = gen_img[:, i:i+1, :, :, :]
+        out = out.squeeze(0).squeeze(0).cpu().to(torch.float32).numpy()
+        out = np.transpose(out, (1, 2, 0))
+        out = (out * 255).astype(np.uint8)
+        out = Image.fromarray(out)
+        output_images.append(out)
+    # video = [np.array(img.convert('RGB')) for img in output_images[1:] + [output_images[0]]]
+    # video = np.stack(video, axis=0)
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
+        video_path = f.name
+    imageio.mimsave(video_path, output_images[1:]+[output_images[0]], fps=5)
+    return output_images[0], video_path
 def get_samples():
     sample_list = [