#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os, random, sys
import numpy as np
import requests
import torch
import gradio as gr
from PIL import Image
from huggingface_hub import hf_hub_download, snapshot_download
from scipy.ndimage import binary_dilation, binary_erosion
from transformers import (LlavaNextProcessor, LlavaNextForConditionalGeneration, 
                        Qwen2VLForConditionalGeneration, Qwen2VLProcessor)
from segment_anything import SamPredictor, build_sam, SamAutomaticMaskGenerator
from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler
from diffusers.image_processor import VaeImageProcessor
from app.src.vlm_pipeline import (
    vlm_response_editing_type, 
    vlm_response_object_wait_for_edit, 
    vlm_response_mask, 
    vlm_response_prompt_after_apply_instruction
)
from app.src.brushedit_all_in_one_pipeline import BrushEdit_Pipeline
from app.utils.utils import load_grounding_dino_model
from app.src.vlm_template import vlms_template
from app.src.base_model_template import base_models_template
from app.src.aspect_ratio_template import aspect_ratios
from openai import OpenAI
base_openai_url = "https://api.deepseek.com/"
# Do not hard-code API keys in source. DEEPSEEK_API_KEY is a suggested environment variable name; set it before launching the app.
base_api_key = os.getenv("DEEPSEEK_API_KEY", "")
from transformers import BlipProcessor, BlipForConditionalGeneration
from app.deepseek.instructions import (
    create_apply_editing_messages_deepseek,
    create_decomposed_query_messages_deepseek
)
#### Description ####
logo = r"""
 """
head = r"""
     Zero-Shot Composed Query Image Retrieval Based on Diffusion Model Priors and Large Language Models
    
    
 
"""
descriptions = r"""
Demo for ZS-CIR"""
instructions = r"""
Demo for ZS-CIR"""
tips =  r"""
Demo for ZS-CIR
"""
citation = r"""
Demo for ZS-CIR"""
# - - - - - examples  - - - - -  #
EXAMPLES = [
    [
    Image.open("./assets/frog/frog.jpeg").convert("RGBA"),
     "add a magic hat on frog head.", 
     642087011,
     "frog",
     "frog",
     True,
     False,
     "GPT4-o (Highly Recommended)"
    ],
    [
    Image.open("./assets/chinese_girl/chinese_girl.png").convert("RGBA"),
     "replace the background to ancient China.", 
     648464818,
     "chinese_girl",
     "chinese_girl",
     True,
     False,
     "GPT4-o (Highly Recommended)"
    ],
    [
    Image.open("./assets/angel_christmas/angel_christmas.png").convert("RGBA"),
     "remove the deer.", 
     648464818,
     "angel_christmas",
     "angel_christmas",
     False,
     False,
     "GPT4-o (Highly Recommended)"
    ],
    [
    Image.open("./assets/sunflower_girl/sunflower_girl.png").convert("RGBA"),
     "add a wreath on head.", 
     648464818,
     "sunflower_girl",
     "sunflower_girl",
     True,
     False,
     "GPT4-o (Highly Recommended)"
    ],
    [
    Image.open("./assets/girl_on_sun/girl_on_sun.png").convert("RGBA"),
     "add a butterfly fairy.", 
     648464818,
     "girl_on_sun",
     "girl_on_sun",
     True,
     False,
     "GPT4-o (Highly Recommended)"
    ],
    [
    Image.open("./assets/spider_man_rm/spider_man.png").convert("RGBA"),
     "remove the christmas hat.", 
     642087011,
     "spider_man_rm",
     "spider_man_rm",
     False,
     False,
     "GPT4-o (Highly Recommended)"
    ],
    [
    Image.open("./assets/anime_flower/anime_flower.png").convert("RGBA"),
     "remove the flower.", 
     642087011,
     "anime_flower",
     "anime_flower",
     False,
     False,
     "GPT4-o (Highly Recommended)"
    ],
    [
    Image.open("./assets/chenduling/chengduling.jpg").convert("RGBA"),
     "replace the clothes to a delicated floral skirt.", 
     648464818,
     "chenduling",
     "chenduling",
     True,
     False,
     "GPT4-o (Highly Recommended)"
    ],
    [
    Image.open("./assets/hedgehog_rp_bg/hedgehog.png").convert("RGBA"),
     "make the hedgehog in Italy.", 
     648464818,
     "hedgehog_rp_bg",
     "hedgehog_rp_bg",
     True,
     False,
     "GPT4-o (Highly Recommended)"
    ],
]
INPUT_IMAGE_PATH = {
    "frog": "./assets/frog/frog.jpeg",
    "chinese_girl": "./assets/chinese_girl/chinese_girl.png",
    "angel_christmas": "./assets/angel_christmas/angel_christmas.png",
    "sunflower_girl": "./assets/sunflower_girl/sunflower_girl.png",
    "girl_on_sun": "./assets/girl_on_sun/girl_on_sun.png",
    "spider_man_rm": "./assets/spider_man_rm/spider_man.png",
    "anime_flower": "./assets/anime_flower/anime_flower.png",
    "chenduling": "./assets/chenduling/chengduling.jpg",
    "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/hedgehog.png",
}
MASK_IMAGE_PATH = {
    "frog": "./assets/frog/mask_f7b350de-6f2c-49e3-b535-995c486d78e7.png",
    "chinese_girl": "./assets/chinese_girl/mask_54759648-0989-48e0-bc82-f20e28b5ec29.png",
    "angel_christmas": "./assets/angel_christmas/mask_f15d9b45-c978-4e3d-9f5f-251e308560c3.png",
    "sunflower_girl": "./assets/sunflower_girl/mask_99cc50b4-7dc4-4de5-8748-ec10772f0317.png",
    "girl_on_sun": "./assets/girl_on_sun/mask_264eac8b-8b65-479c-9755-020a60880c37.png",
    "spider_man_rm": "./assets/spider_man_rm/mask_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png",
    "anime_flower": "./assets/anime_flower/mask_37553172-9b38-4727-bf2e-37d7e2b93461.png",
    "chenduling": "./assets/chenduling/mask_68e3ff6f-da07-4b37-91df-13d6eed7b997.png",
    "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/mask_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png",
}
MASKED_IMAGE_PATH = {
    "frog": "./assets/frog/masked_image_f7b350de-6f2c-49e3-b535-995c486d78e7.png",
    "chinese_girl": "./assets/chinese_girl/masked_image_54759648-0989-48e0-bc82-f20e28b5ec29.png",
    "angel_christmas": "./assets/angel_christmas/masked_image_f15d9b45-c978-4e3d-9f5f-251e308560c3.png",
    "sunflower_girl": "./assets/sunflower_girl/masked_image_99cc50b4-7dc4-4de5-8748-ec10772f0317.png",
    "girl_on_sun": "./assets/girl_on_sun/masked_image_264eac8b-8b65-479c-9755-020a60880c37.png",
    "spider_man_rm": "./assets/spider_man_rm/masked_image_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png",
    "anime_flower": "./assets/anime_flower/masked_image_37553172-9b38-4727-bf2e-37d7e2b93461.png",
    "chenduling": "./assets/chenduling/masked_image_68e3ff6f-da07-4b37-91df-13d6eed7b997.png",
    "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/masked_image_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png",
}
OUTPUT_IMAGE_PATH = {
    "frog": "./assets/frog/image_edit_f7b350de-6f2c-49e3-b535-995c486d78e7_1.png",
    "chinese_girl": "./assets/chinese_girl/image_edit_54759648-0989-48e0-bc82-f20e28b5ec29_1.png",
    "angel_christmas": "./assets/angel_christmas/image_edit_f15d9b45-c978-4e3d-9f5f-251e308560c3_0.png",
    "sunflower_girl": "./assets/sunflower_girl/image_edit_99cc50b4-7dc4-4de5-8748-ec10772f0317_3.png",
    "girl_on_sun": "./assets/girl_on_sun/image_edit_264eac8b-8b65-479c-9755-020a60880c37_0.png",
    "spider_man_rm": "./assets/spider_man_rm/image_edit_a5d410e6-8e8d-432f-8144-defbc3e1eae9_0.png",
    "anime_flower": "./assets/anime_flower/image_edit_37553172-9b38-4727-bf2e-37d7e2b93461_2.png",
    "chenduling": "./assets/chenduling/image_edit_68e3ff6f-da07-4b37-91df-13d6eed7b997_0.png",
    "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/image_edit_db7f8bf8-8349-46d3-b14e-43d67fbe25d3_3.png",
}
# os.environ['GRADIO_TEMP_DIR'] = 'gradio_temp_dir'
# os.makedirs('gradio_temp_dir', exist_ok=True)
VLM_MODEL_NAMES = list(vlms_template.keys())
DEFAULT_VLM_MODEL_NAME = "Qwen2-VL-7B-Instruct (Default)"
BASE_MODELS = list(base_models_template.keys())
DEFAULT_BASE_MODEL = "realisticVision (Default)"
ASPECT_RATIO_LABELS = list(aspect_ratios)
DEFAULT_ASPECT_RATIO = ASPECT_RATIO_LABELS[0]
## init device
try:
    if torch.cuda.is_available():
        device = "cuda"
    elif sys.platform == "darwin" and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
except Exception:
    device = "cpu"
# ## init torch dtype
# if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
#     torch_dtype = torch.bfloat16
# else:
#     torch_dtype = torch.float16
# if device == "mps":
#     torch_dtype = torch.float16
torch_dtype = torch.float16
# download hf models
BrushEdit_path = "models/"
if not os.path.exists(BrushEdit_path):
    BrushEdit_path = snapshot_download(
        repo_id="TencentARC/BrushEdit",
        local_dir=BrushEdit_path,
        token=os.getenv("HF_TOKEN"),
    )
## init default VLM
vlm_type, vlm_local_path, vlm_processor, vlm_model = vlms_template[DEFAULT_VLM_MODEL_NAME]
if vlm_processor != "" and vlm_model != "":
    vlm_model.to(device)
else:
    raise gr.Error("Please download the default VLM model " + DEFAULT_VLM_MODEL_NAME + " first.")
## init default LLM
llm_model = OpenAI(api_key=base_api_key, base_url=base_openai_url)
## init base model
base_model_path = os.path.join(BrushEdit_path, "base_model/realisticVisionV60B1_v51VAE")
brushnet_path = os.path.join(BrushEdit_path, "brushnetX")
sam_path = os.path.join(BrushEdit_path, "sam/sam_vit_h_4b8939.pth")
groundingdino_path = os.path.join(BrushEdit_path, "grounding_dino/groundingdino_swint_ogc.pth")
# input brushnetX ckpt path
brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch_dtype)
pipe = StableDiffusionBrushNetPipeline.from_pretrained(
        base_model_path, brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
    )
# speed up diffusion process with faster scheduler and memory optimization
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# uncomment the following line to enable xformers memory-efficient attention (not needed with Torch 2.0)
# pipe.enable_xformers_memory_efficient_attention()
pipe.enable_model_cpu_offload()
## init SAM
sam = build_sam(checkpoint=sam_path)
sam.to(device=device)
sam_predictor = SamPredictor(sam)
sam_automask_generator = SamAutomaticMaskGenerator(sam)
## init groundingdino_model
config_file = 'app/utils/GroundingDINO_SwinT_OGC.py'
groundingdino_model = load_grounding_dino_model(config_file, groundingdino_path, device=device)
## Ordinary function
def crop_and_resize(image: Image.Image, 
                    target_width: int, 
                    target_height: int) -> Image.Image:
    """
    Crops and resizes an image while preserving the aspect ratio.
    Args:
        image (Image.Image): Input PIL image to be cropped and resized.
        target_width (int): Target width of the output image.
        target_height (int): Target height of the output image.
    Returns:
        Image.Image: Cropped and resized image.
    """
    # Original dimensions
    original_width, original_height = image.size
    original_aspect = original_width / original_height
    target_aspect = target_width / target_height
    # Calculate crop box to maintain aspect ratio
    if original_aspect > target_aspect:
        # Crop horizontally
        new_width = int(original_height * target_aspect)
        new_height = original_height
        left = (original_width - new_width) / 2
        top = 0
        right = left + new_width
        bottom = original_height
    else:
        # Crop vertically
        new_width = original_width
        new_height = int(original_width / target_aspect)
        left = 0
        top = (original_height - new_height) / 2
        right = original_width
        bottom = top + new_height
    # Crop and resize
    cropped_image = image.crop((left, top, right, bottom))
    resized_image = cropped_image.resize((target_width, target_height), Image.NEAREST)
    return resized_image
## Ordinary function
def resize(image: Image.Image, 
                    target_width: int, 
                    target_height: int) -> Image.Image:
    """
    Crops and resizes an image while preserving the aspect ratio.
    Args:
        image (Image.Image): Input PIL image to be cropped and resized.
        target_width (int): Target width of the output image.
        target_height (int): Target height of the output image.
    Returns:
        Image.Image: Cropped and resized image.
    """
    # Original dimensions
    resized_image = image.resize((target_width, target_height), Image.NEAREST)
    return resized_image
def move_mask_func(mask, direction, units):
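    """
    Translate a binary mask by `units` pixels in the given direction.
    Args:
        mask: numpy mask array; any value > 0 is treated as foreground.
        direction: one of 'up', 'down', 'left', 'right'.
        units: number of pixels to shift the mask.
    Returns:
        A boolean array of the same height/width containing the shifted mask.
    """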
    binary_mask = mask.squeeze()>0
    rows, cols = binary_mask.shape
    moved_mask = np.zeros_like(binary_mask, dtype=bool)
    if direction == 'down':
        # move down
        moved_mask[max(0, units):, :] = binary_mask[:rows - units, :]
    elif direction == 'up':
        # move up
        moved_mask[:rows - units, :] = binary_mask[units:, :]
    elif direction == 'right':
        # move right
        moved_mask[:, max(0, units):] = binary_mask[:, :cols - units]
    elif direction == 'left':
        # move left
        moved_mask[:, :cols - units] = binary_mask[:, units:]
    return moved_mask
def random_mask_func(mask, dilation_type='square', dilation_size=20):
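    """
    Derive a new mask from `mask` using the requested transform:
    'square_dilation' / 'square_erosion' apply a square structuring element of
    size `dilation_size`, while 'bounding_box' / 'bounding_ellipse' replace the
    mask with its tight bounding box or bounding ellipse.
    Returns a uint8 mask of shape (h, w, 1) with values in {0, 255}.
    """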
    # Binarize the mask, then apply the requested dilation/erosion/bounding transform
    binary_mask = mask.squeeze()>0
    if dilation_type == 'square_dilation':
        structure = np.ones((dilation_size, dilation_size), dtype=bool)
        dilated_mask = binary_dilation(binary_mask, structure=structure)
    elif dilation_type == 'square_erosion':
        structure = np.ones((dilation_size, dilation_size), dtype=bool)
        dilated_mask = binary_erosion(binary_mask, structure=structure)
    elif dilation_type == 'bounding_box':
        # find the tight bounding box around the mask
        rows, cols = np.where(binary_mask)
        if len(rows) == 0 or len(cols) == 0:
            return mask  # return original mask if no valid points
        min_row = np.min(rows)
        max_row = np.max(rows)
        min_col = np.min(cols)
        max_col = np.max(cols)
        # create a bounding box
        dilated_mask = np.zeros_like(binary_mask, dtype=bool)
        dilated_mask[min_row:max_row + 1, min_col:max_col + 1] = True
    elif dilation_type == 'bounding_ellipse':
        # find the tight bounding box around the mask, then fit an ellipse to it
        rows, cols = np.where(binary_mask)
        if len(rows) == 0 or len(cols) == 0:
            return mask  # return original mask if no valid points
        min_row = np.min(rows)
        max_row = np.max(rows)
        min_col = np.min(cols)
        max_col = np.max(cols)
        # calculate the center and semi-axes of the bounding ellipse
        center = ((min_col + max_col) // 2, (min_row + max_row) // 2)
        a = max((max_col - min_col) // 2, 1)  # horizontal semi-axis (guard against zero division)
        b = max((max_row - min_row) // 2, 1)  # vertical semi-axis (guard against zero division)
        # create a bounding ellipse
        y, x = np.ogrid[:mask.shape[0], :mask.shape[1]]
        ellipse_mask = ((x - center[0])**2 / a**2 + (y - center[1])**2 / b**2) <= 1
        dilated_mask = np.zeros_like(binary_mask, dtype=bool)
        dilated_mask[ellipse_mask] = True
    else:
        raise ValueError("dilation_type must be one of 'square_dilation', 'square_erosion', 'bounding_box', or 'bounding_ellipse'")
    # convert back to a uint8 mask in [0, 255] with a trailing channel dimension
    dilated_mask = np.uint8(dilated_mask[:,:,np.newaxis]) * 255
    return dilated_mask
## Gradio component function
def update_vlm_model(vlm_name):
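    """
    Switch the active VLM. Preloaded models (registered in vlms_template) are
    simply moved to the target device; otherwise the weights are loaded from
    the local path if it exists, or downloaded from the Hugging Face Hub.
    """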
    global vlm_model, vlm_processor
    if vlm_model is not None:
        del vlm_model
        torch.cuda.empty_cache()
    vlm_type, vlm_local_path, vlm_processor, vlm_model = vlms_template[vlm_name]
    
    ## We recommend preloading models; otherwise the first download can take a long time. Edit app/src/vlm_template.py to change this behavior.
    if vlm_type == "llava-next":
        if vlm_processor != "" and vlm_model != "":
            vlm_model.to(device)
            return vlm_model_dropdown
        else:
            if os.path.exists(vlm_local_path):
                vlm_processor = LlavaNextProcessor.from_pretrained(vlm_local_path)
                vlm_model = LlavaNextForConditionalGeneration.from_pretrained(vlm_local_path, torch_dtype="auto", device_map="auto")
            else:
                if vlm_name == "llava-v1.6-mistral-7b-hf (Preload)":
                    vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype="auto", device_map="auto")
                elif vlm_name == "llama3-llava-next-8b-hf (Preload)":
                    vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llama3-llava-next-8b-hf")
                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llama3-llava-next-8b-hf", torch_dtype="auto", device_map="auto")
                elif vlm_name == "llava-v1.6-vicuna-13b-hf (Preload)":
                    vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-13b-hf")
                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-vicuna-13b-hf", torch_dtype="auto", device_map="auto")
                elif vlm_name == "llava-v1.6-34b-hf (Preload)":
                    vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-34b-hf")
                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-34b-hf", torch_dtype="auto", device_map="auto")
                elif vlm_name == "llava-next-72b-hf (Preload)":
                    vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-next-72b-hf")
                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-next-72b-hf", torch_dtype="auto", device_map="auto")
    elif vlm_type == "qwen2-vl":
        if vlm_processor != "" and vlm_model != "":
            vlm_model.to(device)
            return vlm_model_dropdown
        else:
            if os.path.exists(vlm_local_path):
                vlm_processor = Qwen2VLProcessor.from_pretrained(vlm_local_path)
                vlm_model = Qwen2VLForConditionalGeneration.from_pretrained(vlm_local_path, torch_dtype="auto", device_map="auto")
            else:
                if vlm_name == "qwen2-vl-2b-instruct (Preload)":
                    vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto")
                elif vlm_name == "qwen2-vl-7b-instruct (Preload)":
                    vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto")
                elif vlm_name == "qwen2-vl-72b-instruct (Preload)":
                    vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-72B-Instruct")
                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-72B-Instruct", torch_dtype="auto", device_map="auto")
    elif vlm_type == "openai":
        pass
    return "success"
def update_base_model(base_model_name):
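    """
    Switch the Stable Diffusion base model behind the BrushNet pipeline.
    Preloaded pipelines (registered in base_models_template) are moved to the
    device; otherwise the pipeline is rebuilt from the local checkpoint path.
    """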
    global pipe
    ## We recommend preloading models; otherwise the first download can take a long time. Edit app/src/base_model_template.py to change this behavior.
    if pipe is not None:
        del pipe
        torch.cuda.empty_cache()
    base_model_path, pipe = base_models_template[base_model_name]
    if pipe != "":
        pipe.to(device)
    else:
        if os.path.exists(base_model_path):
            pipe = StableDiffusionBrushNetPipeline.from_pretrained(
                base_model_path, brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
            )
            # pipe.enable_xformers_memory_efficient_attention()
            pipe.enable_model_cpu_offload()
        else:
            raise gr.Error(f"The base model {base_model_name} does not exist")
    return "success"
    
def process(input_image, 
    original_image, 
    original_mask, 
    prompt, 
    negative_prompt, 
    control_strength, 
    seed, 
    randomize_seed, 
    guidance_scale, 
    num_inference_steps,
    num_samples,
    blending,
    category,
    target_prompt,
    resize_default,
    aspect_ratio_name,
    invert_mask_state):
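    """
    Main editing entry point: infer the editing category, the object to edit,
    the mask, and the target prompt with the VLM when they are not provided,
    then run the BrushEdit inpainting pipeline on the (optionally resized)
    image and return the edited images, mask previews, and updated UI state.
    """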
    if original_image is None:
        if input_image is None:
            raise gr.Error('Please upload the input image')
        else:
            print("input_image的键:", input_image.keys())  # 打印字典键
            image_pil = input_image["background"].convert("RGB")
            original_image = np.array(image_pil)
    if prompt is None or prompt == "":
        if target_prompt is None or target_prompt == "":
            raise gr.Error("Please input your instructions, e.g., remove the xxx")
    
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":    
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass 
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)
    if not invert_mask_state and input_mask.max() > 0:
        original_mask = input_mask
    
    ## inpainting directly if target_prompt is not None
    if category is not None:
        pass
    elif target_prompt is not None and len(target_prompt) >= 1 and original_mask is not None:
        pass
    else:
        try:
            category = vlm_response_editing_type(vlm_processor, vlm_model, original_image, prompt, device)
        except Exception as e:
            raise gr.Error("Please select the correct VLM model and input the correct API Key first!")
    
    if original_mask is not None:
        original_mask = np.clip(original_mask, 0, 255).astype(np.uint8)
    else:
        try:
            object_wait_for_edit = vlm_response_object_wait_for_edit(
                                                vlm_processor, 
                                                vlm_model, 
                                                original_image,
                                                category, 
                                                prompt,
                                                device)
            original_mask = vlm_response_mask(vlm_processor,
                                            vlm_model,
                                            category, 
                                            original_image, 
                                            prompt, 
                                            object_wait_for_edit, 
                                            sam,
                                            sam_predictor,
                                            sam_automask_generator,
                                            groundingdino_model,
                                            device).astype(np.uint8)
        except Exception as e:
            raise gr.Error("Please select the correct VLM model and input the correct API Key first!")
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]
    
    if target_prompt is not None and len(target_prompt) >= 1:
        prompt_after_apply_instruction = target_prompt
        
    else:
        try:
            prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(
                                                                    vlm_processor, 
                                                                    vlm_model, 
                                                                    original_image,
                                                                    prompt,
                                                                    device)
        except Exception as e:
            raise gr.Error("Please select the correct VLM model and input the correct API Key first!")
    generator = torch.Generator(device).manual_seed(random.randint(0, 2147483647) if randomize_seed else seed)
    with torch.autocast(device):
        image, mask_image, mask_np, init_image_np = BrushEdit_Pipeline(pipe, 
                                    prompt_after_apply_instruction,
                                    original_mask,
                                    original_image,
                                    generator,
                                    num_inference_steps,
                                    guidance_scale,
                                    control_strength,
                                    negative_prompt,
                                    num_samples,
                                    blending)
    original_image = np.array(init_image_np)
    masked_image = original_image * (1 - (mask_np>0))
    masked_image = masked_image.astype(np.uint8)
    masked_image = Image.fromarray(masked_image)
    # Save the images (optional)
    # import uuid
    # uuid = str(uuid.uuid4())
    # image[0].save(f"outputs/image_edit_{uuid}_0.png")
    # image[1].save(f"outputs/image_edit_{uuid}_1.png")
    # image[2].save(f"outputs/image_edit_{uuid}_2.png")
    # image[3].save(f"outputs/image_edit_{uuid}_3.png")
    # mask_image.save(f"outputs/mask_{uuid}.png")
    # masked_image.save(f"outputs/masked_image_{uuid}.png")
    gr.Info(f"Target Prompt: {prompt_after_apply_instruction}", duration=20)
    return image, [mask_image], [masked_image], prompt, '', False
def process_mask(input_image, 
    original_image, 
    prompt,
    resize_default,
    aspect_ratio_name):
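    """
    Build an editing mask for the current image: if the user has not drawn a
    mask, ask the VLM + GroundingDINO + SAM stack to segment the object named
    in the instruction; otherwise use the drawn alpha mask. Returns the masked
    image preview, the mask preview, the raw mask, and the editing category.
    """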
    if original_image is None:
        raise gr.Error('Please upload the input image')
    if prompt is None:
        raise gr.Error("Please input your instructions, e.g., remove the xxx")
    ## load mask
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.array(alpha_mask)
    # load example image
    if isinstance(original_image, str):
        original_image = input_image["background"]
    if input_mask.max() == 0:
        category = vlm_response_editing_type(vlm_processor, vlm_model, original_image, prompt, device)
        object_wait_for_edit = vlm_response_object_wait_for_edit(vlm_processor, 
                                                                vlm_model, 
                                                                original_image,
                                                                category, 
                                                                prompt,
                                                                device)
        # original mask: h,w,1 [0, 255]
        original_mask = vlm_response_mask(
                                vlm_processor,
                                vlm_model,
                                category, 
                                original_image, 
                                prompt, 
                                object_wait_for_edit, 
                                sam,
                                sam_predictor,
                                sam_automask_generator,
                                groundingdino_model,
                                device).astype(np.uint8)
    else:
        original_mask = input_mask.astype(np.uint8)
        category = None
    ## resize mask if needed
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":    
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass 
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]
    mask_image = Image.fromarray(original_mask.squeeze().astype(np.uint8)).convert("RGB")
    masked_image = original_image * (1 - (original_mask>0))
    masked_image = masked_image.astype(np.uint8)
    masked_image = Image.fromarray(masked_image)
    return [masked_image], [mask_image], original_mask.astype(np.uint8), category
def process_random_mask(input_image, 
                         original_image, 
                         original_mask, 
                         resize_default, 
                         aspect_ratio_name, 
                         ):
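    """
    Replace the current mask with a random bounding region (box or ellipse)
    around it, to loosen the editing area. Returns the masked image preview,
    the mask preview, and the new mask.
    """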
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":    
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass 
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)
    if input_mask.max() > 0:
        original_mask = input_mask
    
    if original_mask is None:
        raise gr.Error('Please generate mask first')
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]
    dilation_type = np.random.choice(['bounding_box', 'bounding_ellipse'])
    random_mask = random_mask_func(original_mask, dilation_type).squeeze()
    mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
    masked_image = original_image * (1 - (random_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)
    return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
def process_dilation_mask(input_image, 
                          original_image, 
                          original_mask, 
                          resize_default, 
                          aspect_ratio_name, 
                          dilation_size=20):
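    """
    Expand the current mask with a square dilation of size `dilation_size`.
    Returns the masked image preview, the mask preview, and the new mask.
    """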
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":    
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass 
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)
    if input_mask.max() > 0:
        original_mask = input_mask
    if original_mask is None:
        raise gr.Error('Please generate mask first')
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]
    dilation_type = np.random.choice(['square_dilation'])
    random_mask = random_mask_func(original_mask, dilation_type, dilation_size).squeeze()
    mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
    masked_image = original_image * (1 - (random_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)
    return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
def process_erosion_mask(input_image, 
                         original_image, 
                         original_mask, 
                         resize_default, 
                         aspect_ratio_name, 
                         dilation_size=20):
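    """
    Shrink the current mask with a square erosion of size `dilation_size`.
    Returns the masked image preview, the mask preview, and the new mask.
    """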
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":    
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass 
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)
    if input_mask.max() > 0:
        original_mask = input_mask
    if original_mask is None:
        raise gr.Error('Please generate mask first')
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]
    dilation_type = np.random.choice(['square_erosion'])
    random_mask = random_mask_func(original_mask, dilation_type, dilation_size).squeeze()
    mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
    masked_image = original_image * (1 - (random_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)
    return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
def move_mask_left(input_image, 
                   original_image, 
                   original_mask, 
                   moving_pixels, 
                   resize_default, 
                   aspect_ratio_name):
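    """
    Shift the current mask left by `moving_pixels` pixels (after the shared
    resize handling); move_mask_right/up/down below follow the same pattern.
    Returns the masked image preview, the mask preview, and the updated mask.
    """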
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":    
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass 
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)
    if input_mask.max() > 0:
        original_mask = input_mask
    if original_mask is None:
        raise gr.Error('Please generate mask first')
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]
    moved_mask = move_mask_func(original_mask, 'left', int(moving_pixels)).squeeze()
    mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
    masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)
    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
        original_mask = moved_mask
    return [masked_image], [mask_image], original_mask.astype(np.uint8)
def move_mask_right(input_image, 
                    original_image, 
                    original_mask, 
                    moving_pixels, 
                    resize_default, 
                    aspect_ratio_name):
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":    
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass 
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)
    if input_mask.max() > 0:
        original_mask = input_mask
    if original_mask is None:
        raise gr.Error('Please generate mask first')
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]
    moved_mask = move_mask_func(original_mask, 'right', int(moving_pixels)).squeeze()
    mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
    masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)
    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
        original_mask = moved_mask
    return [masked_image], [mask_image], original_mask.astype(np.uint8)
def move_mask_up(input_image, 
                 original_image, 
                 original_mask, 
                 moving_pixels, 
                 resize_default, 
                 aspect_ratio_name):
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask) 
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":    
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass 
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)
    if input_mask.max() > 0:
        original_mask = input_mask
    if original_mask is None:
        raise gr.Error('Please generate mask first')
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]
    moved_mask = move_mask_func(original_mask, 'up', int(moving_pixels)).squeeze()    
    mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
    masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)
    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
        original_mask = moved_mask
    return [masked_image], [mask_image], original_mask.astype(np.uint8)          
def move_mask_down(input_image, 
                   original_image, 
                   original_mask, 
                   moving_pixels, 
                   resize_default, 
                   aspect_ratio_name):
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":    
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass 
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)
    if input_mask.max() > 0:
        original_mask = input_mask
    if original_mask is None:
        raise gr.Error('Please generate mask first')
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]
    moved_mask = move_mask_func(original_mask, 'down', int(moving_pixels)).squeeze()
    mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
         
    masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)
    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
        original_mask = moved_mask  
    return [masked_image], [mask_image], original_mask.astype(np.uint8)
def invert_mask(input_image, 
                original_image, 
                original_mask,
                ):
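    """
    Invert the current mask (the drawn alpha mask if present, otherwise the
    stored mask) and return the masked image preview, the mask preview, the
    inverted mask, and True to flag invert_mask_state.
    """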
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask) 
    if input_mask.max() == 0:
        if original_mask is None:
            raise gr.Error('Please generate mask first')
        original_mask = 1 - (original_mask>0).astype(np.uint8)
    else:
        original_mask = 1 - (input_mask>0).astype(np.uint8)
    original_mask = original_mask.squeeze()
    mask_image = Image.fromarray(original_mask*255).convert("RGB")
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]
    if original_mask.max() <= 1:
        original_mask = (original_mask * 255).astype(np.uint8)
    masked_image = original_image * (1 - (original_mask>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)
    
    return [masked_image], [mask_image], original_mask, True
def init_img(base, 
             init_type, 
             prompt,
             aspect_ratio,
             example_change_times
             ):
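    """
    Initialize the editor when a new image or example is loaded. For known
    examples (and fewer than two example switches) the cached mask, masked
    image, and edited result are shown, resized to the pipeline's default
    resolution; otherwise the galleries are cleared and the state is reset.
    """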
    image_pil = base["background"].convert("RGB")
    original_image = np.array(image_pil)
    if max(original_image.shape[0], original_image.shape[1]) * 1.0 / min(original_image.shape[0], original_image.shape[1])>2.0:
        raise gr.Error('image aspect ratio cannot be larger than 2.0')
    if init_type in MASK_IMAGE_PATH.keys() and example_change_times < 2:
        mask_gallery = [Image.open(MASK_IMAGE_PATH[init_type]).convert("L")]
        masked_gallery = [Image.open(MASKED_IMAGE_PATH[init_type]).convert("RGB")]
        result_gallery = [Image.open(OUTPUT_IMAGE_PATH[init_type]).convert("RGB")]
        width, height = image_pil.size
        image_processor = VaeImageProcessor(vae_scale_factor=pipe.vae_scale_factor, do_convert_rgb=True)
        height_new, width_new = image_processor.get_default_height_width(image_pil, height, width)
        image_pil = image_pil.resize((width_new, height_new))
        mask_gallery[0] = mask_gallery[0].resize((width_new, height_new))
        masked_gallery[0] = masked_gallery[0].resize((width_new, height_new))
        result_gallery[0] = result_gallery[0].resize((width_new, height_new))
        original_mask = np.array(mask_gallery[0]).astype(np.uint8)[:,:,None] # h,w,1
        return base, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, "", "", "Custom resolution", False, False, example_change_times
    else:
        if aspect_ratio not in ASPECT_RATIO_LABELS:
            aspect_ratio = "Custom resolution"
        return base, original_image, None, "", None, None, None, "", "", aspect_ratio, True, False, 0
def reset_func(input_image, 
               original_image, 
               original_mask, 
               prompt, 
               target_prompt, 
               ):
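    """Clear the image, masks, prompts, and galleries, and free CUDA memory."""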
    input_image = None
    original_image = None
    original_mask = None
    prompt = ''
    mask_gallery = []
    masked_gallery = []
    result_gallery = []
    target_prompt = ''
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, True, False
def update_example(example_type, 
                   prompt, 
                   example_change_times):
    input_image = INPUT_IMAGE_PATH[example_type]
    image_pil = Image.open(input_image).convert("RGB")
    mask_gallery = [Image.open(MASK_IMAGE_PATH[example_type]).convert("L")]
    masked_gallery = [Image.open(MASKED_IMAGE_PATH[example_type]).convert("RGB")]
    result_gallery = [Image.open(OUTPUT_IMAGE_PATH[example_type]).convert("RGB")]
    width, height = image_pil.size
    image_processor = VaeImageProcessor(vae_scale_factor=pipe.vae_scale_factor, do_convert_rgb=True)
    height_new, width_new = image_processor.get_default_height_width(image_pil, height, width)
    image_pil = image_pil.resize((width_new, height_new))
    mask_gallery[0] = mask_gallery[0].resize((width_new, height_new))
    masked_gallery[0] = masked_gallery[0].resize((width_new, height_new))
    result_gallery[0] = result_gallery[0].resize((width_new, height_new))
    original_image = np.array(image_pil)
    original_mask = np.array(mask_gallery[0]).astype(np.uint8)[:,:,None] # h,w,1
    aspect_ratio = "Custom resolution"
    example_change_times += 1
    return input_image, prompt, original_image, original_mask, mask_gallery, masked_gallery, result_gallery, aspect_ratio, "", False, example_change_times
def generate_target_prompt(input_image, 
                           original_image, 
                           prompt):
    # original_image may still be an example path (str); fall back to the uploaded image
    if isinstance(original_image, str):
        original_image = input_image
    prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(
                                                            vlm_processor, 
                                                            vlm_model, 
                                                            original_image,
                                                            prompt,
                                                            device)
    return prompt_after_apply_instruction
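# Asks the active VLM to rewrite the editing instruction into a caption of the expected
# edited image; the result pre-fills the "Input Target Prompt" textbox.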
# Newly added event handlers: BLIP captioning, DeepSeek key verification, and description integration/decomposition
def generate_blip_description(input_image):
    if input_image is None:
        return "", "Input image cannot be None"
    from app.utils.utils import generate_caption
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", torch_dtype=torch.float16).to(device)
    try:
        image_pil = input_image["background"].convert("RGB")
    except KeyError:
        return "", "Input image missing 'background' key"
    except AttributeError as e:
        return "", f"Invalid image object: {str(e)}"
    try:
        description = generate_caption(blip_processor, blip_model, image_pil, device)
        return description, description  # update both the gr.State and the visible textbox
    except Exception as e:
        return "", f"Caption generation failed: {str(e)}"
def submit_GPT4o_KEY(GPT4o_KEY):
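    # Despite the GPT4o naming, this handler builds a DeepSeek client and validates the
    # submitted key against the DeepSeek chat endpoint ("deepseek-chat").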
    global vlm_model, vlm_processor
    if vlm_model is not None:
        del vlm_model
        torch.cuda.empty_cache()
    try:
        vlm_model = OpenAI(api_key=GPT4o_KEY, base_url="https://api.deepseek.com")
        vlm_processor = ""
        response = vlm_model.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": "Hello."}
                ]
            )
        response_str = response.choices[0].message.content
     
        return "Success. " + response_str, "GPT4-o (Highly Recommended)"
    except Exception as e:
        return "Invalid GPT4o API Key", "GPT4-o (Highly Recommended)"
    
def verify_deepseek_api():
    try:
        response = llm_model.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": "Hello."}
                ]
            )
        response_str = response.choices[0].message.content
        return True, "Success. " + response_str
    except Exception as e:
        return False, "Invalid DeepSeek API Key"
def llm_enhanced_prompt_after_apply_instruction(image_caption, editing_prompt):
    try:
        messages = create_apply_editing_messages_deepseek(image_caption, editing_prompt)
        response = llm_model.chat.completions.create(
            model="deepseek-chat",
            messages=messages
        )
        response_str = response.choices[0].message.content
        return response_str
    except Exception as e:
        raise gr.Error(f"整合指令时遇到错误: {str(e)},请检查控制台日志获取详细信息")
def llm_decomposed_prompt_after_apply_instruction(integrated_query):
    try:
        messages = create_decomposed_query_messages_deepseek(integrated_query)
        response = llm_model.chat.completions.create(
            model="deepseek-chat",
            messages=messages
        )
        response_str = response.choices[0].message.content
        return response_str
    except Exception as e:
        raise gr.Error(f"分解指令时遇到错误: {str(e)},请检查控制台日志获取详细信息")
def enhance_description(blip_description, prompt):
    try:
        if not prompt or not blip_description:
            print("Empty prompt or blip_description detected")
            return "", ""
            
        print(f"Enhancing with prompt: {prompt}")
        enhanced_description = llm_enhanced_prompt_after_apply_instruction(blip_description, prompt)
        return enhanced_description, enhanced_description
        
    except Exception as e:
        print(f"Enhancement failed: {str(e)}")
        return "Error occurred", "Error occurred"
def decompose_description(enhanced_description):
    try:
        if not enhanced_description:
            print("Empty enhanced_description detected")
            return "", ""
            
        print(f"Decomposing the enhanced description: {enhanced_description}")
        decomposed_description = llm_decomposed_prompt_after_apply_instruction(enhanced_description)
        return decomposed_description, decomposed_description
        
    except Exception as e:
        print(f"Decomposition failed: {str(e)}")
        return "Error occurred", "Error occurred"
block = gr.Blocks(
        theme=gr.themes.Soft(
             radius_size=gr.themes.sizes.radius_none,
             text_size=gr.themes.sizes.text_md
         )
        )
with block as demo:
    with gr.Row():
        with gr.Column(): 
            gr.HTML(head)
    gr.Markdown(descriptions)
    with gr.Accordion(label="🧭 Instructions:", open=True, elem_id="accordion"):
        with gr.Row(equal_height=True):
            gr.Markdown(instructions)
    original_image = gr.State(value=None)
    original_mask = gr.State(value=None)
    category = gr.State(value=None)
    status = gr.State(value=None)
    invert_mask_state = gr.State(value=False)
    example_change_times = gr.State(value=0)
    deepseek_verified = gr.State(value=False)
    blip_description = gr.State(value="")
    enhanced_description = gr.State(value="")
    decomposed_description = gr.State(value="")
    with gr.Row():
        with gr.Column():
            with gr.Row():
                input_image = gr.ImageEditor( 
                    label="参考图像",
                    type="pil",
                    brush=gr.Brush(colors=["#FFFFFF"], default_size = 30, color_mode="fixed"),
                    layers = False,
                    interactive=True,
                    # height=1024,
                    height=512,
                    sources=["upload"],
                    placeholder="🫧 点击此处或下面的图标上传图像 🫧",
                    )
            prompt = gr.Textbox(label="修改指令", placeholder="😜 在此处输入你对参考图像的修改预期 😜", value="",lines=1)
            run_button = gr.Button("💫 图像编辑")
            
            vlm_model_dropdown = gr.Dropdown(label="VLM 模型", choices=VLM_MODEL_NAMES, value=DEFAULT_VLM_MODEL_NAME, interactive=True)
            with gr.Group():    
                with gr.Row():
                    # GPT4o_KEY = gr.Textbox(label="GPT4o API Key", placeholder="Please input your GPT4o API Key when use GPT4o VLM (highly recommended).", value="", lines=1)
                    GPT4o_KEY = gr.Textbox(label="密钥输入", value="sk-d145b963a92649a88843caeb741e8bbc", lines=1)
                    GPT4o_KEY_submit = gr.Button("🙈 验证")
            aspect_ratio = gr.Dropdown(label="Output aspect ratio", choices=ASPECT_RATIO_LABELS, value=DEFAULT_ASPECT_RATIO)
            resize_default = gr.Checkbox(label="Short edge resize to 640px", value=True)
            with gr.Row():
                mask_button = gr.Button("💎 掩膜生成")
                random_mask_button = gr.Button("Square/Circle Mask")
            
            with gr.Row():
                generate_target_prompt_button = gr.Button("Generate Target Prompt")
                
            target_prompt = gr.Text(
                        label="Input Target Prompt",
                        max_lines=5,
                        placeholder="VLM-generated target prompt, you can first generate if and then modify it (optional)",
                        value='',
                        lines=2
                    )
            with gr.Accordion("Advanced Options", open=False, elem_id="accordion1"):
                base_model_dropdown = gr.Dropdown(label="Base model", choices=BASE_MODELS, value=DEFAULT_BASE_MODEL, interactive=True)
                negative_prompt = gr.Text(
                        label="Negative Prompt",
                        max_lines=5,
                        placeholder="Please input your negative prompt",
                        value='ugly, low quality',lines=1
                    )
                                    
                control_strength = gr.Slider(
                    label="Control Strength: ", show_label=True, minimum=0, maximum=1.1, value=1, step=0.01
                    )
                with gr.Group():
                    seed = gr.Slider(
                        label="Seed: ", minimum=0, maximum=2147483647, step=1, value=648464818
                    )
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
                
                blending = gr.Checkbox(label="Blending mode", value=True)
                
                num_samples = gr.Slider(
                    label="Num samples", minimum=0, maximum=4, step=1, value=4
                )
                
                with gr.Group():
                    with gr.Row():
                        guidance_scale = gr.Slider(
                            label="Guidance scale",
                            minimum=1,
                            maximum=12,
                            step=0.1,
                            value=7.5,
                        )
                        num_inference_steps = gr.Slider(
                            label="Number of inference steps",
                            minimum=1,
                            maximum=50,
                            step=1,
                            value=50,
                        )
            
        with gr.Group(visible=True):
            # BLIP-generated description
            blip_output = gr.Textbox(label="原图描述", placeholder="💬 BLIP生成的图像基础描述 💬", interactive=True, lines=3)
            # DeepSeek API verification
            with gr.Row():
                deepseek_key = gr.Textbox(label="密钥输入", value="sk-d145b963a92649a88843caeb741e8bbc", lines=1)
                verify_deepseek = gr.Button("🙈 验证")
            # Integrated description area
            with gr.Row():
                enhanced_output = gr.Textbox(label="描述整合", placeholder="💭 DeepSeek生成的增强描述 💭", interactive=True, lines=3)
                enhance_button = gr.Button("✨ 整合")  
            # Decomposed description area
            with gr.Row():
                decomposed_output = gr.Textbox(label="描述分解", placeholder="🔍 DeepSeek生成的分解描述 🔍", interactive=True, lines=3)
                decompose_button = gr.Button("🔧 分解")  
            with gr.Row():
                with gr.Tab(elem_classes="feedback", label="Masked Image"):
                    masked_gallery = gr.Gallery(label='Masked Image', show_label=True, elem_id="gallery", preview=True, height=360)
                with gr.Tab(elem_classes="feedback", label="Mask"):
                    mask_gallery = gr.Gallery(label='Mask', show_label=True, elem_id="gallery", preview=True, height=360)
                
            invert_mask_button = gr.Button("Invert Mask")
            dilation_size = gr.Slider(
                        label="Dilation size: ", minimum=0, maximum=50, step=1, value=20
                    )
            with gr.Row():
                dilation_mask_button = gr.Button("Dilate Generated Mask")
                erosion_mask_button = gr.Button("Erode Generated Mask")
            moving_pixels = gr.Slider(
                    label="Moving pixels:", show_label=True, minimum=0, maximum=50, value=4, step=1
                    )
            with gr.Row():
                move_left_button = gr.Button("Move Left")
                move_right_button = gr.Button("Move Right")
            with gr.Row():
                move_up_button = gr.Button("Move Up")
                move_down_button = gr.Button("Move Down")
            
            with gr.Tab(elem_classes="feedback", label="Output"):
                result_gallery = gr.Gallery(label='Output', show_label=True, elem_id="gallery", preview=True, height=400)
            # target_prompt_output = gr.Text(label="Output Target Prompt", value="", lines=1, interactive=False)
            reset_button = gr.Button("Reset")
            init_type = gr.Textbox(label="Init Name", value="", visible=False)
            example_type = gr.Textbox(label="Example Name", value="", visible=False)
    with gr.Row():
        example = gr.Examples(
            label="Quick Example",
            examples=EXAMPLES,
            inputs=[input_image, prompt, seed, init_type, example_type, blending, resize_default, vlm_model_dropdown],
            examples_per_page=10,
            cache_examples=False,
        )
    
    with gr.Accordion(label="🎬 Feature Details:", open=True, elem_id="accordion"):
        with gr.Row(equal_height=True):
            gr.Markdown(tips)
    with gr.Row():
        gr.Markdown(citation)
    ## gr.Examples cannot update a gr.Gallery directly, so the two handlers below (init_img on upload, update_example on example selection) populate the galleries instead.
    ## They also resolve the conflict between the upload event and the example-change event.
    input_image.upload(
        init_img,
        [input_image, init_type, prompt, aspect_ratio, example_change_times],
        [input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, init_type, aspect_ratio, resize_default, invert_mask_state, example_change_times]
    ) 
    example_type.change(fn=update_example, inputs=[example_type, prompt, example_change_times], outputs=[input_image, prompt, original_image, original_mask, mask_gallery, masked_gallery, result_gallery, aspect_ratio, target_prompt, invert_mask_state, example_change_times])
    
    ## vlm and base model dropdown
    vlm_model_dropdown.change(fn=update_vlm_model, inputs=[vlm_model_dropdown], outputs=[status])
    base_model_dropdown.change(fn=update_base_model, inputs=[base_model_dropdown], outputs=[status])
    GPT4o_KEY_submit.click(fn=submit_GPT4o_KEY, inputs=[GPT4o_KEY], outputs=[GPT4o_KEY, vlm_model_dropdown])
    invert_mask_button.click(fn=invert_mask, inputs=[input_image, original_image, original_mask], outputs=[masked_gallery, mask_gallery, original_mask, invert_mask_state])
    ips=[input_image, 
         original_image, 
         original_mask, 
         prompt, 
         negative_prompt, 
         control_strength, 
         seed, 
         randomize_seed, 
         guidance_scale, 
         num_inference_steps,
         num_samples,
         blending,
         category,
         target_prompt,
         resize_default,
         aspect_ratio,
         invert_mask_state]
    ## run brushedit
    run_button.click(fn=process, inputs=ips, outputs=[result_gallery, mask_gallery, masked_gallery, prompt, target_prompt, invert_mask_state])
    
    ## mask func
    mask_button.click(fn=process_mask, inputs=[input_image, original_image, prompt, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask, category])
    random_mask_button.click(fn=process_random_mask, inputs=[input_image, original_image, original_mask, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
    dilation_mask_button.click(fn=process_dilation_mask, inputs=[input_image, original_image, original_mask, resize_default, aspect_ratio, dilation_size], outputs=[ masked_gallery, mask_gallery, original_mask])
    erosion_mask_button.click(fn=process_erosion_mask, inputs=[input_image, original_image, original_mask, resize_default, aspect_ratio, dilation_size], outputs=[ masked_gallery, mask_gallery, original_mask])
    ## move mask func
    move_left_button.click(fn=move_mask_left, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
    move_right_button.click(fn=move_mask_right, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
    move_up_button.click(fn=move_mask_up, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
    move_down_button.click(fn=move_mask_down, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])    
    ## prompt func
    generate_target_prompt_button.click(fn=generate_target_prompt, inputs=[input_image, original_image, prompt], outputs=[target_prompt])
    
    ## reset func
    reset_button.click(fn=reset_func, inputs=[input_image, original_image, original_mask, prompt, target_prompt], outputs=[input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, resize_default, invert_mask_state])
    
    # Bind the BLIP captioning and DeepSeek description handlers
    input_image.upload(fn=generate_blip_description, inputs=[input_image], outputs=[blip_description, blip_output])
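    # Note: verify_deepseek_api takes no inputs, so the key typed into `deepseek_key` is not
    # passed along; verification runs against the module-level `llm_model` client initialized earlier in this script.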
    verify_deepseek.click(fn=verify_deepseek_api, outputs=[deepseek_verified, deepseek_key])
    enhance_button.click(fn=enhance_description, inputs=[blip_description, prompt], outputs=[enhanced_description, enhanced_output])
    decompose_button.click(fn=decompose_description, inputs=[enhanced_description], outputs=[decomposed_description, decomposed_output])
demo.launch(server_name="0.0.0.0", server_port=12345, share=True)
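# The demo binds to all interfaces on port 12345 and also requests a public Gradio share link;
# for local-only use, share=True can be dropped and server_name set to "127.0.0.1".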