Update app.py

app.py CHANGED

@@ -6,13 +6,22 @@
 # Written by Xueyan Zou ([email protected]), Jianwei Yang ([email protected])
 # --------------------------------------------------------
 
-#
+# Setup paths and install dependencies before any imports
 import os
 import sys
 import subprocess
 
 print("Setting up SEEM environment...")
 
+# Install detectron2 first
+print("Installing detectron2...")
+try:
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/MaureenZOU/detectron2-xyz.git"])
+    print("Detectron2 installation complete!")
+except Exception as e:
+    print(f"Error installing detectron2: {e}")
+    sys.exit(1)
+
 # Create a custom distributed.py file that doesn't need mpi4py
 os.makedirs('utils', exist_ok=True)
 with open('utils/distributed.py', 'w') as f:
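
The generated utils/distributed.py is only partially visible here; its tail appears at the top of the next hunk. A minimal single-process stub consistent with the names this file uses (init_distributed, all_gather, reduce_dict) could look like the sketch below. Only reduce_dict is shown verbatim in the diff; the other bodies, and the is_main_process helper, are assumptions:

# Hypothetical no-op distributed stub for single-process runs.
def init_distributed(opt):
    # No mpi4py / torch.distributed setup needed for a single process.
    return opt

def is_main_process():
    return True  # assumed helper; always "rank 0" with one process

def all_gather(data):
    # With one process, the "gathered" list is just our own data.
    return [data]

def reduce_dict(input_dict, average=True):
    return input_dict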

@@ -58,16 +67,47 @@ def all_gather(data):
 def reduce_dict(input_dict, average=True):
     return input_dict
 """)
-print("Created custom distributed.py")
 
-#
-
-
-
-
-
-
-
+# Create a simple visualizer if it doesn't exist
+if not os.path.exists('utils/visualizer.py'):
+    with open('utils/visualizer.py', 'w') as f:
+        f.write("""# Simple visualizer class
+import numpy as np
+import cv2
+
+class Visualizer:
+    def __init__(self, img_rgb, metadata=None, scale=1.0):
+        self.img = img_rgb
+        self.metadata = metadata
+        self.scale = scale
+
+    def draw_binary_mask(self, mask, color=None, text=None):
+        if color is None:
+            color = [0, 1, 0]  # Default to green; colors are fractions in [0, 1]
+
+        mask_img = np.zeros_like(self.img, dtype=np.float32)
+        color_mask = np.array(color) * 255
+
+        for c in range(3):
+            mask_img[:, :, c] = color_mask[c]
+
+        mask_img = mask_img * mask[:, :, None] * 0.5
+        self.img = self.img * (1 - mask[:, :, None] * 0.5) + mask_img
+
+        if text:
+            # Simplified text placement
+            x, y = (np.where(mask)[0][0], np.where(mask)[1][0]) if np.any(mask) else (10, 10)
+            cv2.putText(self.img, text, (y, x), cv2.FONT_HERSHEY_SIMPLEX, 0.5, tuple(map(int, color_mask)), 1)
+
+        return self
+
+    def draw_panoptic_seg(self, panoptic_seg, segments_info):
+        # Simplified panoptic visualization - just a placeholder
+        return self
+
+    def get_image(self):
+        return self.img.astype(np.uint8)
+""")
 
 # Set Python path to include the repository root
 os.environ["PYTHONPATH"] = os.getcwd()
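
A quick smoke test for the generated stub (hypothetical usage, not part of the commit):

import numpy as np
from utils.visualizer import Visualizer

# Highlight a square region of a blank image with the default green overlay.
img = np.zeros((64, 64, 3), dtype=np.uint8)
mask = np.zeros((64, 64), dtype=bool)
mask[16:48, 16:48] = True

vis = Visualizer(img)
out = vis.draw_binary_mask(mask, color=[0, 1, 0]).get_image()
print(out.shape, out.dtype)  # (64, 64, 3) uint8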

@@ -92,7 +132,8 @@ from utils.distributed import init_distributed
 from utils.arguments import load_opt_from_config_files
 from utils.constants import COCO_PANOPTIC_CLASSES
 
-from
+# Import the interactive functions from the existing implementation
+from demo.seem.tasks.interactive import interactive_infer_image, interactive_infer_video
 
 def parse_option():
     parser = argparse.ArgumentParser('SEEM Demo', add_help=False)
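
Judging from the call sites added later in this diff, these helpers are invoked positionally as follows (inferred from this commit, not a documented signature):

# Inferred usage (see the inference() hunk below):
# interactive_infer_image(model, audio, image, task, refimg, reftxt, audio_pth, video_pth)
# interactive_infer_video(model, audio, image, task, refimg, reftxt, audio_pth, video_pth)
# where image and refimg are dicts: {"image": <PIL image>, "mask": <PIL image or None>}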

@@ -125,29 +166,59 @@ build model
 '''
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(f"Using device: {device}")
-
-
-model
+
+try:
+    model = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth).eval().to(device)
+    with torch.no_grad():
+        model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(COCO_PANOPTIC_CLASSES + ["background"], is_eval=True)
+    print("Model loaded successfully")
+    model_loaded = True
+except Exception as e:
+    print(f"Error loading model: {e}")
+    print("Continuing with simplified interface")
+    model = None
+    model_loaded = False
 
 '''
 audio
 '''
-
+try:
+    audio = whisper.load_model("base")
+    audio_loaded = True
+except Exception as e:
+    print(f"Error loading audio model: {e}")
+    audio = None
+    audio_loaded = False
 
 @torch.no_grad()
 def inference(image, task, *args, **kwargs):
-if
-
-
-
-
-
+    if not model_loaded:
+        # Return a placeholder image if model failed to load
+        warning_img = Image.new('RGB', (600, 400), color=(240, 240, 240))
+        d = ImageDraw.Draw(warning_img)
+        d.text((50, 150), "Model could not be loaded.", fill=(255, 0, 0))
+        d.text((50, 200), "Please check logs for details.", fill=(255, 0, 0))
+        return warning_img, None
+
+    # Prepare input parameters for the interactive functions
+    image_input = {"image": image, "mask": kwargs.get("mask", None)}
+    referring_image = kwargs.get("referring_image", None)
+
+    # If referring image is provided, prepare it in the expected format
+    refimg = None
+    if referring_image is not None:
+        refimg = {"image": referring_image, "mask": kwargs.get("referring_mask", None)}
+
+    # Get text and audio parameters
+    reftxt = kwargs.get("referring_text", "")
+    audio_pth = kwargs.get("referring_audio", None)
+    video_pth = kwargs.get("video", None)
+
+    # Call the appropriate interactive function
+    if 'Video' in task:
+        return interactive_infer_video(model, audio, image_input, task, refimg, reftxt, audio_pth, video_pth)
     else:
-
-        if 'Video' in task:
-            return interactive_infer_video(model, audio, image, task, *args, **kwargs)
-        else:
-            return interactive_infer_image(model, audio, image, task, *args, **kwargs)
+        return interactive_infer_image(model, audio, image_input, task, refimg, reftxt, audio_pth, video_pth)
 
 class ImageMask(gr.components.Image):
     """
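
The fallback branch assumes PIL's Image and ImageDraw are imported elsewhere in app.py. As a self-contained sketch of the same placeholder pattern (names are illustrative):

from PIL import Image, ImageDraw

def make_warning_image(lines, size=(600, 400)):
    # Light-gray canvas with red warning text, as in the diff above.
    img = Image.new('RGB', size, color=(240, 240, 240))
    draw = ImageDraw.Draw(img)
    y = 150
    for line in lines:
        draw.text((50, y), line, fill=(255, 0, 0))
        y += 50
    return img

placeholder = make_warning_image(["Model could not be loaded.",
                                  "Please check logs for details."])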

@@ -180,7 +251,14 @@ class Video(gr.components.Video):
 launch app
 '''
 title = "SEEM: Segment Everything Everywhere All At Once"
-
+
+# Update description based on model loading status
+if model_loaded:
+    model_status = f"<span style=\"color:green;\">✓ Model loaded successfully</span> (SEEM {cur_model})"
+else:
+    model_status = "<span style=\"color:red;\">✗ Model failed to load</span> (see logs for details)"
+
+description = f"""
 <div style="text-align: center; font-weight: bold;">
 <span style="font-size: 18px" id="paper-info">
 [<a href="https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once" target="_blank">GitHub</a>]

@@ -189,22 +267,12 @@ description = """
 </div>
 <div style="text-align: left; font-weight: bold;">
 <br>
-🌪
+🌪 Status: {model_status}
 </p>
 </div>
-"""
-
-'''Usage
-Instructions:
-🎈 Try our default examples first (Sketch is not automatically drawed on input and example image);
-🎈 For video demo, it takes about 30-60s to process, please refresh if you meet an error on uploading;
-🎈 Upload an image/video (If you want to use referred region of another image please check "Example" and upload another image in referring image panel);
-🎈 Select at least one type of prompt of your choice (If you want to use referred region of another image please check "Example");
-🎈 Remember to provide the actual prompt for each promt type you select, otherwise you will meet an error (e.g., rember to draw on the referring image);
-🎈 Our model by default support the vocabulary of COCO 133 categories, others will be classified to 'others' or misclassifed.
-'''
+"""
 
-article = "The Demo is Run on SEEM
+article = "The Demo is Run on SEEM"
 inputs = [ImageMask(label="[Stroke] Draw on Image",type="pil"), gr.inputs.CheckboxGroup(choices=["Stroke", "Example", "Text", "Audio", "Video", "Panoptic"], type="value", label="Interative Mode"), ImageMask(label="[Example] Draw on Referring Image",type="pil"), gr.Textbox(label="[Text] Referring Text"), gr.Audio(label="[Audio] Referring Audio", source="microphone", type="filepath"), gr.Video(label="[Video] Referring Video Segmentation",format="mp4",interactive=True)]
 gr.Interface(
     fn=inference,
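
Note that the unchanged inputs line relies on the legacy Gradio 3.x API (gr.inputs.CheckboxGroup, Audio(source=...), plus the misspelled "Interative Mode" label carried over from the original file). If the Space were moved to Gradio 4+, rough equivalents would be (a sketch assuming a recent Gradio, not part of this commit):

import gradio as gr

# Gradio 4.x replacements for two of the inputs above (sketch only).
mode = gr.CheckboxGroup(
    choices=["Stroke", "Example", "Text", "Audio", "Video", "Panoptic"],
    label="Interactive Mode",
)
ref_audio = gr.Audio(sources=["microphone"], type="filepath",
                     label="[Audio] Referring Audio")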

@@ -218,11 +286,11 @@ gr.Interface(
 ),
 ],
 examples=[
-    ["examples/corgi1.webp", ["Text"], "examples/corgi2.jpg", "The corgi.", None, None],
-    ["examples/river1.png", ["Text", "Audio"], "examples/river2.png", "The green trees.", "examples/river1.wav", None],
-    ["examples/zebras1.jpg", ["Example"], "examples/zebras2.jpg", "", None, None],
-    ["examples/fries1.png", ["Example"], "examples/fries2.png", "", None, None],
-    ["examples/placeholder.png", ["Video"], "examples/ref_vase.JPG", "", None, "examples/vasedeck.mp4"],
+    ["demo/seem/examples/corgi1.webp", ["Text"], "demo/seem/examples/corgi2.jpg", "The corgi.", None, None],
+    ["demo/seem/examples/river1.png", ["Text", "Audio"], "demo/seem/examples/river2.png", "The green trees.", "demo/seem/examples/river1.wav", None],
+    ["demo/seem/examples/zebras1.jpg", ["Example"], "demo/seem/examples/zebras2.jpg", "", None, None],
+    ["demo/seem/examples/fries1.png", ["Example"], "demo/seem/examples/fries2.png", "", None, None],
+    ["demo/seem/examples/placeholder.png", ["Video"], "demo/seem/examples/ref_vase.JPG", "", None, "demo/seem/examples/vasedeck.mp4"],
 ],
 title=title,
 description=description,
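
Each examples row maps positionally onto the six inputs declared above:

# [stroke image, interactive modes, referring image, referring text, referring audio, referring video]
["demo/seem/examples/corgi1.webp", ["Text"], "demo/seem/examples/corgi2.jpg", "The corgi.", None, None]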