Spaces:

SkalskiP
/

YOLO-World

Runtime error

App Files Files Community

SkalskiP commited on Feb 17, 2024

Commit

d336c38

1 Parent(s): a1ddcc8

Initial video processing

Browse files

Files changed (4) hide show

app.py +171 -53
requirements.txt +2 -1
utils/efficient_sam.py +14 -0
utils/video.py +27 -0

app.py CHANGED Viewed

@@ -1,18 +1,21 @@
 from typing import List
 import cv2
 import gradio as gr
 import numpy as np
 import supervision as sv
 import torch
 from inference.models import YOLOWorld
-from utils.efficient_sam import load, inference_with_box
 MARKDOWN = """
 # YOLO-World + EfficientSAM 🔥
-This is a demo of zero-shot instance segmentation using
 [YOLO-World](https://github.com/AILab-CVC/YOLO-World) and
 [EfficientSAM](https://github.com/yformer/EfficientSAM).
@@ -20,9 +23,15 @@ Powered by Roboflow [Inference](https://github.com/roboflow/inference) and
 [Supervision](https://github.com/roboflow/supervision).
 """
-EXAMPLES = [
     ['https://media.roboflow.com/dog.jpeg', 'dog, eye, nose, tongue, car', 0.005, 0.1, True, False, False],
 ]
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 EFFICIENT_SAM_MODEL = load(device=DEVICE)
@@ -33,10 +42,34 @@ MASK_ANNOTATOR = sv.MaskAnnotator()
 LABEL_ANNOTATOR = sv.LabelAnnotator()
 def process_categories(categories: str) -> List[str]:
     return [category.strip() for category in categories.split(',')]
 def process_image(
     input_image: np.ndarray,
     categories: str,
@@ -52,31 +85,69 @@ def process_image(
     detections = sv.Detections.from_inference(results)
     detections = detections.with_nms(
         class_agnostic=with_class_agnostic_nms,
-        threshold=iou_threshold)
     if with_segmentation:
-        masks = []
-        for [x_min, y_min, x_max, y_max] in detections.xyxy:
-            box = np.array([[x_min, y_min], [x_max, y_max]])
-            mask = inference_with_box(input_image, box, EFFICIENT_SAM_MODEL, DEVICE)
-            masks.append(mask)
-        detections.mask = np.array(masks)
-    labels = [
-        (
-            f"{categories[class_id]}: {confidence:.2f}"
-            if with_confidence
-            else f"{categories[class_id]}"
         )
-        for class_id, confidence in
-        zip(detections.class_id, detections.confidence)
-    ]
-    output_image = input_image.copy()
-    output_image = cv2.cvtColor(output_image, cv2.COLOR_RGB2BGR)
-    output_image = MASK_ANNOTATOR.annotate(output_image, detections)
-    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
-    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
-    output_image = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
-    return output_image
 confidence_threshold_component = gr.Slider(
@@ -140,32 +211,80 @@ with gr.Blocks() as demo:
             with_segmentation_component.render()
             with_confidence_component.render()
             with_class_agnostic_nms_component.render()
-    with gr.Row():
-        input_image_component = gr.Image(
-            type='numpy',
-            label='Input Image'
-        )
-        output_image_component = gr.Image(
-            type='numpy',
-            label='Output Image'
-        )
-    with gr.Row():
-        categories_text_component = gr.Textbox(
-            label='Categories',
-            placeholder='comma separated list of categories',
-            scale=7
         )
-        submit_button_component = gr.Button(
-            value='Submit',
-            scale=1,
-            variant='primary'
         )
-    gr.Examples(
         fn=process_image,
-        examples=EXAMPLES,
         inputs=[
             input_image_component,
-            categories_text_component,
             confidence_threshold_component,
             iou_threshold_component,
             with_segmentation_component,
@@ -174,19 +293,18 @@ with gr.Blocks() as demo:
         ],
         outputs=output_image_component
     )
-    submit_button_component.click(
-        fn=process_image,
         inputs=[
-            input_image_component,
-            categories_text_component,
             confidence_threshold_component,
             iou_threshold_component,
             with_segmentation_component,
             with_confidence_component,
             with_class_agnostic_nms_component
         ],
-        outputs=output_image_component
     )
 demo.launch(debug=False, show_error=True)

 from typing import List
+import os
 import cv2
 import gradio as gr
 import numpy as np
 import supervision as sv
 import torch
+from tqdm import tqdm
 from inference.models import YOLOWorld
+from utils.efficient_sam import load, inference_with_boxes
+from utils.video import generate_file_name, calculate_end_frame_index, create_directory
 MARKDOWN = """
 # YOLO-World + EfficientSAM 🔥
+This is a demo of zero-shot object detection and instance segmentation using
 [YOLO-World](https://github.com/AILab-CVC/YOLO-World) and
 [EfficientSAM](https://github.com/yformer/EfficientSAM).
 [Supervision](https://github.com/roboflow/supervision).
 """
+RESULTS = "results"
+IMAGE_EXAMPLES = [
     ['https://media.roboflow.com/dog.jpeg', 'dog, eye, nose, tongue, car', 0.005, 0.1, True, False, False],
 ]
+VIDEO_EXAMPLES = [
+    ['https://media.roboflow.com/supervision/video-examples/croissant-1280x720.mp4', 'croissant', 0.01, 0.2, False, False, False],
+    ['https://media.roboflow.com/supervision/video-examples/suitcases-1280x720.mp4', 'suitcase', 0.1, 0.2, False, False, False],
+]
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 EFFICIENT_SAM_MODEL = load(device=DEVICE)
 LABEL_ANNOTATOR = sv.LabelAnnotator()
+create_directory(directory_path=RESULTS)
 def process_categories(categories: str) -> List[str]:
     return [category.strip() for category in categories.split(',')]
+def annotate_image(
+    input_image: np.ndarray,
+    detections: sv.Detections,
+    categories: List[str],
+    with_confidence: bool = False,
+) -> np.ndarray:
+    labels = [
+        (
+            f"{categories[class_id]}: {confidence:.3f}"
+            if with_confidence
+            else f"{categories[class_id]}"
+        )
+        for class_id, confidence in
+        zip(detections.class_id, detections.confidence)
+    ]
+    output_image = MASK_ANNOTATOR.annotate(input_image, detections)
+    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
+    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
+    return output_image
 def process_image(
     input_image: np.ndarray,
     categories: str,
     detections = sv.Detections.from_inference(results)
     detections = detections.with_nms(
         class_agnostic=with_class_agnostic_nms,
+        threshold=iou_threshold
+    )
     if with_segmentation:
+        detections.mask = inference_with_boxes(
+            image=input_image,
+            xyxy=detections.xyxy,
+            model=EFFICIENT_SAM_MODEL,
+            device=DEVICE
         )
+    output_image = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
+    output_image = annotate_image(
+        input_image=output_image,
+        detections=detections,
+        categories=categories,
+        with_confidence=with_confidence
+    )
+    return cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
+def process_video(
+    input_video: str,
+    categories: str,
+    confidence_threshold: float = 0.3,
+    iou_threshold: float = 0.5,
+    with_segmentation: bool = True,
+    with_confidence: bool = False,
+    with_class_agnostic_nms: bool = False,
+    progress=gr.Progress(track_tqdm=True)
+) -> str:
+    categories = process_categories(categories)
+    YOLO_WORLD_MODEL.set_classes(categories)
+    video_info = sv.VideoInfo.from_video_path(input_video)
+    total = calculate_end_frame_index(input_video)
+    frame_generator = sv.get_video_frames_generator(
+        source_path=input_video,
+        end=total
+    )
+    result_file_name = generate_file_name(extension="mp4")
+    result_file_path = os.path.join(RESULTS, result_file_name)
+    with sv.VideoSink(result_file_path, video_info=video_info) as sink:
+        for _ in tqdm(range(total), desc="Processing video..."):
+            frame = next(frame_generator)
+            results = YOLO_WORLD_MODEL.infer(frame, confidence=confidence_threshold)
+            detections = sv.Detections.from_inference(results)
+            detections = detections.with_nms(
+                class_agnostic=with_class_agnostic_nms,
+                threshold=iou_threshold
+            )
+            if with_segmentation:
+                detections.mask = inference_with_boxes(
+                    image=frame,
+                    xyxy=detections.xyxy,
+                    model=EFFICIENT_SAM_MODEL,
+                    device=DEVICE
+            )
+            frame = annotate_image(
+                input_image=frame,
+                detections=detections,
+                categories=categories,
+                with_confidence=with_confidence
+            )
+            sink.write_frame(frame)
+    return result_file_path
 confidence_threshold_component = gr.Slider(
             with_segmentation_component.render()
             with_confidence_component.render()
             with_class_agnostic_nms_component.render()
+    with gr.Tab(label="Image"):
+        with gr.Row():
+            input_image_component = gr.Image(
+                type='numpy',
+                label='Input Image'
+            )
+            output_image_component = gr.Image(
+                type='numpy',
+                label='Output Image'
+            )
+        with gr.Row():
+            image_categories_text_component = gr.Textbox(
+                label='Categories',
+                placeholder='comma separated list of categories',
+                scale=7
+            )
+            image_submit_button_component = gr.Button(
+                value='Submit',
+                scale=1,
+                variant='primary'
+            )
+        gr.Examples(
+            fn=process_image,
+            examples=IMAGE_EXAMPLES,
+            inputs=[
+                input_image_component,
+                image_categories_text_component,
+                confidence_threshold_component,
+                iou_threshold_component,
+                with_segmentation_component,
+                with_confidence_component,
+                with_class_agnostic_nms_component
+            ],
+            outputs=output_image_component
         )
+    with gr.Tab(label="Video"):
+        with gr.Row():
+            input_video_component = gr.Video(
+                label='Input Video'
+            )
+            output_video_component = gr.Video(
+                label='Output Video'
+            )
+        with gr.Row():
+            video_categories_text_component = gr.Textbox(
+                label='Categories',
+                placeholder='comma separated list of categories',
+                scale=7
+            )
+            video_submit_button_component = gr.Button(
+                value='Submit',
+                scale=1,
+                variant='primary'
+            )
+        gr.Examples(
+            fn=process_video,
+            examples=VIDEO_EXAMPLES,
+            inputs=[
+                input_video_component,
+                video_categories_text_component,
+                confidence_threshold_component,
+                iou_threshold_component,
+                with_segmentation_component,
+                with_confidence_component,
+                with_class_agnostic_nms_component
+            ],
+            outputs=output_image_component
         )
+    image_submit_button_component.click(
         fn=process_image,
         inputs=[
             input_image_component,
+            image_categories_text_component,
             confidence_threshold_component,
             iou_threshold_component,
             with_segmentation_component,
         ],
         outputs=output_image_component
     )
+    video_submit_button_component.click(
+        fn=process_video,
         inputs=[
+            input_video_component,
+            video_categories_text_component,
             confidence_threshold_component,
             iou_threshold_component,
             with_segmentation_component,
             with_confidence_component,
             with_class_agnostic_nms_component
         ],
+        outputs=output_video_component
     )
 demo.launch(debug=False, show_error=True)

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 inference-gpu[yolo-world]==0.9.13
 supervision==0.19.0rc3
-gradio==4.19.0

 inference-gpu[yolo-world]==0.9.13
 supervision==0.19.0rc3
+gradio==4.19.0
+tqdm==4.66.2

utils/efficient_sam.py CHANGED Viewed

@@ -45,3 +45,17 @@ def inference_with_box(
             max_predicted_iou = curr_predicted_iou
             selected_mask_using_predicted_iou = all_masks[m]
     return selected_mask_using_predicted_iou

             max_predicted_iou = curr_predicted_iou
             selected_mask_using_predicted_iou = all_masks[m]
     return selected_mask_using_predicted_iou
+def inference_with_boxes(
+    image: np.ndarray,
+    xyxy: np.ndarray,
+    model: torch.jit.ScriptModule,
+    device: torch.device
+) -> np.ndarray:
+    masks = []
+    for [x_min, y_min, x_max, y_max] in xyxy:
+        box = np.array([[x_min, y_min], [x_max, y_max]])
+        mask = inference_with_box(image, box, model, device)
+        masks.append(mask)
+    return np.array(masks)

utils/video.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import os
+import datetime
+import uuid
+import supervision as sv
+MAX_VIDEO_LENGTH_SEC = 3
+def generate_file_name(extension="mp4"):
+    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+    unique_id = uuid.uuid4()
+    return f"{current_datetime}_{unique_id}.{extension}"
+def calculate_end_frame_index(source_video_path: str) -> int:
+    video_info = sv.VideoInfo.from_video_path(source_video_path)
+    return min(
+        video_info.total_frames,
+        video_info.fps * MAX_VIDEO_LENGTH_SEC
+    )
+def create_directory(directory_path: str) -> None:
+    if not os.path.exists(directory_path):
+        os.makedirs(directory_path)