feat: ✨ supervision visualization added

#3
by onuralpszr
Files changed (2)
  1. app.py +94 -67
  2. requirements.txt +1 -0
app.py CHANGED
@@ -5,6 +5,8 @@ import ast
 import time

 import matplotlib.pyplot as plt
+import numpy as np
+import supervision as sv
 from PIL import Image, ImageDraw, ImageFont

 import gradio as gr
@@ -38,22 +40,10 @@ model_moondream_name = extract_model_short_name(model_moondream_id) # → "moon
 min_pixels = 224 * 224
 max_pixels = 1024 * 1024
 processor_qwen = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
-#processor_moondream = AutoProcessor.from_pretrained("vikhyatk/moondream2", trust_remote_code=True)
-
-label2color = {}
-vivid_colors = ["#e6194b", "#3cb44b", "#0082c8", "#f58231", "#911eb4", "#46f0f0", "#f032e6", "#d2f53c", "#fabebe", "#008080", "#e6beff", "#aa6e28", "#fffac8", "#800000", "#aaffc3", "#808000", "#ffd8b1", "#000080", "#808080", "#000000"]
-
-def get_color(label, explicit_color=None):
-    if explicit_color:
-        return explicit_color
-    if label not in label2color:
-        index = len(label2color) % len(vivid_colors)
-        label2color[label] = vivid_colors[index]
-    return label2color[label]

 def create_annotated_image(image, json_data, height, width):
     try:
-        json_data = json_data.split('```json')[1].split('```')[0]
+        json_data = json_data.split("```json")[1].split("```")[0]
         bbox_data = json.loads(json_data)
     except Exception:
         return image
@@ -62,20 +52,13 @@ def create_annotated_image(image, json_data, height, width):
     x_scale = original_width / width
     y_scale = original_height / height

-    scale_factor = max(original_width, original_height) / 512
-
-    draw_image = image.copy()
-    draw = ImageDraw.Draw(draw_image)
-
-    try:
-        font = ImageFont.truetype("DejaVuSans-Bold.ttf", int(12 * scale_factor))
-    except:
-        font = ImageFont.load_default()
+    boxes = []
+    box_labels = []
+    points = []
+    point_labels = []

     for item in bbox_data:
         label = item.get("label", "")
-        color = get_color(label, item.get("color", None))
-
         if "bbox_2d" in item:
             bbox = item["bbox_2d"]
             scaled_bbox = [
@@ -84,63 +67,108 @@
                 int(bbox[2] * x_scale),
                 int(bbox[3] * y_scale)
             ]
-            draw.rectangle(scaled_bbox, outline=color, width=int(2 * scale_factor))
-            draw.text(
-                (scaled_bbox[0], max(0, scaled_bbox[1] - int(15 * scale_factor))),
-                label,
-                fill=color,
-                font=font
-            )
+            boxes.append(scaled_bbox)
+            box_labels.append(label)

         if "point_2d" in item:
             x, y = item["point_2d"]
             scaled_x = int(x * x_scale)
             scaled_y = int(y * y_scale)
-            r = int(5 * scale_factor)
-            draw.ellipse((scaled_x - r, scaled_y - r, scaled_x + r, scaled_y + r), fill=color, outline=color)
-            draw.text((scaled_x + int(6 * scale_factor), scaled_y), label, fill=color, font=font)
+            points.append([scaled_x, scaled_y])
+            point_labels.append(label)

-    return draw_image
+    annotated_image = np.array(image.convert("RGB"))
+
+    if boxes:
+        detections = sv.Detections(xyxy=np.array(boxes))
+        bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
+        label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
+
+        annotated_image = bounding_box_annotator.annotate(
+            scene=annotated_image,
+            detections=detections
+        )
+        annotated_image = label_annotator.annotate(
+            scene=annotated_image,
+            detections=detections,
+            labels=box_labels
+        )
+
+    if points:
+        points_array = np.array(points).reshape(1, -1, 2)
+        key_points = sv.KeyPoints(xy=points_array)
+        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.BLUE)
+        #vertex_label_annotator = sv.VertexLabelAnnotator(text_scale=0.5, border_radius=2)
+
+        annotated_image = vertex_annotator.annotate(
+            scene=annotated_image,
+            key_points=key_points
+        )
+
+        # annotated_image = vertex_label_annotator.annotate(
+        #     scene=annotated_image,
+        #     key_points=key_points,
+        #     labels=point_labels
+        # )

-def create_annotated_image_normalized(image, json_data, label="object", explicit_color=None):
+    return Image.fromarray(annotated_image)
+
+def create_annotated_image_normalized(image, json_data, label="object"):
     if not isinstance(json_data, dict):
         return image

     original_width, original_height = image.size
-    scale_factor = max(original_width, original_height) / 512
-    draw_image = image.copy()
-    draw = ImageDraw.Draw(draw_image)
-
-    try:
-        font = ImageFont.truetype("DejaVuSans-Bold.ttf", int(12 * scale_factor))
-    except:
-        font = ImageFont.load_default()
-
-    color = get_color(label, explicit_color)
-
-    for point in json_data.get("points", []):
-        x = int(point["x"] * original_width)
-        y = int(point["y"] * original_height)
-        radius = int(4 * scale_factor)
-        draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color, outline=color)
-
-    for item in json_data.get("objects", []):
-        x_min = int(item["x_min"] * original_width)
-        y_min = int(item["y_min"] * original_height)
-        x_max = int(item["x_max"] * original_width)
-        y_max = int(item["y_max"] * original_height)
-        draw.rectangle([x_min, y_min, x_max, y_max], outline=color, width=int(2 * scale_factor))
-        draw.text((x_min, max(0, y_min - int(15 * scale_factor))), label, fill=color, font=font)
-
+    annotated_image = np.array(image.convert("RGB"))
+
+    # Handle points for keypoint detection
+    points = []
+    if "points" in json_data:
+        for point in json_data.get("points", []):
+            x = int(point["x"] * original_width)
+            y = int(point["y"] * original_height)
+            points.append([x, y])
+
     if "reasoning" in json_data:
         for grounding in json_data["reasoning"].get("grounding", []):
             for x_norm, y_norm in grounding.get("points", []):
                 x = int(x_norm * original_width)
                 y = int(y_norm * original_height)
-                radius = int(4 * scale_factor)
-                draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color, outline=color)
+                points.append([x,y])
+
+    if points:
+        points_array = np.array(points).reshape(1, -1, 2)
+        key_points = sv.KeyPoints(xy=points_array)
+        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.RED)
+        annotated_image = vertex_annotator.annotate(scene=annotated_image, key_points=key_points)
+
+    # Handle boxes for object detection
+    boxes = []
+    if "objects" in json_data:
+        for item in json_data.get("objects", []):
+            x_min = int(item["x_min"] * original_width)
+            y_min = int(item["y_min"] * original_height)
+            x_max = int(item["x_max"] * original_width)
+            y_max = int(item["y_max"] * original_height)
+            boxes.append([x_min, y_min, x_max, y_max])
+
+    if boxes:
+        detections = sv.Detections(xyxy=np.array(boxes))
+        bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
+        label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
+
+        labels = [label for _ in detections.xyxy]

-    return draw_image
+        annotated_image = bounding_box_annotator.annotate(
+            scene=annotated_image,
+            detections=detections
+        )
+        annotated_image = label_annotator.annotate(
+            scene=annotated_image,
+            detections=detections,
+            labels=labels
+        )
+
+    return Image.fromarray(annotated_image)



@@ -159,7 +187,7 @@ def detect_qwen(image, prompt):

     t0 = time.perf_counter()
     text = processor_qwen.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
+    image_inputs, video_inputs, _ = process_vision_info(messages)
     inputs = processor_qwen(
         text=[text],
         images=image_inputs,
@@ -197,7 +225,7 @@ def detect_moondream(image, prompt, category_input):
     output_text = model_moondream.query(image=image, question=prompt, reasoning=True)
     elapsed_ms = (time.perf_counter() - t0) * 1_000

-    annotated_image = create_annotated_image_normalized(image=image, json_data=output_text, label="object", explicit_color=None)
+    annotated_image = create_annotated_image_normalized(image=image, json_data=output_text, label="object")

     time_taken = f"**Inference time ({model_moondream_name}):** {elapsed_ms:.0f} ms"
     return annotated_image, output_text, time_taken
@@ -295,4 +323,3 @@ with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:

 if __name__ == "__main__":
     demo.launch()
-
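For reviewers who have not used supervision before, here is a minimal, self-contained sketch of the annotation path this diff switches to. The image, boxes, points, and labels below are invented example data, not taken from the app; only the supervision calls mirror the ones added above.

import numpy as np
import supervision as sv
from PIL import Image

# Placeholder scene: supervision annotators draw on numpy arrays, so the PIL
# image is converted first, just like the new create_annotated_image does.
image = Image.new("RGB", (640, 480), color="white")
scene = np.array(image)

# Boxes are absolute xyxy pixel coordinates; labels are matched to boxes by index.
detections = sv.Detections(xyxy=np.array([[100, 100, 300, 250], [350, 120, 560, 300]]))
box_labels = ["cat", "dog"]
scene = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX).annotate(scene=scene, detections=detections)
scene = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX).annotate(scene=scene, detections=detections, labels=box_labels)

# Points are wrapped into a single KeyPoints group of shape (1, num_points, 2).
key_points = sv.KeyPoints(xy=np.array([[120, 200], [400, 220]]).reshape(1, -1, 2))
scene = sv.VertexAnnotator(radius=5, color=sv.Color.BLUE).annotate(scene=scene, key_points=key_points)

annotated = Image.fromarray(scene)  # back to PIL for Gradio display

sv.ColorLookup.INDEX assigns a distinct palette color per detection index, which is what replaces the removed label2color/get_color helper.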
 
requirements.txt CHANGED
@@ -7,3 +7,4 @@ accelerate
 qwen-vl-utils
 torchvision
 matplotlib
+supervision>=0.26.0rc7
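The new dependency is pinned to a pre-release. A small sanity check (a sketch, assuming the standard supervision package layout) to confirm which build actually resolved inside the Space:

import supervision as sv
print(sv.__version__)  # should satisfy the supervision>=0.26.0rc7 pin above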