feat: ✨ supervision visualization added

#3
by onuralpszr
Files changed (2)
  1. app.py +94 -67
  2. requirements.txt +1 -0
app.py CHANGED
@@ -5,6 +5,8 @@ import ast
 import time

 import matplotlib.pyplot as plt
+import numpy as np
+import supervision as sv
 from PIL import Image, ImageDraw, ImageFont

 import gradio as gr
@@ -38,22 +40,10 @@ model_moondream_name = extract_model_short_name(model_moondream_id) # → "moon
 min_pixels = 224 * 224
 max_pixels = 1024 * 1024
 processor_qwen = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
-#processor_moondream = AutoProcessor.from_pretrained("vikhyatk/moondream2", trust_remote_code=True)
-
-label2color = {}
-vivid_colors = ["#e6194b", "#3cb44b", "#0082c8", "#f58231", "#911eb4", "#46f0f0", "#f032e6", "#d2f53c", "#fabebe", "#008080", "#e6beff", "#aa6e28", "#fffac8", "#800000", "#aaffc3", "#808000", "#ffd8b1", "#000080", "#808080", "#000000"]
-
-def get_color(label, explicit_color=None):
-    if explicit_color:
-        return explicit_color
-    if label not in label2color:
-        index = len(label2color) % len(vivid_colors)
-        label2color[label] = vivid_colors[index]
-    return label2color[label]

 def create_annotated_image(image, json_data, height, width):
     try:
-        json_data = json_data.split('```json')[1].split('```')[0]
+        json_data = json_data.split("```json")[1].split("```")[0]
         bbox_data = json.loads(json_data)
     except Exception:
         return image
@@ -62,20 +52,13 @@ def create_annotated_image(image, json_data, height, width):
     x_scale = original_width / width
     y_scale = original_height / height

-    scale_factor = max(original_width, original_height) / 512
-
-    draw_image = image.copy()
-    draw = ImageDraw.Draw(draw_image)
-
-    try:
-        font = ImageFont.truetype("DejaVuSans-Bold.ttf", int(12 * scale_factor))
-    except:
-        font = ImageFont.load_default()
+    boxes = []
+    box_labels = []
+    points = []
+    point_labels = []

     for item in bbox_data:
         label = item.get("label", "")
-        color = get_color(label, item.get("color", None))
-
         if "bbox_2d" in item:
             bbox = item["bbox_2d"]
             scaled_bbox = [
@@ -84,63 +67,108 @@
                 int(bbox[2] * x_scale),
                 int(bbox[3] * y_scale)
             ]
-            draw.rectangle(scaled_bbox, outline=color, width=int(2 * scale_factor))
-            draw.text(
-                (scaled_bbox[0], max(0, scaled_bbox[1] - int(15 * scale_factor))),
-                label,
-                fill=color,
-                font=font
-            )
+            boxes.append(scaled_bbox)
+            box_labels.append(label)

         if "point_2d" in item:
             x, y = item["point_2d"]
             scaled_x = int(x * x_scale)
             scaled_y = int(y * y_scale)
-            r = int(5 * scale_factor)
-            draw.ellipse((scaled_x - r, scaled_y - r, scaled_x + r, scaled_y + r), fill=color, outline=color)
-            draw.text((scaled_x + int(6 * scale_factor), scaled_y), label, fill=color, font=font)
+            points.append([scaled_x, scaled_y])
+            point_labels.append(label)

-    return draw_image
+    annotated_image = np.array(image.convert("RGB"))
+
+    if boxes:
+        detections = sv.Detections(xyxy=np.array(boxes))
+        bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
+        label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
+
+        annotated_image = bounding_box_annotator.annotate(
+            scene=annotated_image,
+            detections=detections
+        )
+        annotated_image = label_annotator.annotate(
+            scene=annotated_image,
+            detections=detections,
+            labels=box_labels
+        )
+
+    if points:
+        points_array = np.array(points).reshape(1, -1, 2)
+        key_points = sv.KeyPoints(xy=points_array)
+        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.BLUE)
+        #vertex_label_annotator = sv.VertexLabelAnnotator(text_scale=0.5, border_radius=2)
+
+        annotated_image = vertex_annotator.annotate(
+            scene=annotated_image,
+            key_points=key_points
+        )
+
+        # annotated_image = vertex_label_annotator.annotate(
+        #     scene=annotated_image,
+        #     key_points=key_points,
+        #     labels=point_labels
+        # )

-def create_annotated_image_normalized(image, json_data, label="object", explicit_color=None):
+    return Image.fromarray(annotated_image)
+
+def create_annotated_image_normalized(image, json_data, label="object"):
     if not isinstance(json_data, dict):
         return image

     original_width, original_height = image.size
-    scale_factor = max(original_width, original_height) / 512
-    draw_image = image.copy()
-    draw = ImageDraw.Draw(draw_image)
-
-    try:
-        font = ImageFont.truetype("DejaVuSans-Bold.ttf", int(12 * scale_factor))
-    except:
-        font = ImageFont.load_default()
-
-    color = get_color(label, explicit_color)
-
-    for point in json_data.get("points", []):
-        x = int(point["x"] * original_width)
-        y = int(point["y"] * original_height)
-        radius = int(4 * scale_factor)
-        draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color, outline=color)
-
-    for item in json_data.get("objects", []):
-        x_min = int(item["x_min"] * original_width)
-        y_min = int(item["y_min"] * original_height)
-        x_max = int(item["x_max"] * original_width)
-        y_max = int(item["y_max"] * original_height)
-        draw.rectangle([x_min, y_min, x_max, y_max], outline=color, width=int(2 * scale_factor))
-        draw.text((x_min, max(0, y_min - int(15 * scale_factor))), label, fill=color, font=font)
-
+    annotated_image = np.array(image.convert("RGB"))
+
+    # Handle points for keypoint detection
+    points = []
+    if "points" in json_data:
+        for point in json_data.get("points", []):
+            x = int(point["x"] * original_width)
+            y = int(point["y"] * original_height)
+            points.append([x, y])
+
     if "reasoning" in json_data:
         for grounding in json_data["reasoning"].get("grounding", []):
             for x_norm, y_norm in grounding.get("points", []):
                 x = int(x_norm * original_width)
                 y = int(y_norm * original_height)
-                radius = int(4 * scale_factor)
-                draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color, outline=color)
+                points.append([x,y])
+
+    if points:
+        points_array = np.array(points).reshape(1, -1, 2)
+        key_points = sv.KeyPoints(xy=points_array)
+        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.RED)
+        annotated_image = vertex_annotator.annotate(scene=annotated_image, key_points=key_points)
+
+    # Handle boxes for object detection
+    boxes = []
+    if "objects" in json_data:
+        for item in json_data.get("objects", []):
+            x_min = int(item["x_min"] * original_width)
+            y_min = int(item["y_min"] * original_height)
+            x_max = int(item["x_max"] * original_width)
+            y_max = int(item["y_max"] * original_height)
+            boxes.append([x_min, y_min, x_max, y_max])
+
+    if boxes:
+        detections = sv.Detections(xyxy=np.array(boxes))
+        bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
+        label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
+
+        labels = [label for _ in detections.xyxy]

-    return draw_image
+        annotated_image = bounding_box_annotator.annotate(
+            scene=annotated_image,
+            detections=detections
+        )
+        annotated_image = label_annotator.annotate(
+            scene=annotated_image,
+            detections=detections,
+            labels=labels
+        )
+
+    return Image.fromarray(annotated_image)



@@ -159,7 +187,7 @@ def detect_qwen(image, prompt):

     t0 = time.perf_counter()
     text = processor_qwen.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
+    image_inputs, video_inputs, _ = process_vision_info(messages)
     inputs = processor_qwen(
         text=[text],
         images=image_inputs,
@@ -197,7 +225,7 @@ def detect_moondream(image, prompt, category_input):
     output_text = model_moondream.query(image=image, question=prompt, reasoning=True)
     elapsed_ms = (time.perf_counter() - t0) * 1_000

-    annotated_image = create_annotated_image_normalized(image=image, json_data=output_text, label="object", explicit_color=None)
+    annotated_image = create_annotated_image_normalized(image=image, json_data=output_text, label="object")

     time_taken = f"**Inference time ({model_moondream_name}):** {elapsed_ms:.0f} ms"
     return annotated_image, output_text, time_taken
@@ -295,4 +323,3 @@ with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:

 if __name__ == "__main__":
     demo.launch()
-
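For reviewers who have not used supervision before, here is a minimal, self-contained sketch of the annotation path this diff switches to. The image, boxes, points, and labels below are invented example data, not taken from the app; only the supervision calls mirror the ones added above.

import numpy as np
import supervision as sv
from PIL import Image

# Placeholder scene: supervision annotators draw on numpy arrays, so the PIL
# image is converted first, just like the new create_annotated_image does.
image = Image.new("RGB", (640, 480), color="white")
scene = np.array(image)

# Boxes are absolute xyxy pixel coordinates; labels are matched to boxes by index.
detections = sv.Detections(xyxy=np.array([[100, 100, 300, 250], [350, 120, 560, 300]]))
box_labels = ["cat", "dog"]
scene = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX).annotate(scene=scene, detections=detections)
scene = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX).annotate(scene=scene, detections=detections, labels=box_labels)

# Points are wrapped into a single KeyPoints group of shape (1, num_points, 2).
key_points = sv.KeyPoints(xy=np.array([[120, 200], [400, 220]]).reshape(1, -1, 2))
scene = sv.VertexAnnotator(radius=5, color=sv.Color.BLUE).annotate(scene=scene, key_points=key_points)

annotated = Image.fromarray(scene)  # back to PIL for Gradio display

sv.ColorLookup.INDEX assigns a distinct palette color per detection index, which is what replaces the removed label2color/get_color helper.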
 
requirements.txt CHANGED
@@ -7,3 +7,4 @@ accelerate
 qwen-vl-utils
 torchvision
 matplotlib
+supervision>=0.26.0rc7
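The new dependency is pinned to a pre-release. A small sanity check (a sketch, assuming the standard supervision package layout) to confirm which build actually resolved inside the Space:

import supervision as sv
print(sv.__version__)  # should satisfy the supervision>=0.26.0rc7 pin above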