feat: ✨ supervision visualization added #3
opened by onuralpszr
- app.py +94 -67
- requirements.txt +1 -0
app.py
CHANGED
@@ -5,6 +5,8 @@ import ast
 import time
 
 import matplotlib.pyplot as plt
+import numpy as np
+import supervision as sv
 from PIL import Image, ImageDraw, ImageFont
 
 import gradio as gr
@@ -38,22 +40,10 @@ model_moondream_name = extract_model_short_name(model_moondream_id)  # → "moondream2"
 min_pixels = 224 * 224
 max_pixels = 1024 * 1024
 processor_qwen = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
-#processor_moondream = AutoProcessor.from_pretrained("vikhyatk/moondream2", trust_remote_code=True)
-
-label2color = {}
-vivid_colors = ["#e6194b", "#3cb44b", "#0082c8", "#f58231", "#911eb4", "#46f0f0", "#f032e6", "#d2f53c", "#fabebe", "#008080", "#e6beff", "#aa6e28", "#fffac8", "#800000", "#aaffc3", "#808000", "#ffd8b1", "#000080", "#808080", "#000000"]
-
-def get_color(label, explicit_color=None):
-    if explicit_color:
-        return explicit_color
-    if label not in label2color:
-        index = len(label2color) % len(vivid_colors)
-        label2color[label] = vivid_colors[index]
-    return label2color[label]
 
 def create_annotated_image(image, json_data, height, width):
     try:
-        json_data = json_data.split(
+        json_data = json_data.split("```json")[1].split("```")[0]
         bbox_data = json.loads(json_data)
     except Exception:
         return image
@@ -62,20 +52,13 @@ def create_annotated_image(image, json_data, height, width):
     x_scale = original_width / width
     y_scale = original_height / height
 
-    try:
-        font = ImageFont.truetype("DejaVuSans-Bold.ttf", int(12 * scale_factor))
-    except:
-        font = ImageFont.load_default()
+    boxes = []
+    box_labels = []
+    points = []
+    point_labels = []
 
     for item in bbox_data:
         label = item.get("label", "")
-        color = get_color(label, item.get("color", None))
-
         if "bbox_2d" in item:
             bbox = item["bbox_2d"]
             scaled_bbox = [
@@ -84,63 +67,108 @@ def create_annotated_image(image, json_data, height, width):
                 int(bbox[2] * x_scale),
                 int(bbox[3] * y_scale)
             ]
-                (scaled_bbox[0], max(0, scaled_bbox[1] - int(15 * scale_factor))),
-                label,
-                fill=color,
-                font=font
-            )
+            boxes.append(scaled_bbox)
+            box_labels.append(label)
 
         if "point_2d" in item:
             x, y = item["point_2d"]
             scaled_x = int(x * x_scale)
             scaled_y = int(y * y_scale)
-            draw.text((scaled_x + int(6 * scale_factor), scaled_y), label, fill=color, font=font)
+            points.append([scaled_x, scaled_y])
+            point_labels.append(label)
 
+    annotated_image = np.array(image.convert("RGB"))
+
+    if boxes:
+        detections = sv.Detections(xyxy=np.array(boxes))
+        bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
+        label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
+
+        annotated_image = bounding_box_annotator.annotate(
+            scene=annotated_image,
+            detections=detections
+        )
+        annotated_image = label_annotator.annotate(
+            scene=annotated_image,
+            detections=detections,
+            labels=box_labels
+        )
+
+    if points:
+        points_array = np.array(points).reshape(1, -1, 2)
+        key_points = sv.KeyPoints(xy=points_array)
+        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.BLUE)
+        #vertex_label_annotator = sv.VertexLabelAnnotator(text_scale=0.5, border_radius=2)
+
+        annotated_image = vertex_annotator.annotate(
+            scene=annotated_image,
+            key_points=key_points
+        )
+
+        # annotated_image = vertex_label_annotator.annotate(
+        #     scene=annotated_image,
+        #     key_points=key_points,
+        #     labels=point_labels
+        # )
 
+    return Image.fromarray(annotated_image)
+
+def create_annotated_image_normalized(image, json_data, label="object"):
     if not isinstance(json_data, dict):
         return image
 
     original_width, original_height = image.size
-
-    for point in json_data.get("points", []):
-        x = int(point["x"] * original_width)
-        y = int(point["y"] * original_height)
-        radius = int(4 * scale_factor)
-        draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color, outline=color)
-
-    for item in json_data.get("objects", []):
-        x_min = int(item["x_min"] * original_width)
-        y_min = int(item["y_min"] * original_height)
-        x_max = int(item["x_max"] * original_width)
-        y_max = int(item["y_max"] * original_height)
-        draw.rectangle([x_min, y_min, x_max, y_max], outline=color, width=int(2 * scale_factor))
-        draw.text((x_min, max(0, y_min - int(15 * scale_factor))), label, fill=color, font=font)
-
+    annotated_image = np.array(image.convert("RGB"))
+
+    # Handle points for keypoint detection
+    points = []
+    if "points" in json_data:
+        for point in json_data.get("points", []):
+            x = int(point["x"] * original_width)
+            y = int(point["y"] * original_height)
+            points.append([x, y])
+
     if "reasoning" in json_data:
         for grounding in json_data["reasoning"].get("grounding", []):
            for x_norm, y_norm in grounding.get("points", []):
                x = int(x_norm * original_width)
                y = int(y_norm * original_height)
+                points.append([x, y])
+
+    if points:
+        points_array = np.array(points).reshape(1, -1, 2)
+        key_points = sv.KeyPoints(xy=points_array)
+        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.RED)
+        annotated_image = vertex_annotator.annotate(scene=annotated_image, key_points=key_points)
+
+    # Handle boxes for object detection
+    boxes = []
+    if "objects" in json_data:
+        for item in json_data.get("objects", []):
+            x_min = int(item["x_min"] * original_width)
+            y_min = int(item["y_min"] * original_height)
+            x_max = int(item["x_max"] * original_width)
+            y_max = int(item["y_max"] * original_height)
+            boxes.append([x_min, y_min, x_max, y_max])
+
+    if boxes:
+        detections = sv.Detections(xyxy=np.array(boxes))
+        bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
+        label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
+
+        labels = [label for _ in detections.xyxy]
 
+        annotated_image = bounding_box_annotator.annotate(
+            scene=annotated_image,
+            detections=detections
+        )
+        annotated_image = label_annotator.annotate(
+            scene=annotated_image,
+            detections=detections,
+            labels=labels
+        )
+
+    return Image.fromarray(annotated_image)
 
 
@@ -159,7 +187,7 @@ def detect_qwen(image, prompt):
 
     t0 = time.perf_counter()
     text = processor_qwen.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
+    image_inputs, video_inputs, _ = process_vision_info(messages)
     inputs = processor_qwen(
         text=[text],
         images=image_inputs,
@@ -197,7 +225,7 @@ def detect_moondream(image, prompt, category_input):
     output_text = model_moondream.query(image=image, question=prompt, reasoning=True)
    elapsed_ms = (time.perf_counter() - t0) * 1_000
 
-    annotated_image = create_annotated_image_normalized(image=image, json_data=output_text, label="object"
+    annotated_image = create_annotated_image_normalized(image=image, json_data=output_text, label="object")
 
     time_taken = f"**Inference time ({model_moondream_name}):** {elapsed_ms:.0f} ms"
     return annotated_image, output_text, time_taken
@@ -295,4 +323,3 @@ with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
 
 if __name__ == "__main__":
     demo.launch()
-
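For reviewers unfamiliar with supervision, here is a minimal, self-contained sketch of the annotation pattern this PR switches to: boxes go through Detections plus BoxAnnotator/LabelAnnotator, points go through KeyPoints plus VertexAnnotator. The image size, boxes, points, and labels below are invented example values, not taken from the app.

import numpy as np
import supervision as sv
from PIL import Image

# placeholder scene: a blank RGB image converted to a numpy array, as app.py does
image = Image.new("RGB", (640, 480), color="white")
scene = np.array(image)

# hypothetical detections in absolute xyxy pixel coordinates, with matching labels
boxes = np.array([[50, 60, 200, 220], [300, 100, 450, 300]])
box_labels = ["cat", "dog"]

detections = sv.Detections(xyxy=boxes)
scene = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX).annotate(scene=scene, detections=detections)
scene = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX).annotate(scene=scene, detections=detections, labels=box_labels)

# hypothetical keypoints, shape (1, n_points, 2) as expected by sv.KeyPoints
points = np.array([[[100, 120], [350, 180]]], dtype=np.float32)
scene = sv.VertexAnnotator(radius=5, color=sv.Color.BLUE).annotate(scene=scene, key_points=sv.KeyPoints(xy=points))

Image.fromarray(scene).save("annotated_example.png")

Collecting boxes and points into lists first, as the PR does, means one Detections/KeyPoints construction per image instead of drawing each item individually with PIL.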
requirements.txt
CHANGED
@@ -7,3 +7,4 @@ accelerate
 qwen-vl-utils
 torchvision
 matplotlib
+supervision>=0.26.0rc7
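As a quick usage sketch (not part of the diff): assuming the create_annotated_image_normalized function added in app.py above can be imported, a hypothetical Moondream-style payload with normalized 0-1 coordinates could be rendered like this. The coordinate values are made up for illustration, and importing app.py may pull in the full Space (models, Gradio UI) as a side effect.

from PIL import Image
from app import create_annotated_image_normalized  # assumes app.py is importable; it may load models on import

# invented payload mimicking the keys the function reads ("objects", "points",
# and optionally "reasoning" -> "grounding" -> "points"), all normalized to 0-1
example_output = {
    "objects": [{"x_min": 0.10, "y_min": 0.15, "x_max": 0.45, "y_max": 0.60}],
    "points": [{"x": 0.72, "y": 0.33}],
}

image = Image.new("RGB", (640, 480), color="white")  # placeholder image
annotated = create_annotated_image_normalized(image=image, json_data=example_output, label="object")
annotated.save("moondream_example.png")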