# sample_inference.py
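"""Run person segmentation on a single image with a YOLO12l-seg model.

Example usage (file paths are illustrative):

    python sample_inference.py --image photo.jpg --conf 0.4 \
        --output annotated.jpg --json people.json
"""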
import argparse
import json

import cv2
import numpy as np
import torch
from PIL import Image
from ultralytics import YOLO


def main():
    parser = argparse.ArgumentParser(description='Run person segmentation with YOLO12l-seg model')
    parser.add_argument('--model', type=str, default='yolo12l-person-seg.pt', help='Model path')
    parser.add_argument('--image', type=str, required=True, help='Image path for inference')
    parser.add_argument('--output', type=str, default='output.jpg', help='Output visualization image path')
    parser.add_argument('--json', type=str, default='detections.json', help='JSON output file for detection data')
    parser.add_argument('--conf', type=float, default=0.5, help='Confidence threshold')
    args = parser.parse_args()
    # Load the model
    model = YOLO(args.model)

    # Move to the best available device
    if torch.cuda.is_available():
        print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
        model.to('cuda')
        device = 'cuda'
        use_half = True
    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        print("Using Apple Silicon MPS")
        model.to('mps')
        device = 'mps'
        use_half = False
    else:
        print("Using CPU")
        device = None
        use_half = False
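    # Half precision (FP16) is requested only on CUDA above: it roughly halves
    # memory use and is typically faster on NVIDIA GPUs, while FP16 inference
    # is not reliably supported on the MPS or CPU backends, so they run in FP32.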
    # Load and check the input image
    try:
        img = Image.open(args.image)
        img_width, img_height = img.size
        print(f"Image dimensions: {img_width}x{img_height}")
    except Exception as e:
        print(f"Error opening image: {e}")
        return
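    # PIL is used above only to read the image dimensions; the same file is
    # re-read with OpenCV below, since the drawing calls expect a BGR ndarray.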
    # Run inference (classes=0 restricts results to the COCO 'person' class)
    if device == 'cuda':
        results = model(args.image, classes=0, conf=args.conf, device=device, half=use_half)
    elif device == 'mps':
        results = model(args.image, classes=0, conf=args.conf, device=device)
    else:
        results = model(args.image, classes=0, conf=args.conf)
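    # model(...) returns a list of Ultralytics Results objects, one per input
    # image; for a single image path the list contains exactly one entry.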
    # Process results
    detections = []
    visualization_img = cv2.imread(args.image)
    if visualization_img is None:
        print(f"Error: OpenCV could not read {args.image}")
        return

    for result in results:
        masks = result.masks
        boxes = result.boxes

        if boxes is None or len(boxes) == 0:
            print("No people detected in the image")
            return

        person_count = len(boxes)
        print(f"Detected {person_count} people")
        # Visualize and extract data
        if masks is not None:
            for i, (mask, box) in enumerate(zip(masks.xy, boxes)):
                confidence = float(box.conf[0])
                x1, y1, x2, y2 = map(int, box.xyxy[0])

                # Extract mask polygon points
                polygon_points = mask.tolist()
                if not polygon_points:
                    # Skip degenerate masks with no polygon vertices
                    continue

                # Mask extent as a fraction (0-1) of the image dimensions
                x_coords = [point[0] for point in polygon_points]
                y_coords = [point[1] for point in polygon_points]
                min_x, max_x = min(x_coords), max(x_coords)
                min_y, max_y = min(y_coords), max(y_coords)
                width_pct = (max_x - min_x) / img_width
                height_pct = (max_y - min_y) / img_height
                # Create detection record
                detection = {
                    "id": i,
                    "confidence": confidence,
                    "box": [x1, y1, x2, y2],
                    "points": polygon_points,
                    "width_pct": width_pct,
                    "height_pct": height_pct,
                }
                detections.append(detection)

                # Draw bounding box and confidence label
                cv2.rectangle(visualization_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(visualization_img, f'Person: {confidence:.2f}', (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                # Draw segmentation mask
                color_mask = np.zeros_like(visualization_img, dtype=np.uint8)
                mask_points = np.array(polygon_points, dtype=np.int32)
                cv2.fillPoly(color_mask, [mask_points], (0, 0, 255))

                # Blend the mask with the original image
                visualization_img = cv2.addWeighted(visualization_img, 1.0, color_mask, 0.5, 0)
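                # addWeighted computes 1.0*image + 0.5*color_mask (gamma 0), so
                # masked pixels are pushed toward red rather than alpha-composited,
                # and overlapping masks accumulate across loop iterations.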
    # Save visualization
    cv2.imwrite(args.output, visualization_img)
    print(f"Visualization saved to {args.output}")

    # Save detection data to JSON
    with open(args.json, 'w') as f:
        json.dump({
            "person_count": person_count,
            "detections": detections
        }, f, indent=4)
    print(f"Detection data saved to {args.json}")


if __name__ == "__main__":
    main()