#!/usr/bin/env python3
"""
HarpoonNet Webcam Detection Script
Real-time drone detection using webcam feed
"""
import cv2
import torch
import numpy as np
import argparse
from pathlib import Path
from harpoon_modular import HarpoonNetModular
from PIL import Image
from torchvision import transforms
import time

class WebcamDetector:
    def __init__(self, model_path, conf_thresh=0.6, nms_thresh=0.4, camera_id=0, flip_frame=True):
        """Initialize the webcam detector"""
        self.model_path = model_path
        self.conf_thresh = conf_thresh
        self.nms_thresh = nms_thresh
        self.flip_frame = flip_frame
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.debug_view = False  # Toggle with 'd' key
        self.last_time = None

        # ImageNet normalization, built once and reused for every frame
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        # Load model
        print(f"🔄 Loading model from {model_path}")
        checkpoint = torch.load(model_path, map_location=self.device)
        self.model = HarpoonNetModular(num_classes=1, num_anchors=3)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.to(self.device)
        self.model.eval()
        print("✅ Model loaded successfully")

        # Initialize webcam
        self.cap = cv2.VideoCapture(camera_id)
        if not self.cap.isOpened():
            raise RuntimeError(f"Failed to open camera {camera_id}")

        # Get camera properties
        self.frame_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.frame_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print(f"📹 Camera resolution: {self.frame_width}x{self.frame_height}")
        print(f"🎯 Initial confidence threshold: {self.conf_thresh:.2f}")
    def enhance_frame(self, frame):
        """Enhance frame for better detection"""
        # Convert to LAB color space and apply CLAHE to the L channel
        lab = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)
        l, a, b = cv2.split(lab)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        cl = clahe.apply(l)

        # Merge channels and convert back to BGR
        limg = cv2.merge((cl, a, b))
        enhanced = cv2.cvtColor(limg, cv2.COLOR_LAB2BGR)

        # Increase contrast and brightness
        alpha = 1.3  # Contrast control
        beta = 10    # Brightness control
        enhanced = cv2.convertScaleAbs(enhanced, alpha=alpha, beta=beta)
        return enhanced
    def preprocess_frame(self, frame):
        """Preprocess frame for model input"""
        # Get original dimensions
        orig_h, orig_w = frame.shape[:2]

        # Calculate scaling to maintain aspect ratio
        input_size = 320
        scale = min(input_size / orig_w, input_size / orig_h)
        new_w = int(orig_w * scale)
        new_h = int(orig_h * scale)

        # Resize maintaining aspect ratio
        resized = cv2.resize(frame, (new_w, new_h))

        # Create a square canvas and center the resized image on it (letterboxing)
        square = np.zeros((input_size, input_size, 3), dtype=np.uint8)
        x_offset = (input_size - new_w) // 2
        y_offset = (input_size - new_h) // 2
        square[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized

        # Convert BGR -> RGB and wrap as a PIL image for the torchvision transforms
        img = Image.fromarray(cv2.cvtColor(square, cv2.COLOR_BGR2RGB))

        # Normalize and add batch dimension
        img = self.transform(img).unsqueeze(0)
        return img, (scale, x_offset, y_offset)
    def postprocess_detections(self, detections, preprocess_info, frame_shape):
        """Convert model-space coordinates back to original frame coordinates"""
        scale, x_offset, y_offset = preprocess_info
        orig_h, orig_w = frame_shape[:2]
        processed_detections = []

        # Boxes and scores are assumed to be parallel lists from decode_predictions
        boxes = detections[0]['boxes']
        scores = detections[0]['scores']
        for det, score in zip(boxes, scores):
            if len(det) != 4:
                continue
            # Remove padding offset and undo the letterbox scaling
            x1 = (det[0] - x_offset) / scale
            y1 = (det[1] - y_offset) / scale
            x2 = (det[2] - x_offset) / scale
            y2 = (det[3] - y_offset) / scale

            # Clip to frame boundaries
            x1 = np.clip(x1, 0, orig_w)
            y1 = np.clip(y1, 0, orig_h)
            x2 = np.clip(x2, 0, orig_w)
            y2 = np.clip(y2, 0, orig_h)

            # Only keep boxes with a reasonable size
            w = x2 - x1
            h = y2 - y1
            if 10 < w < orig_w * 0.9 and 10 < h < orig_h * 0.9:
                processed_detections.append({
                    'box': [int(x1), int(y1), int(x2), int(y2)],
                    'confidence': float(score)
                })
        return processed_detections
    def process_frame(self, frame):
        """Process a single frame"""
        # Preprocess
        img, preprocess_info = self.preprocess_frame(frame)
        img = img.to(self.device)

        # Run inference
        with torch.no_grad():
            predictions = self.model(img)
            detections = self.model.decode_predictions(predictions, confidence_threshold=self.conf_thresh)

        # Postprocess into per-box dicts, each carrying its own confidence score
        results = self.postprocess_detections(detections, preprocess_info, frame.shape)

        # Draw detections
        frame_with_boxes = frame.copy()
        for det in results:
            x1, y1, x2, y2 = det['box']
            conf = det['confidence']
            cv2.rectangle(frame_with_boxes, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame_with_boxes, f"Drone: {conf:.2f}",
                        (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Add FPS, detection count, and current threshold
        fps = 1.0 / (time.time() - self.last_time) if self.last_time else 0.0
        self.last_time = time.time()
        cv2.putText(frame_with_boxes, f"FPS: {fps:.1f} | Detected: {len(results)} | Conf: {self.conf_thresh:.2f}",
                    (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
        return frame_with_boxes
    def draw_detections(self, frame, detections):
        """Draw detection boxes and labels on frame (alternative helper; not called by process_frame)"""
        for det in detections:
            x1, y1, x2, y2 = det['box']
            conf = det['confidence']

            # Draw box, brighter green for higher confidence
            color = (0, int(255 * conf), 0)
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

            # Draw label with confidence
            label = f"Drone: {conf:.2f}"
            cv2.putText(frame, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        # Draw FPS and detection count
        if hasattr(self, 'fps'):
            cv2.putText(frame, f"FPS: {self.fps:.1f} | Detected: {len(detections)}",
                        (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Draw debug info
        if self.debug_view:
            cv2.putText(frame, "Debug View: ON", (10, 60),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(frame, f"Conf Thresh: {self.conf_thresh:.2f}", (10, 90),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            # Draw center crosshair
            h, w = frame.shape[:2]
            cv2.line(frame, (w // 2, 0), (w // 2, h), (0, 0, 255), 1)
            cv2.line(frame, (0, h // 2), (w, h // 2), (0, 0, 255), 1)
        return frame
    def run(self):
        """Run webcam detection"""
        print("🎥 Starting detection...")
        print("Controls:")
        print("  'q': Quit")
        print("  'd': Toggle debug view")
        print("  '+': Increase confidence threshold")
        print("  '-': Decrease confidence threshold")
        self.last_time = time.time()

        while True:
            ret, frame = self.cap.read()
            if not ret:
                break

            # Mirror the frame horizontally if requested (more natural for webcams)
            if self.flip_frame:
                frame = cv2.flip(frame, 1)

            # Process frame
            frame_with_boxes = self.process_frame(frame)

            # Show frame
            cv2.imshow('HarpoonNet Detection', frame_with_boxes)

            # Handle key presses
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
            elif key == ord('d'):
                self.debug_view = not self.debug_view
            elif key == ord('+'):
                self.conf_thresh = min(1.0, self.conf_thresh + 0.05)
                print(f"Confidence threshold: {self.conf_thresh:.2f}")
            elif key == ord('-'):
                self.conf_thresh = max(0.05, self.conf_thresh - 0.05)
                print(f"Confidence threshold: {self.conf_thresh:.2f}")

        self.cap.release()
        cv2.destroyAllWindows()
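
# Minimal programmatic usage sketch (the checkpoint path is a placeholder):
#   detector = WebcamDetector("path/to/harpoon_checkpoint.pth", conf_thresh=0.5, camera_id=0)
#   detector.run()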

def main():
    parser = argparse.ArgumentParser(description='HarpoonNet Webcam Detection')
    parser.add_argument('--model', type=str, required=True,
                        help='Path to model checkpoint')
    parser.add_argument('--conf', type=float, default=0.6,
                        help='Initial confidence threshold')
    parser.add_argument('--nms', type=float, default=0.4,
                        help='NMS threshold')
    parser.add_argument('--camera', type=int, default=0,
                        help='Camera device ID')
    parser.add_argument('--no-flip', action='store_true',
                        help='Disable frame flipping')
    args = parser.parse_args()

    try:
        detector = WebcamDetector(
            model_path=args.model,
            conf_thresh=args.conf,
            nms_thresh=args.nms,
            camera_id=args.camera,
            flip_frame=not args.no_flip
        )
        detector.run()
    except Exception as e:
        print(f"❌ Error: {e}")
        return 1
    return 0


if __name__ == '__main__':
    exit(main())