harpoon-1-2 / webcam_detection_tracked.py

Update HarpoonNet 1.2 with Non-Commercial License - ConvNeXt-Small Architecture

cb237e8 verified 2 months ago

11.6 kB

	"""
	HarpoonNet Webcam Detection with ByteTrack
	Real-time drone detection and tracking using webcam
	"""

	import cv2
	import torch
	import numpy as np
	import argparse
	import time
	from PIL import Image
	from torchvision import transforms
	from harpoon_modular import create_harpoon_net_modular
	from bytetrack import BYTETracker
	import random


	class WebcamDetectorWithTracking:
	"""Webcam detector with ByteTrack integration"""

	def __init__(self, model_path, device=None):
	"""Initialize detector with tracking"""
	self.model_path = model_path
	self.device = device or torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	self.conf_thresh = 0.6 # Increased from 0.35 to be less sensitive
	self.debug_view = False
	self.last_time = None

	# Initialize ByteTracker
	self.tracker = BYTETracker(
	frame_rate=30,
	track_thresh=0.7, # Increased from 0.5 - higher threshold for new tracks
	track_buffer=30,
	match_thresh=0.8
	)

	# Colors for different track IDs
	self.colors = self.generate_colors(50)

	# Load model
	print(f"🔄 Loading model from {model_path}")
	self.model = create_harpoon_net_modular(pretrained=False)

	# Load checkpoint
	checkpoint = torch.load(model_path, map_location=self.device)
	if 'model_state_dict' in checkpoint:
	# Full checkpoint file
	self.model.load_state_dict(checkpoint['model_state_dict'])
	else:
	# Just model weights
	self.model.load_state_dict(checkpoint)

	self.model = self.model.to(self.device)
	self.model.eval()
	print("✅ Model loaded successfully")

	# Setup webcam
	self.cap = cv2.VideoCapture(0)
	if not self.cap.isOpened():
	raise RuntimeError("❌ Could not open webcam")

	# Get webcam resolution
	self.frame_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
	self.frame_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
	print(f"📹 Camera resolution: {self.frame_width}x{self.frame_height}")
	print(f"🎯 Initial confidence threshold: {self.conf_thresh:.2f}")

	def generate_colors(self, num_colors):
	"""Generate random colors for track visualization"""
	colors = []
	for _ in range(num_colors):
	color = (
	random.randint(0, 255),
	random.randint(0, 255),
	random.randint(0, 255)
	)
	colors.append(color)
	return colors

	def preprocess_frame(self, frame):
	"""Preprocess frame for model input"""
	# Get original dimensions
	orig_h, orig_w = frame.shape[:2]

	# Calculate scaling to maintain aspect ratio
	input_size = 320
	scale = min(input_size / orig_w, input_size / orig_h)
	new_w = int(orig_w * scale)
	new_h = int(orig_h * scale)

	# Resize maintaining aspect ratio
	resized = cv2.resize(frame, (new_w, new_h))

	# Create square canvas with padding
	square = np.zeros((input_size, input_size, 3), dtype=np.uint8)
	# Center the resized image in the square
	x_offset = (input_size - new_w) // 2
	y_offset = (input_size - new_h) // 2
	square[y_offset:y_offset+new_h, x_offset:x_offset+new_w] = resized

	# Convert to RGB
	img = cv2.cvtColor(square, cv2.COLOR_BGR2RGB)

	# Convert to PIL Image for transforms
	img = Image.fromarray(img)

	# Apply transforms
	transform = transforms.Compose([
	transforms.ToTensor(),
	transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
	])

	# Apply transform and add batch dimension
	img = transform(img).unsqueeze(0)

	return img, (scale, x_offset, y_offset)

	def postprocess_detections(self, detections, preprocess_info, frame_shape):
	"""Convert normalized coordinates back to original frame coordinates"""
	scale, x_offset, y_offset = preprocess_info
	orig_h, orig_w = frame_shape[:2]
	processed_detections = []

	for i, det in enumerate(detections[0]['boxes']):
	if len(det) == 4 and i < len(detections[0]['scores']):
	score = detections[0]['scores'][i]

	# Apply confidence threshold here
	if score < self.conf_thresh:
	continue

	# Remove padding offset
	x1 = (det[0] - x_offset) / scale
	y1 = (det[1] - y_offset) / scale
	x2 = (det[2] - x_offset) / scale
	y2 = (det[3] - y_offset) / scale

	# Clip to frame boundaries
	x1 = np.clip(x1, 0, orig_w)
	y1 = np.clip(y1, 0, orig_h)
	x2 = np.clip(x2, 0, orig_w)
	y2 = np.clip(y2, 0, orig_h)

	# Only add if box has reasonable size
	w = x2 - x1
	h = y2 - y1
	if w > 10 and h > 10 and w < orig_w * 0.9 and h < orig_h * 0.9:
	processed_detections.append([x1, y1, x2, y2, score])

	return processed_detections

	def process_frame(self, frame):
	"""Process a single frame with detection and tracking"""
	# Preprocess
	img, preprocess_info = self.preprocess_frame(frame)
	img = img.to(self.device)

	# Run inference
	with torch.no_grad():
	predictions = self.model(img)
	detections = self.model.decode_predictions(predictions, confidence_threshold=0.1) # Keep low for raw detections

	# Postprocess detections (applies our adjustable confidence threshold)
	detection_list = self.postprocess_detections(detections, preprocess_info, frame.shape)

	# Update tracker
	tracks = self.tracker.update(detection_list)

	# Draw tracks
	frame_with_tracks = self.draw_tracks(frame, tracks)

	# Add FPS and track count
	fps = 1.0 / (time.time() - self.last_time) if self.last_time else 0.0
	self.last_time = time.time()

	cv2.putText(frame_with_tracks, f"FPS: {fps:.1f} \| Tracks: {len(tracks)} \| Dets: {len(detection_list)} \| Conf: {self.conf_thresh:.2f}",
	(10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

	return frame_with_tracks

	def draw_tracks(self, frame, tracks):
	"""Draw tracking results on frame"""
	frame_with_tracks = frame.copy()

	for track in tracks:
	if not track.is_activated:
	continue

	# Get track info
	track_id = track.track_id
	bbox = track.tlbr
	score = track.score

	# Get color for this track
	color = self.colors[track_id % len(self.colors)]

	# Draw bounding box
	x1, y1, x2, y2 = map(int, bbox)
	cv2.rectangle(frame_with_tracks, (x1, y1), (x2, y2), color, 2)

	# Draw track ID and confidence
	label = f"ID:{track_id} ({score:.2f})"
	label_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]

	# Draw label background
	cv2.rectangle(frame_with_tracks,
	(x1, y1 - label_size[1] - 10),
	(x1 + label_size[0], y1),
	color, -1)

	# Draw label text
	cv2.putText(frame_with_tracks, label,
	(x1, y1 - 5),
	cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

	# Draw track history (trail)
	if hasattr(track, 'history') and len(track.history) > 1:
	points = []
	for hist_bbox in list(track.history.values())[-10:]: # Last 10 positions
	center_x = int((hist_bbox[0] + hist_bbox[2]) / 2)
	center_y = int((hist_bbox[1] + hist_bbox[3]) / 2)
	points.append((center_x, center_y))

	# Draw trail
	for i in range(1, len(points)):
	cv2.line(frame_with_tracks, points[i-1], points[i], color, 2)

	# Draw center point
	center_x = int((x1 + x2) / 2)
	center_y = int((y1 + y2) / 2)
	cv2.circle(frame_with_tracks, (center_x, center_y), 3, color, -1)

	return frame_with_tracks

	def run(self):
	"""Run webcam detection with tracking"""
	print("🎥 Starting detection with tracking...")
	print("Controls:")
	print(" 'q': Quit")
	print(" 'd': Toggle debug view")
	print(" '+': Increase confidence threshold")
	print(" '-': Decrease confidence threshold")
	print(" 'r': Reset tracker")

	self.last_time = time.time()

	while True:
	ret, frame = self.cap.read()
	if not ret:
	break

	# Process frame
	frame_with_tracks = self.process_frame(frame)

	# Show frame
	cv2.imshow('HarpoonNet Detection + Tracking', frame_with_tracks)

	# Handle key presses
	key = cv2.waitKey(1) & 0xFF
	if key == ord('q'):
	break
	elif key == ord('d'):
	self.debug_view = not self.debug_view
	elif key == ord('+'):
	self.conf_thresh = min(1.0, self.conf_thresh + 0.05)
	print(f"Confidence threshold: {self.conf_thresh:.2f}")
	elif key == ord('-'):
	self.conf_thresh = max(0.05, self.conf_thresh - 0.05)
	print(f"Confidence threshold: {self.conf_thresh:.2f}")
	elif key == ord('r'):
	# Reset tracker
	self.tracker = BYTETracker(
	frame_rate=30,
	track_thresh=0.7, # Increased from 0.5 - higher threshold for new tracks
	track_buffer=30,
	match_thresh=0.8
	)
	print("🔄 Tracker reset")

	self.cap.release()
	cv2.destroyAllWindows()


	def main():
	"""Main function"""
	parser = argparse.ArgumentParser(description='HarpoonNet Webcam Detection with Tracking')
	parser.add_argument('--model', type=str, required=True, help='Path to model file')
	parser.add_argument('--device', type=str, default=None, help='Device to use (cuda/cpu)')

	args = parser.parse_args()

	# Set device
	if args.device:
	device = torch.device(args.device)
	else:
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	print(f"🚀 Using device: {device}")

	try:
	# Create detector
	detector = WebcamDetectorWithTracking(args.model, device)

	# Run detection
	detector.run()

	except KeyboardInterrupt:
	print("\n⏹️ Detection stopped by user")
	except Exception as e:
	print(f"❌ Error: {e}")


	if __name__ == '__main__':
	main()