File size: 10,534 Bytes

cb237e8

#!/usr/bin/env python3
"""
HarpoonNet Webcam Detection Script
Real-time drone detection using webcam feed
"""

import cv2
import torch
import numpy as np
import argparse
from pathlib import Path
from harpoon_modular import HarpoonNetModular
from PIL import Image
from torchvision import transforms
import time

class WebcamDetector:
    """Run HarpoonNet drone detection on a live webcam feed.

    The detector loads a checkpoint, opens a camera, and for every captured
    frame letterboxes it to a square model input, runs the model, maps the
    resulting boxes back to frame coordinates and displays an annotated
    preview window.
    """

    # Side length (pixels) of the square letterboxed image fed to the model.
    INPUT_SIZE = 320

    def __init__(self, model_path, conf_thresh=0.6, nms_thresh=0.4, camera_id=0, flip_frame=True):
        """Load the model and open the camera.

        Args:
            model_path: Path to the checkpoint file passed to ``torch.load``.
            conf_thresh: Initial confidence threshold handed to
                ``decode_predictions``; adjustable at runtime with '+'/'-'.
            nms_thresh: NMS threshold. Stored on the instance only.
            camera_id: Device index passed to ``cv2.VideoCapture``.
            flip_frame: When true, each captured frame is mirrored
                horizontally before detection and display.

        Raises:
            RuntimeError: If the camera cannot be opened.
        """
        self.model_path = model_path
        self.conf_thresh = conf_thresh
        # NOTE(review): nms_thresh is kept for callers but nothing in this
        # class forwards it to the model; confirm whether decode_predictions
        # should receive it.
        self.nms_thresh = nms_thresh
        self.flip_frame = flip_frame
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.debug_view = False  # Toggle with 'd' key
        self.last_time = None

        # ImageNet normalization parameters (as float32)
        self.normalize_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        self.normalize_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

        # Built once here instead of on every frame; the pipeline is
        # identical for all frames.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        # Load model
        print(f"🔄 Loading model from {model_path}")
        # SECURITY NOTE: torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        checkpoint = torch.load(model_path, map_location=self.device)
        # Accept both a training checkpoint that wraps the weights under
        # 'model_state_dict' and a file that is the bare state dict itself.
        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']
        else:
            state_dict = checkpoint
        self.model = HarpoonNetModular(num_classes=1, num_anchors=3)
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)
        self.model.eval()
        print("✅ Model loaded successfully")

        # Initialize webcam
        self.cap = cv2.VideoCapture(camera_id)
        if not self.cap.isOpened():
            # Release the handle so a failed open does not linger.
            self.cap.release()
            raise RuntimeError(f"Failed to open camera {camera_id}")

        # Get camera properties
        self.frame_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.frame_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print(f"📹 Camera resolution: {self.frame_width}x{self.frame_height}")
        print(f"🎯 Initial confidence threshold: {self.conf_thresh:.2f}")

    def enhance_frame(self, frame):
        """Return a contrast-enhanced copy of a BGR frame.

        Applies CLAHE to the L channel in LAB space, converts back to BGR and
        then applies a linear contrast/brightness adjustment. No other method
        of this class calls it; it is available as an optional pre-step.
        """
        # Convert to LAB color space
        lab = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)
        l, a, b = cv2.split(lab)

        # Apply CLAHE to L channel
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        cl = clahe.apply(l)

        # Merge channels
        limg = cv2.merge((cl, a, b))

        # Convert back to BGR
        enhanced = cv2.cvtColor(limg, cv2.COLOR_LAB2BGR)

        # Increase contrast
        alpha = 1.3  # Contrast control
        beta = 10    # Brightness control
        enhanced = cv2.convertScaleAbs(enhanced, alpha=alpha, beta=beta)

        return enhanced

    def preprocess_frame(self, frame):
        """Letterbox a BGR frame into a normalized model input tensor.

        The frame is resized to fit inside an ``INPUT_SIZE`` square while
        keeping its aspect ratio, centered on a black canvas, converted to
        RGB and normalized with the ImageNet mean/std.

        Returns:
            ``(tensor, (scale, x_offset, y_offset))`` where ``tensor`` has a
            leading batch dimension and the tuple holds what is needed to map
            model-space boxes back onto the original frame.

        Raises:
            ValueError: If the frame has zero width or height.
        """
        # Get original dimensions
        orig_h, orig_w = frame.shape[:2]
        if orig_h == 0 or orig_w == 0:
            raise ValueError("Cannot preprocess an empty frame")

        # Calculate scaling to maintain aspect ratio
        input_size = self.INPUT_SIZE
        scale = min(input_size / orig_w, input_size / orig_h)
        # Keep at least one pixel per side so extreme aspect ratios do not
        # hand cv2.resize a zero-sized target.
        new_w = max(1, int(orig_w * scale))
        new_h = max(1, int(orig_h * scale))

        # Resize maintaining aspect ratio
        resized = cv2.resize(frame, (new_w, new_h))

        # Create square canvas with padding and center the resized image
        square = np.zeros((input_size, input_size, 3), dtype=np.uint8)
        x_offset = (input_size - new_w) // 2
        y_offset = (input_size - new_h) // 2
        square[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized

        # Convert to RGB, then to a PIL image for the torchvision transforms
        img = Image.fromarray(cv2.cvtColor(square, cv2.COLOR_BGR2RGB))

        # Apply transform and add batch dimension
        img = self.transform(img).unsqueeze(0)

        return img, (scale, x_offset, y_offset)

    def _map_box_to_frame(self, det, preprocess_info, frame_shape):
        """Map one model-space box to frame pixels, or return None if rejected.

        A box is rejected when it does not have exactly four coordinates, or
        when, after clipping to the frame, it is not larger than 10 px or is
        at least 90% of the frame in either dimension.
        """
        if len(det) != 4:
            return None

        scale, x_offset, y_offset = preprocess_info
        orig_h, orig_w = frame_shape[:2]

        # float() works for Python numbers, NumPy scalars and 0-d tensors
        # alike, so the math below does not depend on how the model returns
        # its boxes. Subtract the letterbox padding, then undo the resize.
        x1 = (float(det[0]) - x_offset) / scale
        y1 = (float(det[1]) - y_offset) / scale
        x2 = (float(det[2]) - x_offset) / scale
        y2 = (float(det[3]) - y_offset) / scale

        # Clip to frame boundaries
        x1 = min(max(x1, 0.0), orig_w)
        y1 = min(max(y1, 0.0), orig_h)
        x2 = min(max(x2, 0.0), orig_w)
        y2 = min(max(y2, 0.0), orig_h)

        # Only keep boxes with a reasonable size
        w = x2 - x1
        h = y2 - y1
        if w > 10 and h > 10 and w < orig_w * 0.9 and h < orig_h * 0.9:
            return [int(x1), int(y1), int(x2), int(y2)]
        return None

    def _postprocess_indexed(self, detections, preprocess_info, frame_shape):
        """Return ``(index, box)`` pairs for the boxes that survive filtering.

        ``index`` is the position of the box in ``detections[0]['boxes']`` so
        callers can look up per-box data such as the score.
        """
        if len(detections) == 0:
            return []

        kept = []
        for idx, det in enumerate(detections[0]['boxes']):
            box = self._map_box_to_frame(det, preprocess_info, frame_shape)
            if box is not None:
                kept.append((idx, box))
        return kept

    def postprocess_detections(self, detections, preprocess_info, frame_shape):
        """Convert model-space boxes back to original frame coordinates.

        Args:
            detections: Sequence whose first element maps ``'boxes'`` to an
                iterable of 4-element boxes in letterboxed input pixels.
            preprocess_info: ``(scale, x_offset, y_offset)`` as returned by
                ``preprocess_frame``.
            frame_shape: Shape of the original frame; only height and width
                (the first two entries) are used.

        Returns:
            List of ``[x1, y1, x2, y2]`` integer boxes clipped to the frame,
            excluding malformed, tiny and near-full-frame boxes.
        """
        return [box for _, box in
                self._postprocess_indexed(detections, preprocess_info, frame_shape)]

    def process_frame(self, frame):
        """Detect on one BGR frame and return an annotated copy of it.

        The input frame is not modified. The returned image has one rectangle
        and score label per kept detection, a status line with FPS, detection
        count and the current confidence threshold, and the debug overlay
        when ``debug_view`` is enabled.
        """
        # Preprocess
        img, preprocess_info = self.preprocess_frame(frame)
        img = img.to(self.device)

        # Run inference
        with torch.no_grad():
            predictions = self.model(img)
            detections = self.model.decode_predictions(predictions, confidence_threshold=self.conf_thresh)

        # Postprocess, remembering each box's original index so the matching
        # score can be shown even after some boxes were filtered out.
        indexed_boxes = self._postprocess_indexed(detections, preprocess_info, frame.shape)
        # Assumes 'scores' is index-aligned with 'boxes' — TODO confirm
        # against decode_predictions.
        scores = detections[0]['scores'] if indexed_boxes else []

        # Draw detections
        frame_with_boxes = frame.copy()
        for idx, (x1, y1, x2, y2) in indexed_boxes:
            cv2.rectangle(frame_with_boxes, (x1, y1), (x2, y2), (0, 255, 0), 2)

            # Add this box's own confidence score if available
            if idx < len(scores):
                conf = float(scores[idx])
                cv2.putText(frame_with_boxes, f"Drone: {conf:.2f}",
                            (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Add FPS and detection count. A zero (or negative, after a clock
        # adjustment) interval is reported as 0 FPS instead of dividing by it.
        now = time.time()
        elapsed = now - self.last_time if self.last_time is not None else 0.0
        fps = 1.0 / elapsed if elapsed > 0 else 0.0
        self.last_time = now
        self.fps = fps

        cv2.putText(frame_with_boxes,
                    f"FPS: {fps:.1f} | Detected: {len(indexed_boxes)} | Conf: {self.conf_thresh:.2f}",
                    (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

        # Honour the 'd' toggle handled in run()
        if self.debug_view:
            self._draw_debug_overlay(frame_with_boxes)

        return frame_with_boxes

    def _draw_debug_overlay(self, frame):
        """Draw the debug banner, threshold readout and center crosshair in place."""
        cv2.putText(frame, "Debug View: ON", (10, 60),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.putText(frame, f"Conf Thresh: {self.conf_thresh:.2f}", (10, 90),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Draw center crosshair
        h, w = frame.shape[:2]
        cv2.line(frame, (w // 2, 0), (w // 2, h), (0, 0, 255), 1)
        cv2.line(frame, (0, h // 2), (w, h // 2), (0, 0, 255), 1)

    def draw_detections(self, frame, detections):
        """Draw detection boxes and labels on ``frame`` in place and return it.

        Args:
            frame: BGR image to draw on.
            detections: Iterable of dicts with a ``'box'`` entry
                (``x1, y1, x2, y2``) and a ``'confidence'`` entry.
        """
        for det in detections:
            x1, y1, x2, y2 = det['box']
            conf = det['confidence']

            # Draw box
            color = (0, int(255 * conf), 0)  # Brighter green for higher confidence
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

            # Draw label with confidence
            label = f"Drone: {conf:.2f}"
            cv2.putText(frame, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        # Draw FPS and detection count (only once an FPS value has been set)
        if hasattr(self, 'fps'):
            cv2.putText(frame, f"FPS: {self.fps:.1f} | Detected: {len(detections)}",
                        (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Draw debug info
        if self.debug_view:
            self._draw_debug_overlay(frame)

        return frame

    def run(self):
        """Capture, detect and display frames until 'q' is pressed or capture fails.

        The camera and the preview window are released on exit, including
        when processing a frame raises.
        """
        print("🎥 Starting detection...")
        print("Controls:")
        print("  'q': Quit")
        print("  'd': Toggle debug view")
        print("  '+': Increase confidence threshold")
        print("  '-': Decrease confidence threshold")

        self.last_time = time.time()

        try:
            while True:
                ret, frame = self.cap.read()
                if not ret:
                    break

                # Mirror the image unless flipping was disabled (--no-flip)
                if self.flip_frame:
                    frame = cv2.flip(frame, 1)

                # Process frame
                frame_with_boxes = self.process_frame(frame)

                # Show frame
                cv2.imshow('HarpoonNet Detection', frame_with_boxes)

                # Handle key presses
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                elif key == ord('d'):
                    self.debug_view = not self.debug_view
                elif key == ord('+'):
                    self.conf_thresh = min(1.0, self.conf_thresh + 0.05)
                    print(f"Confidence threshold: {self.conf_thresh:.2f}")
                elif key == ord('-'):
                    self.conf_thresh = max(0.05, self.conf_thresh - 0.05)
                    print(f"Confidence threshold: {self.conf_thresh:.2f}")
        finally:
            self.cap.release()
            cv2.destroyAllWindows()

def main():
    """Command-line entry point for the webcam detector.

    Parses the command-line options, builds a ``WebcamDetector`` from them
    and runs it. Any exception raised while constructing or running the
    detector is reported on stdout.

    Returns:
        0 on a clean run, 1 if an exception was caught.
    """
    cli = argparse.ArgumentParser(description='HarpoonNet Webcam Detection')

    # (flag, add_argument keyword arguments) for every supported option
    option_specs = (
        ('--model', {'type': str, 'required': True,
                     'help': 'Path to model checkpoint'}),
        ('--conf', {'type': float, 'default': 0.6,
                    'help': 'Initial confidence threshold'}),
        ('--nms', {'type': float, 'default': 0.4,
                   'help': 'NMS threshold'}),
        ('--camera', {'type': int, 'default': 0,
                      'help': 'Camera device ID'}),
        ('--no-flip', {'action': 'store_true',
                       'help': 'Disable frame flipping'}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    exit_status = 0
    try:
        webcam = WebcamDetector(
            model_path=opts.model,
            conf_thresh=opts.conf,
            nms_thresh=opts.nms,
            camera_id=opts.camera,
            flip_frame=not opts.no_flip,
        )
        webcam.run()
    except Exception as err:
        # Top-level boundary: report the failure and signal it via the status
        print(f"❌ Error: {str(err)}")
        exit_status = 1

    return exit_status

if __name__ == '__main__':
    # Propagate main()'s status as the process exit code. SystemExit is a
    # builtin, whereas exit() is a helper installed by the site module and
    # is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(main())