""" Multi-Dataset Configuration for HarpoonNet Modular Training Combines multiple drone datasets for comprehensive training """ import torch import os from pathlib import Path class MultiDatasetConfig: """Configuration for training with multiple drone datasets""" # Base paths DOWNLOADS_PATH = os.path.expanduser("~/Downloads") PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) # Available datasets with their info DATASETS = { 'drone_v1': { 'path': os.path.join(DOWNLOADS_PATH, 'Drone.v1i.yolov8'), 'name': 'Drone.v1i.yolov8', 'description': 'Large drone dataset - 17K+ images', 'priority': 1, # Primary dataset 'train_images': 15687, 'valid_images': 1342, 'test_images': 677 }, 'drone_detection_v5': { 'path': os.path.join(DOWNLOADS_PATH, 'droneDetection.v5i.yolov8'), 'name': 'droneDetection.v5i.yolov8', 'description': 'Drone detection dataset - 1.3K images', 'priority': 2, 'train_images': 1143, 'valid_images': 109, 'test_images': 54 }, 'drone_detection_v5_2': { 'path': os.path.join(DOWNLOADS_PATH, 'droneDetection.v5i (2).yolov8'), 'name': 'droneDetection.v5i (2).yolov8', 'description': 'Drone detection dataset v2 - 1.3K images', 'priority': 3, 'train_images': 1143, 'valid_images': 109, 'test_images': 54 }, 'miranda_ot1': { 'path': os.path.join(DOWNLOADS_PATH, 'Miranda OT1.v3i.yolov8'), 'name': 'Miranda OT1.v3i.yolov8', 'description': 'Miranda drone dataset - 933 images', 'priority': 4, 'train_images': 933, 'valid_images': 0, # No validation split 'test_images': 0 # No test split }, # Background (negative) dataset โ€“ smoke & clouds only (no labels) 'smokecloud_v1': { 'path': os.path.join(DOWNLOADS_PATH, 'Smoke_Cloud.v1i (2).yolov8'), 'name': 'Smoke_Cloud.v1i (2).yolov8', 'description': 'Smoke & cloud negative dataset - 5.5K images, label-free', 'priority': 5, 'is_negative_dataset': True, # Flag to indicate this is for negative examples # Actual counts from the dataset 'train_images': 4500, 'valid_images': 639, 'test_images': 326 }, # Additional negative datasets for reducing false positives 'doors_v1': { 'path': os.path.join(DOWNLOADS_PATH, 'door.v1i.yolov8'), 'name': 'door.v1i.yolov8', 'description': 'Door/wall negative dataset - 50 images, label-free', 'priority': 6, 'is_negative_dataset': True, 'train_images': 35, 'valid_images': 10, 'test_images': 5 }, 'birds_v1': { 'path': os.path.join(DOWNLOADS_PATH, 'birds label.v1i.yolov8'), 'name': 'birds label.v1i.yolov8', 'description': 'Birds negative dataset - 100 images, label-free', 'priority': 7, 'is_negative_dataset': True, 'train_images': 70, 'valid_images': 20, 'test_images': 10 }, 'airplanes_v1': { 'path': os.path.join(DOWNLOADS_PATH, 'Airplane detection.v1i.yolov8'), 'name': 'Airplane detection.v1i.yolov8', 'description': 'Airplane negative dataset - 4.5K images, label-free', 'priority': 8, 'is_negative_dataset': True, 'train_images': 3200, 'valid_images': 900, 'test_images': 454 } } # Validate dataset availability AVAILABLE_DATASETS = {} for key, dataset in DATASETS.items(): if os.path.exists(dataset['path']): AVAILABLE_DATASETS[key] = dataset print(f"โœ… Found dataset: {dataset['name']} ({dataset['description']})") else: print(f"โŒ Missing dataset: {dataset['name']} at {dataset['path']}") # Calculate total dataset size TOTAL_TRAIN_IMAGES = sum(d['train_images'] for d in AVAILABLE_DATASETS.values()) TOTAL_VALID_IMAGES = sum(d['valid_images'] for d in AVAILABLE_DATASETS.values()) TOTAL_TEST_IMAGES = sum(d['test_images'] for d in AVAILABLE_DATASETS.values()) TOTAL_IMAGES = TOTAL_TRAIN_IMAGES + TOTAL_VALID_IMAGES + TOTAL_TEST_IMAGES 
print(f"\n๐Ÿ“Š Combined Dataset Stats:") print(f" Train: {TOTAL_TRAIN_IMAGES:,} images") print(f" Valid: {TOTAL_VALID_IMAGES:,} images") print(f" Test: {TOTAL_TEST_IMAGES:,} images") print(f" Total: {TOTAL_IMAGES:,} images") # Model parameters for HarpoonNet Modular NUM_CLASSES = 1 # Single class: drone IMAGE_SIZE = 416 # Optimized for edge inference # Training parameters - optimized for large multi-dataset training BATCH_SIZE = 16 # Reduced due to larger dataset size EPOCHS = 50 # More epochs for comprehensive learning LEARNING_RATE = 0.001 # Conservative LR for stable multi-dataset training WEIGHT_DECAY = 0.0001 # Loss parameters for HarpoonNet Modular (single-scale) LAMBDA_COORD = 5.0 # Coordinate loss weight LAMBDA_OBJ = 1.0 # Objectness loss weight LAMBDA_NOOBJ = 0.5 # No-object loss weight LAMBDA_CLASS = 1.0 # Classification loss weight # Regularization DROPOUT_RATE = 0.1 GRADIENT_CLIP_VALUE = 10.0 # Device DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Inference parameters CONF_THRESHOLD = 0.25 # Confidence threshold NMS_THRESHOLD = 0.4 # NMS threshold MAX_DETECTIONS = 100 # Max detections per image # Data augmentation - balanced for multi-dataset training AUGMENTATION = True AUG_PARAMS = { 'flip_lr': 0.5, # Horizontal flip 'flip_ud': 0.1, # Vertical flip (minimal for drones) 'rotation': 15, # Max rotation degrees 'scale': 0.2, # Scale variation 'brightness': 0.2, # Brightness variation 'contrast': 0.2, # Contrast variation 'saturation': 0.3, # Saturation variation 'hue': 0.01 # Minimal hue variation } # Checkpoints and logging CHECKPOINT_PATH = "checkpoints" EXPERIMENT_NAME = f"harpoon_multi_dataset_{TOTAL_IMAGES//1000}k" # Early stopping EARLY_STOPPING_PATIENCE = 10 EARLY_STOPPING_MIN_DELTA = 0.001 # Learning rate scheduler LR_SCHEDULER = 'reduce_on_plateau' LR_SCHEDULER_PATIENCE = 5 LR_SCHEDULER_FACTOR = 0.5 LR_SCHEDULER_MIN_LR = 1e-6 @classmethod def get_dataset_paths(cls, dataset_key, split='train'): """Get paths for a specific dataset and split""" if dataset_key not in cls.AVAILABLE_DATASETS: raise ValueError(f"Dataset {dataset_key} not available") dataset = cls.AVAILABLE_DATASETS[dataset_key] base_path = dataset['path'] if split == 'train': images_path = os.path.join(base_path, 'train', 'images') labels_path = os.path.join(base_path, 'train', 'labels') elif split == 'valid': images_path = os.path.join(base_path, 'valid', 'images') labels_path = os.path.join(base_path, 'valid', 'labels') elif split == 'test': images_path = os.path.join(base_path, 'test', 'images') labels_path = os.path.join(base_path, 'test', 'labels') else: raise ValueError(f"Invalid split: {split}") return images_path, labels_path @classmethod def get_all_dataset_paths(cls, split='train'): """Get paths for all available datasets for a specific split""" all_paths = [] for dataset_key in cls.AVAILABLE_DATASETS.keys(): try: images_path, labels_path = cls.get_dataset_paths(dataset_key, split) dataset_info = cls.AVAILABLE_DATASETS[dataset_key] is_negative = dataset_info.get('is_negative_dataset', False) # For negative datasets, only check images path (no labels needed) if is_negative: if os.path.exists(images_path): all_paths.append((images_path, labels_path, dataset_key)) print(f"โœ… Added {split} path for {dataset_key} (negative dataset)") else: print(f"โš ๏ธ Skipping {dataset_key} {split} - images path not found") else: # For regular datasets, check both images and labels if os.path.exists(images_path) and os.path.exists(labels_path): all_paths.append((images_path, labels_path, 
                        print(f"✅ Added {split} path for {dataset_key}")
                    else:
                        print(f"⚠️ Skipping {dataset_key} {split} - path not found")
            except ValueError as e:
                print(f"⚠️ Skipping {dataset_key} {split}: {e}")

        return all_paths

    @classmethod
    def print_config(cls):
        """Print the multi-dataset configuration."""
        print("\n" + "=" * 60)
        print("🚀 HarpoonNet Multi-Dataset Training Configuration")
        print("=" * 60)

        print(f"Available Datasets: {len(cls.AVAILABLE_DATASETS)}")
        for dataset in cls.AVAILABLE_DATASETS.values():
            print(f"  📁 {dataset['name']}")
            print(f"     Train: {dataset['train_images']:,} | "
                  f"Valid: {dataset['valid_images']:,} | "
                  f"Test: {dataset['test_images']:,}")

        print("\n📊 Combined Statistics:")
        print(f"   Total Images: {cls.TOTAL_IMAGES:,}")
        print(f"   Training Images: {cls.TOTAL_TRAIN_IMAGES:,}")
        print(f"   Validation Images: {cls.TOTAL_VALID_IMAGES:,}")

        print("\n🏗️ Architecture:")
        print("   Model: HarpoonNet Modular (EfficientNet-B0 + HarpoonHead)")
        print(f"   Input Size: {cls.IMAGE_SIZE}x{cls.IMAGE_SIZE}")
        print(f"   Classes: {cls.NUM_CLASSES} (drone detection)")

        print("\n🎯 Training Parameters:")
        print(f"   Batch Size: {cls.BATCH_SIZE}")
        print(f"   Epochs: {cls.EPOCHS}")
        print(f"   Learning Rate: {cls.LEARNING_RATE}")
        print(f"   Device: {cls.DEVICE}")
        print(f"   Experiment: {cls.EXPERIMENT_NAME}")
        print("=" * 60)


if __name__ == "__main__":
    # Test the configuration
    config = MultiDatasetConfig()
    config.print_config()

    # Test path retrieval
    print("\n🧪 Testing path retrieval:")
    train_paths = config.get_all_dataset_paths('train')
    print(f"Found {len(train_paths)} training dataset paths")

    valid_paths = config.get_all_dataset_paths('valid')
    print(f"Found {len(valid_paths)} validation dataset paths")
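
    # A minimal sanity-check sketch: count the image files actually on disk
    # for the train split and compare against the configured totals. This
    # assumes the standard YOLOv8 export layout (<dataset>/train/images)
    # used above; the extension set below is an assumption and may need
    # adjusting to match your exports.
    image_exts = {'.jpg', '.jpeg', '.png', '.bmp'}
    found_on_disk = 0
    for images_dir, _labels_dir, _key in train_paths:
        found_on_disk += sum(
            1 for f in os.listdir(images_dir)
            if os.path.splitext(f)[1].lower() in image_exts
        )
    print(f"Train images on disk: {found_on_disk:,} "
          f"(configured: {config.TOTAL_TRAIN_IMAGES:,})")

    # Sketch of how these path tuples might feed a training pipeline,
    # assuming a hypothetical YOLO-format Dataset class (not part of this
    # module) that yields empty label tensors for the negative datasets:
    #
    #   from torch.utils.data import ConcatDataset, DataLoader
    #   datasets = [YOLODroneDataset(img_dir, lbl_dir, img_size=config.IMAGE_SIZE)
    #               for img_dir, lbl_dir, _ in train_paths]
    #   loader = DataLoader(ConcatDataset(datasets),
    #                       batch_size=config.BATCH_SIZE, shuffle=True)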