Unable to connect the correct model yaml and checkpoint

#1 opened by Steven8686

Hi there,
Thanks for your great work. However, when I run the inference demo presented in the README, I encounter an error:
File ".../lib/python3.11/site-packages/torch/nn/modules/module.py", line 1931, in getattr
raise AttributeError(
AttributeError: 'AAttn' object has no attribute 'qk'. Did you mean: 'qkv'?

I've checked issue #109 (https://github.com/sunsmarterjie/yolov12/issues/109) in the official YOLOv12 repo; it says this error usually means the provided .pt is a turbo-variant model. Which exact yaml does your repo use for this checkpoint?
Thanks a lot!
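For reference, one way to see which config a checkpoint was built from is to inspect the .pt directly. This is only a sketch, assuming the file follows the usual Ultralytics checkpoint layout (a dict whose "model" entry carries a .yaml config); the filename is a placeholder:

import torch

# Load the raw checkpoint dict on CPU (filename is a placeholder).
# weights_only=False because Ultralytics checkpoints pickle full module objects.
ckpt = torch.load("yolov12l.pt", map_location="cpu", weights_only=False)

print(ckpt.keys())  # typically includes 'model', 'train_args', ...
# If present, model.yaml holds the architecture config the weights were trained with,
# which should reveal whether it is the standard or the turbo variant.
print(getattr(ckpt.get("model"), "yaml", None))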

I am definitely using the "l" variant.
Are you running this with "cuda" or something else?

Here is the code I am using to run this with CUDA on an NVIDIA GPU:

.....truncated
from ultralytics import YOLO

.....truncated

model = None


def setup_model(**kwargs):
    """Initialize YOLO model with optimizations."""
    global model

    # Define persistent model path for container volume
    persistent_model_path = "/data/models/yolo12l-person-seg-extended.pt"

    # Check if model exists in persistent storage
    if os.path.exists(persistent_model_path):
        logger.info(
            f"Loading YOLO12 segmentation model from persistent storage: {persistent_model_path}"
        )
        try:
            model = YOLO(persistent_model_path)
            logger.info("Segmentation model loaded from persistent volume successfully")
        except Exception as e:
            logger.error(
                "Failed to load custom segmentation model",
                error=str(e),
                model_path=persistent_model_path,
                traceback=True,
            )
            raise RuntimeError(
                f"Could not load required model from {persistent_model_path}: {str(e)}"
            )
    else:
        # Custom model not found - critical error as we have no fallback
        error_msg = f"CRITICAL: Required model not found at {persistent_model_path}"
        logger.critical(
            error_msg,
            required_model="yolo12l-person-seg.pt",
            check_paths=["/data/models/", "/app/", "./"],
            container_name=os.getenv("HOSTNAME", "unknown"),
        )
        raise FileNotFoundError(error_msg)

    # Check for available GPU acceleration
    if torch.cuda.is_available():
        logger.info(
            "CUDA is available, using NVIDIA GPU acceleration",
            cuda_version=torch.version.cuda,
            device_name=torch.cuda.get_device_name(0),
        )

        # Configure CUDA optimizations
        flash_attn_available = False  # default so the later check cannot hit an undefined name
        try:
            # This optimizes matrix multiplications
            torch.set_float32_matmul_precision("high")
            logger.info("Set float32 matmul precision to high")

            # FlashAttention should be pre-installed in the container image
            try:
                import flash_attn

                logger.info(
                    "FlashAttention is available and will be used automatically",
                    version=flash_attn.__version__,
                )
                flash_attn_available = True
            except ImportError:
                logger.warning(
                    "FlashAttention not found despite being in the base image - check build process",
                    warning="This is unexpected and may indicate an issue with the Docker image build",
                )
                flash_attn_available = False
        except Exception as e:
            logger.warning("Error setting up CUDA optimizations", error=str(e))

        try:
            # Move model to CUDA
            model.to("cuda")
            logger.info("Model successfully moved to CUDA device")

            # Test inference with FP16 to verify it works
            dummy_input = torch.zeros((1, 3, 640, 640), device="cuda")

            # Log if using FlashAttention
            if flash_attn_available:
                logger.info("Running inference with FlashAttention and FP16 support")
            else:
                logger.info(
                    "Running inference with standard attention and FP16 support"
                )

            test_result = model.predict(dummy_input, device="cuda", half=True)
            logger.info("Successfully tested FP16 inference")
        except Exception as e:
            logger.warning(
                "Failed to test FP16 precision, will still try at runtime", error=str(e)
            )
            # Ensure model is still on CUDA even if test fails
            model.to("cuda")
    else:
        logger.info("No GPU acceleration available, falling back to CPU")

    logger.info("YOLO12 model initialized successfully")
