Unable to connect the correct model yaml and checkpoint

#1 opened by Steven8686

Hi there,
Thanks for your great work. However, when I run the inference demo presented in the README, I encounter an error:
File ".../lib/python3.11/site-packages/torch/nn/modules/module.py", line 1931, in getattr
raise AttributeError(
AttributeError: 'AAttn' object has no attribute 'qk'. Did you mean: 'qkv'?

I've checked issue #109 (https://github.com/sunsmarterjie/yolov12/issues/109) in the official YOLOv12 repo; it says this error usually means the provided .pt is a turbo-variant model. Which exact yaml does your repo use for this checkpoint?
Thanks a lot!
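For reference, one way to see which config a checkpoint was built from is to inspect the .pt directly. This is only a sketch, assuming the file follows the usual Ultralytics checkpoint layout (a dict whose "model" entry carries a .yaml config); the filename is a placeholder:

import torch

# Load the raw checkpoint dict on CPU (filename is a placeholder).
# weights_only=False because Ultralytics checkpoints pickle full module objects.
ckpt = torch.load("yolov12l.pt", map_location="cpu", weights_only=False)

print(ckpt.keys())  # typically includes 'model', 'train_args', ...
# If present, model.yaml holds the architecture config the weights were trained with,
# which should reveal whether it is the standard or the turbo variant.
print(getattr(ckpt.get("model"), "yaml", None))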

I am definitely using the "l" variant.
Are you running this with "cuda" or something else?

Here is the code I am using to run this with CUDA on an NVIDIA GPU:

.....truncated
from ultralytics import YOLO

.....truncated

model = None


def setup_model(**kwargs):
    """Initialize YOLO model with optimizations."""
    global model

    # Define persistent model path for container volume
    persistent_model_path = "/data/models/yolo12l-person-seg-extended.pt"

    # Check if model exists in persistent storage
    if os.path.exists(persistent_model_path):
        logger.info(
            f"Loading YOLO12 segmentation model from persistent storage: {persistent_model_path}"
        )
        try:
            model = YOLO(persistent_model_path)
            logger.info("Segmentation model loaded from persistent volume successfully")
        except Exception as e:
            logger.error(
                "Failed to load custom segmentation model",
                error=str(e),
                model_path=persistent_model_path,
                traceback=True,
            )
            raise RuntimeError(
                f"Could not load required model from {persistent_model_path}: {str(e)}"
            )
    else:
        # Custom model not found - critical error as we have no fallback
        error_msg = f"CRITICAL: Required model not found at {persistent_model_path}"
        logger.critical(
            error_msg,
            required_model="yolo12l-person-seg.pt",
            check_paths=["/data/models/", "/app/", "./"],
            container_name=os.getenv("HOSTNAME", "unknown"),
        )
        raise FileNotFoundError(error_msg)

    # Check for available GPU acceleration
    if torch.cuda.is_available():
        logger.info(
            "CUDA is available, using NVIDIA GPU acceleration",
            cuda_version=torch.version.cuda,
            device_name=torch.cuda.get_device_name(0),
        )

        # Configure CUDA optimizations
        flash_attn_available = False  # default so the later check cannot hit an undefined name
        try:
            # This optimizes matrix multiplications
            torch.set_float32_matmul_precision("high")
            logger.info("Set float32 matmul precision to high")

            # FlashAttention should be pre-installed in the container image
            try:
                import flash_attn

                logger.info(
                    "FlashAttention is available and will be used automatically",
                    version=flash_attn.__version__,
                )
                flash_attn_available = True
            except ImportError:
                logger.warning(
                    "FlashAttention not found despite being in the base image - check build process",
                    warning="This is unexpected and may indicate an issue with the Docker image build",
                )
                flash_attn_available = False
        except Exception as e:
            logger.warning("Error setting up CUDA optimizations", error=str(e))

        try:
            # Move model to CUDA
            model.to("cuda")
            logger.info("Model successfully moved to CUDA device")

            # Test inference with FP16 to verify it works
            dummy_input = torch.zeros((1, 3, 640, 640), device="cuda")

            # Log if using FlashAttention
            if flash_attn_available:
                logger.info("Running inference with FlashAttention and FP16 support")
            else:
                logger.info(
                    "Running inference with standard attention and FP16 support"
                )

            test_result = model.predict(dummy_input, device="cuda", half=True)
            logger.info("Successfully tested FP16 inference")
        except Exception as e:
            logger.warning(
                "Failed to test FP16 precision, will still try at runtime", error=str(e)
            )
            # Ensure model is still on CUDA even if test fails
            model.to("cuda")
    else:
        logger.info("No GPU acceleration available, falling back to CPU")

    logger.info("YOLO12 model initialized successfully")
