# File size: 5,234 Bytes

# a15fec5

"""
Hugging Face compatible image processor for Trendyol DinoV2
"""
from transformers import ImageProcessingMixin, BatchFeature
from transformers.utils import TensorType
from PIL import Image
import torch
import numpy as np
import cv2
from torchvision import transforms
import torchvision.transforms.functional as TF
from io import BytesIO
from typing import Union, List, Optional


def downscale_image(image: Image.Image, max_dimension: int) -> Image.Image:
    """Downscale an image so its longer side is at most ``max_dimension``.

    The aspect ratio is preserved up to integer truncation and Lanczos
    resampling is used. An image that already fits is returned as-is (the
    same object, not a copy).

    Args:
        image: Source PIL image.
        max_dimension: Upper bound, in pixels, for the longer side.

    Returns:
        ``image`` itself when no downscaling is needed, otherwise a resized
        copy whose longer side equals ``max_dimension``.
    """
    original_width, original_height = image.size

    if max(original_width, original_height) <= max_dimension:
        return image

    aspect_ratio = original_width / original_height

    # The shorter side is truncated toward zero. For very elongated images
    # that truncation would yield 0 px, which ``resize`` rejects, so it is
    # clamped to at least 1 px.
    if original_width > original_height:
        new_width = max_dimension
        new_height = max(1, int(max_dimension / aspect_ratio))
    else:
        new_height = max_dimension
        new_width = max(1, int(max_dimension * aspect_ratio))

    return image.resize((new_width, new_height), Image.LANCZOS)


class DownScaleLanczos:
    """Callable transform that caps an image's longer side at ``target_size``.

    The actual resizing is delegated to ``downscale_image``.
    """

    def __init__(self, target_size=384):
        # Maximum allowed length of the longer side, passed to
        # ``downscale_image`` as its ``max_dimension``.
        self.target_size = target_size

    def __call__(self, img):
        """Return ``downscale_image(img, self.target_size)``."""
        max_side = self.target_size
        return downscale_image(img, max_side)


class JPEGCompression:
    """Callable transform that round-trips an image through JPEG encoding.

    The image is encoded to an in-memory JPEG at the configured quality and
    reopened from that buffer, so the result carries JPEG compression
    artifacts.
    """

    # Pillow image modes the JPEG encoder can write directly. Any other mode
    # (e.g. RGBA, LA, P) makes ``save(format='JPEG')`` fail, so such images
    # are converted to RGB before encoding.
    _JPEG_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    def __init__(self, quality=75):
        # JPEG quality setting forwarded to ``Image.save``.
        self.quality = quality

    def __call__(self, img):
        """Return ``img`` re-encoded as JPEG.

        Args:
            img: PIL image. Modes not writable as JPEG are converted to RGB
                first (alpha, if any, is dropped by that conversion).

        Returns:
            A PIL image opened from the in-memory JPEG buffer.
        """
        if img.mode not in self._JPEG_MODES:
            img = img.convert("RGB")

        buffer = BytesIO()
        img.save(buffer, format='JPEG', quality=self.quality)
        # Rewind so the decoder reads from the start of the encoded data.
        buffer.seek(0)
        return Image.open(buffer)


class ScaleImage:
    """Callable transform that rescales an image so its longer side is about ``target_size``.

    Unlike ``DownScaleLanczos`` this scales in both directions (smaller images
    are enlarged) and uses bilinear resampling.
    """

    def __init__(self, target_size):
        # Desired length, in pixels, of the longer side after scaling.
        self.target_size = target_size

    def __call__(self, img):
        """Return ``img`` resized by the factor ``target_size / max(width, height)``.

        Args:
            img: PIL image with a non-zero size.

        Returns:
            The resized PIL image.
        """
        w, h = img.size
        max_size = max(h, w)
        scale = self.target_size / max_size
        # Truncation (not rounding) is kept so existing outputs do not change.
        # Each side is clamped to 1 px because a very elongated image would
        # otherwise truncate to 0 px, which ``resize`` rejects.
        new_size = max(1, int(w * scale)), max(1, int(h * scale))
        return img.resize(new_size, Image.BILINEAR)


class PadToSquare:
    """Callable transform that pads the shorter side of an image to make it square.

    Padding is split between both sides of the shorter axis; when the size
    difference is odd, the extra pixel goes to the right/bottom edge. A
    ``color`` of ``-1`` disables padding entirely.
    """

    def __init__(self, color=255):
        # Fill value handed to ``TF.pad``; ``-1`` means "do not pad".
        self.color = color

    def __call__(self, img):
        """Return ``img`` padded to a square (NumPy arrays are converted to PIL first)."""
        if isinstance(img, np.ndarray):
            img = Image.fromarray(img)

        width, height = img.size
        gap = abs(width - height)

        # Nothing to do when padding is disabled or the image is already square.
        if self.color == -1 or gap == 0:
            return img

        leading = gap // 2
        trailing = leading + gap % 2

        # Border order is (left, top, right, bottom).
        if width < height:
            borders = (leading, 0, trailing, 0)
        else:
            borders = (0, leading, 0, trailing)

        return TF.pad(img, borders, fill=self.color, padding_mode='constant')


class TrendyolDinoV2ImageProcessor(ImageProcessingMixin):
    """
    Hugging Face compatible image processor for TrendyolDinoV2 model.

    Each image goes through: Lanczos downscale to ``downscale_size``, JPEG
    re-encoding at ``jpeg_quality``, bilinear rescale so the longer side is
    ``downscale_size``, padding to a square with ``pad_color``, resize to
    ``input_size`` x ``input_size``, conversion to a tensor and, when
    ``do_normalize`` is true, normalization with ``image_mean``/``image_std``.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        input_size=224,
        downscale_size=332,
        pad_color=255,
        jpeg_quality=75,
        do_normalize=True,
        image_mean=(0.485, 0.456, 0.406),
        image_std=(0.229, 0.224, 0.225),
        **kwargs
    ):
        """Store the preprocessing configuration.

        Args:
            input_size: Side length of the square output image.
            downscale_size: Target length of the longer side for the
                downscale and rescale steps.
            pad_color: Fill value used when padding to a square; ``-1``
                disables padding.
            jpeg_quality: Quality used for the JPEG re-encoding step.
            do_normalize: Whether to apply mean/std normalization.
            image_mean: Per-channel mean used for normalization.
            image_std: Per-channel standard deviation used for normalization.
            **kwargs: Forwarded to ``ImageProcessingMixin.__init__``.
        """
        super().__init__(**kwargs)

        self.input_size = input_size
        self.downscale_size = downscale_size
        self.pad_color = pad_color
        self.jpeg_quality = jpeg_quality
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std

    def _get_preprocess_fn(self):
        """Create the preprocessing pipeline (not stored as attribute to avoid JSON serialization issues)"""
        steps = [
            DownScaleLanczos(self.downscale_size),
            JPEGCompression(self.jpeg_quality),
            ScaleImage(self.downscale_size),
            PadToSquare(self.pad_color),
            transforms.Resize((self.input_size, self.input_size)),
            transforms.ToTensor(),
        ]
        # Honor the ``do_normalize`` flag instead of always normalizing.
        if self.do_normalize:
            steps.append(transforms.Normalize(self.image_mean, self.image_std))
        return transforms.Compose(steps)

    @staticmethod
    def _to_rgb_image(image):
        """Load/convert one supported input (path, ndarray or PIL image) to an RGB PIL image."""
        if isinstance(image, str):
            # ``convert`` reads the pixel data, so the file can be closed
            # as soon as the RGB copy exists.
            with Image.open(image) as opened:
                return opened.convert('RGB')
        if isinstance(image, np.ndarray):
            return Image.fromarray(image).convert('RGB')
        if isinstance(image, Image.Image):
            # Non-RGB PIL inputs (RGBA, P, L, CMYK, ...) would otherwise fail
            # in JPEG encoding or in the 3-channel normalization.
            return image.convert('RGB')
        raise ValueError(f"Unsupported image type: {type(image)}")

    def __call__(
        self,
        images: Union[Image.Image, np.ndarray, torch.Tensor, List[Image.Image], List[np.ndarray], List[torch.Tensor]],
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Preprocess images for the model.

        Args:
            images: A single image or a list/tuple of images. Each image may
                be a file path (``str``), a NumPy array or a PIL image.
                ``torch.Tensor`` inputs are not handled and raise
                ``ValueError``.
            return_tensors: Tensor type forwarded to ``BatchFeature``.
            **kwargs: Accepted for API compatibility; currently unused.

        Returns:
            A ``BatchFeature`` whose ``"pixel_values"`` entry stacks the
            processed images along a new leading batch dimension.

        Raises:
            ValueError: If an image has an unsupported type.
        """
        # Handle single image
        if not isinstance(images, (list, tuple)):
            images = [images]

        # Build the pipeline once and reuse it for every image in the batch.
        preprocess_fn = self._get_preprocess_fn()

        processed_images = [
            preprocess_fn(self._to_rgb_image(image)) for image in images
        ]

        # Stack tensors
        pixel_values = torch.stack(processed_images)

        # Return BatchFeature
        data = {"pixel_values": pixel_values}
        return BatchFeature(data=data, tensor_type=return_tensors)


# Register this processor class with the "AutoImageProcessor" auto class at
# import time.
# NOTE(review): presumably this lets AutoImageProcessor resolve this custom
# class when the repository is loaded with remote code — confirm against the
# transformers `register_for_auto_class` documentation.
TrendyolDinoV2ImageProcessor.register_for_auto_class("AutoImageProcessor")