# rf-detr-base / image_processing_rf_detr.py
from typing import Dict, List, Tuple, Optional, Literal
import torch
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from torchvision.transforms import ToTensor, Normalize
from rfdetr.util.misc import nested_tensor_from_tensor_list
from rfdetr.models.lwdetr import PostProcess
class RFDetrImageProcessor(BaseImageProcessor):
model_input_names = ["pixel_values", "pixel_mask"]
def __init__(
self,
        model_name: Literal['RFDETRBase', 'RFDETRLarge'] = 'RFDETRBase',
        num_select: int = 300,
        image_mean: List[float] = [0.485, 0.456, 0.406],
        image_std: List[float] = [0.229, 0.224, 0.225],
**kwargs
):
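        """
        Image processor for RF-DETR checkpoints.

        Parameters
        ----------
        model_name: Literal['RFDETRBase', 'RFDETRLarge']
            which RF-DETR variant this processor is paired with
        num_select: int
            number of top-scoring predictions kept by the post-processor
        image_mean, image_std: List[float]
            per-channel normalization statistics (ImageNet defaults)
        """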
super().__init__(**kwargs)
self.model_name = model_name
self.config = {
'image_mean': image_mean,
'image_std': image_std,
}
self.post_process_config = {
'num_select': num_select,
}
def post_process_object_detection(
self,
outputs,
        target_sizes: torch.Tensor,
**kwargs
) -> List[Dict[str, torch.Tensor]]:
"""
Parameters
----------
        outputs:
            outputs from a model loaded with AutoModelForObjectDetection, or the raw list of
            outputs from an ONNX model
        target_sizes: torch.Tensor
            tensor of shape [batch_size, 2] with the original (height, width) of each image,
            as returned by `preprocess`; used to rescale the predicted boxes
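
        Example (a minimal sketch; assumes `processor` is an instance of this class and
        `inputs` was produced by `preprocess` -- variable names are illustrative)::

            outputs = model(pixel_values=inputs['pixel_values'], pixel_mask=inputs['pixel_mask'])
            detections = processor.post_process_object_detection(
                outputs, target_sizes=inputs['target_sizes'],
            )
            # each entry in `detections` holds 'scores', 'labels' and 'boxes'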
"""
        if isinstance(outputs, list):  # raw ONNX outputs: [logits, pred_boxes]
logits = torch.tensor(outputs[0])
pred_boxes = torch.tensor(outputs[1])
else:
logits = outputs.logits
pred_boxes = outputs.pred_boxes
outputs = {
'pred_logits': logits,
'pred_boxes': pred_boxes,
}
# using rfdetr's postprocess class
post_process = PostProcess(self.post_process_config['num_select'])
detections = post_process(
outputs,
target_sizes=target_sizes,
)
return detections
    def convert_and_validate_boxes(self, annotations, images):
        for ann, img in zip(annotations, images):
            boxes = ann["boxes"]
            # validate before converting, so a malformed tensor fails with a clear message
            torch._assert(isinstance(boxes, torch.Tensor), "Expected target boxes to be of type Tensor.")
            torch._assert(
                len(boxes.shape) == 2 and boxes.shape[-1] == 4,
                "Expected target boxes to be a tensor of shape [N, 4].",
            )
            # convert from COCO format [x_min, y_min, width, height] to center format [cx, cy, w, h],
            # e.g. [10., 20., 4., 6.] -> [12., 23., 4., 6.]
            boxes = boxes.to(torch.float32)
            boxes[:, [0, 1]] += boxes[:, [2, 3]] / 2
            ann["boxes"] = boxes
            # every box must lie fully inside the image; img is CHW, so H = img.shape[1], W = img.shape[2]
            for box in boxes:
                torch._assert(
                    box[2] / 2 <= box[0] <= img.shape[2] - box[2] / 2
                    and box[3] / 2 <= box[1] <= img.shape[1] - box[3] / 2,
                    "Expected w/2 <= cx <= W - w/2 and h/2 <= cy <= H - h/2.",
                )
def preprocess(
self,
images,
annotations=None,
) -> BatchFeature:
"""
Parameters
----------
        images: Union[PIL.Image.Image, List[PIL.Image.Image]]
            a single PIL image or a list of PIL images
        annotations: Optional[List[Dict[str, torch.Tensor]]]
            list of annotations associated with the image or batch of images. For object
            detection, each annotation should be a dictionary with the following keys:
            - boxes (FloatTensor[N, 4]): the ground-truth boxes in COCO format [x_min, y_min, width, height]
            - class_labels (Int64Tensor[N]): the class label for each ground-truth box
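
        Example (a minimal sketch; assumes `processor` is an instance of this class and
        `img` is any PIL image large enough to contain the box)::

            inputs = processor.preprocess(
                [img],
                annotations=[{'boxes': torch.tensor([[10., 20., 30., 40.]]),
                              'class_labels': torch.tensor([0])}],
            )
            # inputs['pixel_values'] has shape [B, C, H, W]; inputs['pixel_mask'] has shape [B, H, W]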
"""
totensor = ToTensor()
normalize = Normalize(mean=self.config['image_mean'], std=self.config['image_std'])
        if images is not None and not isinstance(images, list):
            images = [images]  # wrap a single image into a batch of one
if not isinstance(images[0], torch.Tensor):
images = [totensor(img) for img in images]
if annotations is not None:
self.convert_and_validate_boxes(annotations, images)
# get the original image sizes
original_image_sizes: List[Tuple[int, int]] = []
for img in images:
val = img.shape[-2:]
torch._assert(
len(val) == 2,
f"expecting the last two dimensions of the Tensor to be H and W instead got {img.shape[-2:]}",
)
original_image_sizes.append((val[0], val[1]))
target_sizes = torch.tensor(original_image_sizes)
# transform the input
# normalize image
images = [normalize(img) for img in images]
        # pad the images to a common size: pixel_values has shape [B, C, H, W], pixel_mask has shape [B, H, W]
nested_tensor = nested_tensor_from_tensor_list(images)
data = {
'pixel_values': nested_tensor.tensors,
'pixel_mask': nested_tensor.mask,
'target_sizes': target_sizes,
'labels': annotations
}
return BatchFeature(data=data)
__all__ = [
"RFDetrImageProcessor"
]
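
# --- Usage sketch (illustrative, not executed on import) ---
# A minimal end-to-end example under the following assumptions: the processor and a
# matching RF-DETR checkpoint are loadable from the Hugging Face Hub with
# trust_remote_code=True, and "<repo-id>" below is a placeholder for the actual repo.
#
#     from PIL import Image
#     from transformers import AutoImageProcessor, AutoModelForObjectDetection
#
#     processor = AutoImageProcessor.from_pretrained("<repo-id>", trust_remote_code=True)
#     model = AutoModelForObjectDetection.from_pretrained("<repo-id>", trust_remote_code=True)
#
#     img = Image.open("example.jpg")
#     inputs = processor.preprocess([img])
#     outputs = model(pixel_values=inputs["pixel_values"], pixel_mask=inputs["pixel_mask"])
#     detections = processor.post_process_object_detection(outputs, target_sizes=inputs["target_sizes"])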