Upload image_preprocessing_molmo.py with huggingface_hub
image_preprocessing_molmo.py  (+38 -59)

CHANGED
@@ -15,36 +15,13 @@ from transformers.image_utils import (
     is_valid_image,
 )
 from transformers.processing_utils import ImagesKwargs
-from transformers.image_processing_utils import BaseImageProcessor
-from transformers.utils import
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.utils import logging


 logger = logging.get_logger(__name__)


-def make_batched_images(images) -> List[List[ImageInput]]:
-    """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-
-    Args:
-        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
-            The input image.
-
-    Returns:
-        list: A list of images.
-    """
-    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
-        return [img for img_list in images for img in img_list]
-
-    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
-        return images
-
-    elif is_valid_image(images):
-        return [images]
-
-    raise ValueError(f"Could not make batched images from {images}")
-
-
 def pad_to_bounding_box(
     image, offset_height, offset_width, target_height,
     target_width, value=0

@@ -68,7 +45,7 @@ def normalize_image(image, offset, scale):
 def resize_and_pad(
     image,
     desired_output_size,
-    resize_method=
+    resize_method="torch-bilinear",
     pad_value=0,
     normalize=True,
     image_mean=OPENAI_CLIP_MEAN,

@@ -85,26 +62,29 @@ def resize_and_pad(
     scaled_height = int(np.array(height, np.float32) * image_scale)
     scaled_width = int(np.array(width, np.float32) * image_scale)

-    image
-    …
+    if resize_method == "tensorflow":
+        # This is how the original training code did resizing; it can produce slightly different
+        # results than using torch resize, so we keep it just in case
+        import tensorflow as tf
+        image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
+        image = tf.image.resize(
+            image,
+            [scaled_height, scaled_width],
+            method=tf.image.ResizeMethod.BILINEAR,
+            antialias=True,
+        )
+        image = tf.clip_by_value(image, 0.0, 1.0)
+        image = image.numpy()
+    elif resize_method == "torch-bilinear":
+        image = torch.permute(torch.from_numpy(image), [2, 0, 1])
+        image = convert_image_dtype(image)  # resize in float32 to match the training code
+        image = torchvision.transforms.Resize(
+            [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
+        )(image)
+        image = torch.clip(image, 0.0, 1.0)
+        image = torch.permute(image, [1, 2, 0]).numpy()
+    else:
+        raise NotImplementedError(resize_method)

     top_pad = (desired_height - scaled_height) // 2
     left_pad = (desired_width - scaled_width) // 2

@@ -201,18 +181,6 @@ class MolmoImageProcessor(BaseImageProcessor):
         image_token_length_h: Optional[int] = None,
         image_patch_size: Optional[int] = None,
     ):
-        """Preprocesses an image
-
-        Returns:
-            crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
-                change between images but the other dimension are fixed
-            tokens: (n_tokens,) int32 tokens, pad tokens indicating where to insert the
-                patch features, might include other special tokens as well
-            patch_ordering: (n_crops, n_tokens_per_crop) order image features should be inserted
-                into the `tokens`, negative values indicates patches features to exclude
-            padding_mask: (n_crops, n_patches) what percent of each crop is padding, be None
-                if the image mask is not being used.
-        """
         if isinstance(base_image_input_size, int):
             base_image_input_size = (base_image_input_size, base_image_input_size)


@@ -438,7 +406,18 @@ class MolmoImageProcessor(BaseImageProcessor):
         image_patch_size: Optional[int] = None,
         **kwargs,
     ):
-        """Preprocesses
+        """Preprocesses an image
+
+        Returns:
+            crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
+                change between images but the other dimensions are fixed
+            tokens: (n_tokens,) int32 tokens, pad tokens indicate where to insert the
+                patch features, might include other special tokens as well
+            image_idx: (n_crops, n_patches) index in `tokens` to put the patch features from the
+                crops after pooling, negative values indicate patch features to exclude
+            padding_mask: (n_crops, n_patches) what percent of each crop is padding, can be None
+                if the image mask is not being used.
+        """

         max_crops = max_crops or self.max_crops
         overlap_margins = overlap_margins or self.overlap_margins