Upload Moondream

Browse files

Files changed (4) hide show

generation_config.json +2 -0
model.safetensors +2 -2
moondream.py +2 -1
vision_encoder.py +122 -25

generation_config.json CHANGED Viewed

@@ -1,4 +1,6 @@
 {
   "_from_model_config": true,
   "transformers_version": "4.38.2"
 }

 {
   "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
   "transformers_version": "4.38.2"
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b645a86282dbc3ce41a45acc9ec98e54a4cc3939ef32ca84591ca56046b0fed1
-size 3715037856

 version https://git-lfs.github.com/spec/v1
+oid sha256:7840817a7015edf729fa3d60099c35f08fc30511a1dc8ea231acd0e9a6555bb8
+size 3733912224

moondream.py CHANGED Viewed

@@ -29,7 +29,8 @@ class Moondream(PreTrainedModel):
         return self.text_model.device
     def encode_image(self, image):
-        return self.vision_encoder(image)
     def input_embeds(self, prompt, image_embeds, tokenizer):
         def _tokenize(txt):

         return self.text_model.device
     def encode_image(self, image):
+        with torch.no_grad():
+            return self.vision_encoder(image)
     def input_embeds(self, prompt, image_embeds, tokenizer):
         def _tokenize(txt):

vision_encoder.py CHANGED Viewed

@@ -1,7 +1,11 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
 from einops import rearrange
 from torchvision.transforms.v2 import (
     Compose,
     Resize,
@@ -172,7 +176,7 @@ class VisionProjection(nn.Module):
         model_dim = 2048
         hidden_dim = model_dim * 4
-        self.mlp = MLP(image_embedding_dim, hidden_dim, model_dim)
     @property
     def device(self):
@@ -182,6 +186,26 @@ class VisionProjection(nn.Module):
         return self.mlp(x)
 class VisionEncoder(nn.Module):
     def __init__(self, use_flash_attn=False):
@@ -189,15 +213,7 @@ class VisionEncoder(nn.Module):
         self.encoder = EncoderWrapper(use_flash_attn)
         self.projection = VisionProjection()
-        self.preprocess = Compose(
-            [
-                Resize(size=(378, 378), interpolation=InterpolationMode.BICUBIC),
-                ToImage(),
-                ToDtype(torch.float32, scale=True),
-                Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
-            ]
-        )
     @property
     def device(self):
@@ -207,22 +223,103 @@ class VisionEncoder(nn.Module):
     def dtype(self):
         return self.projection.mlp.fc1.weight.dtype
-    def __call__(self, images) -> torch.Tensor:
-        if not isinstance(images, list) and not isinstance(images, torch.Tensor):
-            images = [images]
-        with torch.no_grad():
-            # Skip preprocess if images are already tensors
-            if not isinstance(images, torch.Tensor) and not isinstance(
-                images[0], torch.Tensor
-            ):
-                images = [self.preprocess(image.convert("RGB")) for image in images]
-            if isinstance(images, list):
-                images = torch.stack(images)
-            x = images.to(self.device, dtype=self.dtype)
-            x = self.encoder(x)
-            x = self.projection(x)
-            return x

+from typing import Union
+import PIL.Image
 import torch
 import torch.nn.functional as F
 from torch import nn
 from einops import rearrange
+import PIL
 from torchvision.transforms.v2 import (
     Compose,
     Resize,
         model_dim = 2048
         hidden_dim = model_dim * 4
+        self.mlp = MLP(image_embedding_dim * 2, hidden_dim, model_dim)
     @property
     def device(self):
         return self.mlp(x)
def create_patches(image, patch_size=(378, 378)):
    """Split a CHW image tensor into a grid of patches, grouped by row.

    The image is walked top-to-bottom in steps of the patch height and
    left-to-right in steps of the patch width. The patches of each grid row
    are stacked into a single tensor.

    Slicing does not pad: when the height or width is not a multiple of the
    patch size, the trailing patches are smaller, and ``torch.stack`` rejects
    a row whose patches differ in shape.

    Args:
        image: 3-D tensor laid out as (channels, height, width).
        patch_size: ``(patch_height, patch_width)`` of each patch.

    Returns:
        An empty list when the image is exactly one patch in size (there is
        nothing to split). Otherwise a list with one tensor per grid row,
        each holding that row's patches stacked along a new leading axis.

    Raises:
        ValueError: If ``image`` is not 3-dimensional.
    """
    # Explicit raise instead of `assert`: asserts vanish under `python -O`.
    if image.dim() != 3:
        raise ValueError("Image must be in CHW format")

    _, height, width = image.shape  # Channels, Height, Width
    patch_height, patch_width = patch_size

    if height == patch_height and width == patch_width:
        return []

    patches = []
    for top in range(0, height, patch_height):
        row_patches = [
            image[:, top : top + patch_height, left : left + patch_width]
            for left in range(0, width, patch_width)
        ]
        patches.append(torch.stack(row_patches))
    return patches
 class VisionEncoder(nn.Module):
     def __init__(self, use_flash_attn=False):
         self.encoder = EncoderWrapper(use_flash_attn)
         self.projection = VisionProjection()
+        self.supported_sizes = [(378, 378), (378, 756), (756, 378), (756, 756)]
     @property
     def device(self):
     def dtype(self):
         return self.projection.mlp.fc1.weight.dtype
+    def preprocess(self, image: PIL.Image.Image):
+        width, height = image.size
+        max_dim = max(width, height)
+        if max_dim < 512:
+            im_size = (378, 378)
+        else:
+            aspect_ratio = width / height
+            im_size = min(
+                self.supported_sizes,
+                key=lambda size: (
+                    abs((size[1] / size[0]) - aspect_ratio),
+                    abs(size[0] - width) + abs(size[1] - height),
+                ),
+            )
+        return Compose(
+            [
+                Resize(size=im_size, interpolation=InterpolationMode.BICUBIC),
+                ToImage(),
+                ToDtype(torch.float32, scale=True),
+                Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+            ]
+        )(image)
+    def forward(
+        self, images: Union[PIL.Image.Image, list[PIL.Image.Image], torch.Tensor]
+    ) -> torch.Tensor:
+        im_list = None
+        if isinstance(images, torch.Tensor):
+            # Input must have dimensions (B, C, H, W)
+            assert (
+                len(images.shape) == 4
+            ), "Tensor input must have dimensions (B, C, H, W)"
+            im_list = list(images)
+        elif isinstance(images, PIL.Image.Image):
+            im_list = [images]
+        elif isinstance(images, list):
+            im_list = images
+        else:
+            raise ValueError(
+                "Input must be a PIL image, list of PIL images, or a tensor"
+            )
+        # Preprocess unless the images are already tensors (indicating that
+        # they have already been preprocessed)
+        if not isinstance(im_list[0], torch.Tensor):
+            im_list = [self.preprocess(im.convert("RGB")) for im in im_list]
+        patches = [create_patches(im) for im in im_list]
+        flat_patches = [patch for image_patches in patches for patch in image_patches]
+        # Images may be variable size, and need to be resized to a common size after
+        # creating patches.
+        resized_images = [
+            F.interpolate(im.unsqueeze(0), size=(378, 378), mode="bilinear")
+            for im in im_list
+        ]
+        combined_images = torch.cat([*resized_images, *flat_patches], dim=0)
+        combined_images = combined_images.to(self.device, dtype=self.dtype)
+        combined_features = self.encoder(combined_images)
+        full_img_features = combined_features[: len(im_list)]
+        patch_features = (
+            combined_features[len(im_list) :].transpose(1, 2).view(-1, 1152, 27, 27)
+        )
+        # Reshape patch features back to their original structure
+        reshaped_patch_features = []
+        patch_idx = 0
+        for i, patch_set in enumerate(patches):
+            if len(patch_set) == 0:
+                reshaped_patch_features.append(
+                    full_img_features[i].transpose(0, 1).view(1152, 27, 27)
+                )
+            else:
+                sample_features = []
+                for row_patches in patch_set:
+                    row_len = len(row_patches)
+                    row_features = patch_features[
+                        patch_idx : patch_idx + row_len
+                    ]  # row_len, T, C
+                    row_features = torch.cat(
+                        list(row_features), dim=2
+                    )  # T, C * row_len
+                    patch_idx += row_len
+                    sample_features.append(row_features)
+                sample_features = torch.cat(sample_features, dim=1)
+                sample_features = F.interpolate(
+                    sample_features.unsqueeze(0), size=(27, 27), mode="bilinear"
+                ).squeeze(0)
+                reshaped_patch_features.append(sample_features)
+        reshaped_patch_features = (
+            torch.stack(reshaped_patch_features).view(-1, 1152, 729).transpose(1, 2)
+        )
+        final_features = torch.cat([full_img_features, reshaped_patch_features], dim=2)
+        return self.projection(final_features)