Update Model

Browse files

Files changed (4) hide show

README.md +61 -2
config.json +195 -6
model.safetensors +3 -0
preprocessor_config.json +34 -0

README.md CHANGED Viewed

@@ -1,6 +1,65 @@
 ---
 license: mit
-pipeline_tag: image-segmentation
 ---
-This repository contains the model described in [Your ViT is Secretly an Image Segmentation Model](https://huggingface.co/papers/2503.19108).

 ---
+library_name: transformers
 license: mit
+tags:
+- vision
+- image-segmentation
+- pytorch
 ---
+# EoMT
+[![PyTorch](https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white)](https://pytorch.org/)
+**EoMT (Encoder-only Mask Transformer)** is a Vision Transformer (ViT) architecture designed for high-quality and efficient image segmentation. It was introduced in the CVPR 2025 highlight paper:
+**[Your ViT is Secretly an Image Segmentation Model](https://www.tue-mps.org/eomt)**
+by Tommie Kerssies, Niccolò Cavagnero, Alexander Hermans, Narges Norouzi, Giuseppe Averta, Bastian Leibe, Gijs Dubbelman, and Daan de Geus.
+> **Key Insight**: Given sufficient scale and pretraining, a plain ViT with only a few additional parameters can perform segmentation without the need for task-specific decoders or pixel fusion modules. The same model backbone supports semantic, instance, and panoptic segmentation with different post-processing 🤗
+The original implementation can be found in this [repository](https://github.com/tue-mps/eomt).
+---
+### How to use
+Here is how to use this model for Instance Segmentation:
+```python
+import matplotlib.pyplot as plt
+import requests
+import torch
+from PIL import Image
+from transformers import EomtForUniversalSegmentation, AutoImageProcessor
+model_id = "yaswanthgali/coco_instance_eomt_large_640-hf"
+processor = AutoImageProcessor.from_pretrained(model_id)
+model = EomtForUniversalSegmentation.from_pretrained(model_id)
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+inputs = processor(
+    images=image,
+    return_tensors="pt",
+)
+with torch.inference_mode():
+    outputs = model(**inputs)
+# Prepare the original image size in the format (height, width)
+original_image_sizes = [(image.height, image.width)]
+# Post-process the model outputs to get the final segmentation prediction
+preds = processor.post_process_instance_segmentation(
+    outputs,
+    original_image_sizes=original_image_sizes,
+)
+# Visualize the segmentation mask
+plt.imshow(preds[0]["segmentation"])
+plt.axis("off")
+plt.title("Instance Segmentation")
+plt.show()
+```

config.json CHANGED Viewed

@@ -1,9 +1,198 @@
 {
-  "model_type": "eomt",
-  "backbone": "vit_large_patch14_reg4_dinov2",
   "image_size": 640,
-  "patch_size": 16,
-  "num_queries": 200,
   "num_blocks": 4,
-  "num_labels": 80
-}

 {
+  "architectures": [
+    "EomtForUniversalSegmentation"
+  ],
+  "attention_dropout": 0.0,
+  "class_weight": 2.0,
+  "dice_weight": 5.0,
+  "drop_path_rate": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3",
+    "4": "LABEL_4",
+    "5": "LABEL_5",
+    "6": "LABEL_6",
+    "7": "LABEL_7",
+    "8": "LABEL_8",
+    "9": "LABEL_9",
+    "10": "LABEL_10",
+    "11": "LABEL_11",
+    "12": "LABEL_12",
+    "13": "LABEL_13",
+    "14": "LABEL_14",
+    "15": "LABEL_15",
+    "16": "LABEL_16",
+    "17": "LABEL_17",
+    "18": "LABEL_18",
+    "19": "LABEL_19",
+    "20": "LABEL_20",
+    "21": "LABEL_21",
+    "22": "LABEL_22",
+    "23": "LABEL_23",
+    "24": "LABEL_24",
+    "25": "LABEL_25",
+    "26": "LABEL_26",
+    "27": "LABEL_27",
+    "28": "LABEL_28",
+    "29": "LABEL_29",
+    "30": "LABEL_30",
+    "31": "LABEL_31",
+    "32": "LABEL_32",
+    "33": "LABEL_33",
+    "34": "LABEL_34",
+    "35": "LABEL_35",
+    "36": "LABEL_36",
+    "37": "LABEL_37",
+    "38": "LABEL_38",
+    "39": "LABEL_39",
+    "40": "LABEL_40",
+    "41": "LABEL_41",
+    "42": "LABEL_42",
+    "43": "LABEL_43",
+    "44": "LABEL_44",
+    "45": "LABEL_45",
+    "46": "LABEL_46",
+    "47": "LABEL_47",
+    "48": "LABEL_48",
+    "49": "LABEL_49",
+    "50": "LABEL_50",
+    "51": "LABEL_51",
+    "52": "LABEL_52",
+    "53": "LABEL_53",
+    "54": "LABEL_54",
+    "55": "LABEL_55",
+    "56": "LABEL_56",
+    "57": "LABEL_57",
+    "58": "LABEL_58",
+    "59": "LABEL_59",
+    "60": "LABEL_60",
+    "61": "LABEL_61",
+    "62": "LABEL_62",
+    "63": "LABEL_63",
+    "64": "LABEL_64",
+    "65": "LABEL_65",
+    "66": "LABEL_66",
+    "67": "LABEL_67",
+    "68": "LABEL_68",
+    "69": "LABEL_69",
+    "70": "LABEL_70",
+    "71": "LABEL_71",
+    "72": "LABEL_72",
+    "73": "LABEL_73",
+    "74": "LABEL_74",
+    "75": "LABEL_75",
+    "76": "LABEL_76",
+    "77": "LABEL_77",
+    "78": "LABEL_78",
+    "79": "LABEL_79"
+  },
   "image_size": 640,
+  "importance_sample_ratio": 0.75,
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_10": 10,
+    "LABEL_11": 11,
+    "LABEL_12": 12,
+    "LABEL_13": 13,
+    "LABEL_14": 14,
+    "LABEL_15": 15,
+    "LABEL_16": 16,
+    "LABEL_17": 17,
+    "LABEL_18": 18,
+    "LABEL_19": 19,
+    "LABEL_2": 2,
+    "LABEL_20": 20,
+    "LABEL_21": 21,
+    "LABEL_22": 22,
+    "LABEL_23": 23,
+    "LABEL_24": 24,
+    "LABEL_25": 25,
+    "LABEL_26": 26,
+    "LABEL_27": 27,
+    "LABEL_28": 28,
+    "LABEL_29": 29,
+    "LABEL_3": 3,
+    "LABEL_30": 30,
+    "LABEL_31": 31,
+    "LABEL_32": 32,
+    "LABEL_33": 33,
+    "LABEL_34": 34,
+    "LABEL_35": 35,
+    "LABEL_36": 36,
+    "LABEL_37": 37,
+    "LABEL_38": 38,
+    "LABEL_39": 39,
+    "LABEL_4": 4,
+    "LABEL_40": 40,
+    "LABEL_41": 41,
+    "LABEL_42": 42,
+    "LABEL_43": 43,
+    "LABEL_44": 44,
+    "LABEL_45": 45,
+    "LABEL_46": 46,
+    "LABEL_47": 47,
+    "LABEL_48": 48,
+    "LABEL_49": 49,
+    "LABEL_5": 5,
+    "LABEL_50": 50,
+    "LABEL_51": 51,
+    "LABEL_52": 52,
+    "LABEL_53": 53,
+    "LABEL_54": 54,
+    "LABEL_55": 55,
+    "LABEL_56": 56,
+    "LABEL_57": 57,
+    "LABEL_58": 58,
+    "LABEL_59": 59,
+    "LABEL_6": 6,
+    "LABEL_60": 60,
+    "LABEL_61": 61,
+    "LABEL_62": 62,
+    "LABEL_63": 63,
+    "LABEL_64": 64,
+    "LABEL_65": 65,
+    "LABEL_66": 66,
+    "LABEL_67": 67,
+    "LABEL_68": 68,
+    "LABEL_69": 69,
+    "LABEL_7": 7,
+    "LABEL_70": 70,
+    "LABEL_71": 71,
+    "LABEL_72": 72,
+    "LABEL_73": 73,
+    "LABEL_74": 74,
+    "LABEL_75": 75,
+    "LABEL_76": 76,
+    "LABEL_77": 77,
+    "LABEL_78": 78,
+    "LABEL_79": 79,
+    "LABEL_8": 8,
+    "LABEL_9": 9
+  },
+  "layer_norm_eps": 1e-06,
+  "layerscale_value": 1e-05,
+  "mask_weight": 5.0,
+  "mlp_ratio": 4,
+  "model_type": "eomt",
+  "no_object_weight": 0.1,
+  "num_attention_heads": 16,
   "num_blocks": 4,
+  "num_channels": 3,
+  "num_hidden_layers": 24,
+  "num_queries": 200,
+  "num_register_tokens": 4,
+  "num_upscale_blocks": 2,
+  "oversample_ratio": 3.0,
+  "patch_size": 16,
+  "torch_dtype": "float32",
+  "train_num_points": 12544,
+  "transformers_version": "4.53.0.dev0",
+  "use_swiglu_ffn": false
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8fa9bee24c6a8b41bc98ec3097aa0ab0f758142c57b6a35e5172da62bcc6115
+size 1266611816

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": false,
+  "device": null,
+  "disable_grouping": null,
+  "do_center_crop": null,
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_pad": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_split_image": false,
+  "ignore_index": null,
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "EomtImageProcessorFast",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "input_data_format": null,
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
+  "size": {
+    "longest_edge": 640,
+    "shortest_edge": 640
+  }
+}