{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "gray and black", "1": "screen", "2": "wedding", "3": "white and black", "4": "blue", "5": "brick", "6": "many", "7": "yes", "8": "security", "9": "bus", "10": "9:35", "11": "in car", "12": "air", "13": "bedroom", "14": "donut", "15": "window", "16": "plastic", "17": "black and white", "18": "fashion", "19": "purple", "20": "on road", "21": "table", "22": "exit", "23": "leather", "24": "out", "25": "not sure", "26": "birthday", "27": "station", "28": "chopsticks", "29": "girl", "30": "trees", "31": "canopy", "32": "not there", "33": "queen", "34": "beige", "35": "soccer ball", "36": "3", "37": "photographer", "38": "sidewalk", "39": "snow", "40": "cage", "41": "talking on phone", "42": "snowboard", "43": "lg", "44": "bikes", "45": "picnic table", "46": "talking", "47": "walking", "48": "pink", "49": "park", "50": "right", "51": "tired", "52": "giraffe", "53": "blue and white", "54": "curtains", "55": "style", "56": "ball", "57": "name tag", "58": "train", "59": "green", "60": "hat", "61": "bicycles", "62": "king", "63": "door", "64": "solid", "65": "person", "66": "hair", "67": "human", "68": "tent", "69": "laying down", "70": "low", "71": "resting", "72": "camera", "73": "rack", "74": "zoo", "75": "cross", "76": "unknown", "77": "clock tower", "78": "6", "79": "watching", "80": "monitor", "81": "wine", "82": "they aren't", "83": "skateboard", "84": "black", "85": "dirt", "86": "red and blue", "87": "protection", "88": "stand", "89": "suv", "90": "calico", "91": "man", "92": "ground", "93": "bricks", "94": "skiing", "95": "roof", "96": "little girl", "97": "red", "98": "women", "99": "red and yellow", "100": "chair", "101": "wine tasting", "102": "7", "103": "backpack", "104": "10", "105": "lying down", "106": "sky", "107": "plain", "108": "nothing", "109": "big ben", "110": "plate", "111": "7:45", "112": "double", "113": "clear", "114": "8", "115": "street", "116": "neon", "117": "dog", "118": "skateboarding", "119": "woods", "120": "4", "121": "boy", "122": "0", "123": "stripes", "124": "curtain", "125": "french", "126": "doughnut", "127": "no", "128": "2010", "129": "5", "130": "africa", "131": "tabby", "132": "giraffes", "133": "small", "134": "necklace", "135": "shadows", "136": "arrow", "137": "gray", "138": "clock", "139": "tan", "140": "tv", "141": "2000", "142": "cup", "143": "2013", "144": "happy", "145": "yellow", "146": "blonde", "147": "tower", "148": "bike rack", "149": "bicycle", "150": "crown", "151": "desert", "152": "crossing", "153": "brown", "154": "skier", "155": "on street", "156": "windows", "157": "shade", "158": "car", "159": "jeep", "160": "natural", "161": "smile", "162": "lady", "163": "large", "164": "at table", "165": "2", "166": "shadow", "167": "sun", "168": "snowboarding", "169": "net", "170": "orange", "171": "smiling", "172": "7:35", "173": "white", "174": "forest", "175": "shrimp", "176": "beagle", "177": "cat", "178": "outside", "179": "lanyard", "180": "snowboarder", "181": "fence", "182": "down", "183": "church", "184": "cloudy", "185": "white and blue", "186": "wall", "187": "don't know", "188": "sleeping", "189": "soccer", "190": "hawaii", "191": "woman", "192": "1", "193": "ice cream", "194": "8:35", "195": "shelter", "196": "full", "197": "can't tell", "198": "platform" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 122, "1": 192, "10": 104, "2": 165, "2000": 141, "2010": 128, "2013": 143, "3": 36, "4": 120, "5": 129, "6": 78, "7": 102, "7:35": 172, "7:45": 111, "8": 114, "8:35": 194, "9:35": 10, "africa": 130, "air": 12, "arrow": 136, "at table": 164, "backpack": 103, "ball": 56, "beagle": 176, "bedroom": 13, "beige": 34, "bicycle": 149, "bicycles": 61, "big ben": 109, "bike rack": 148, "bikes": 44, "birthday": 26, "black": 84, "black and white": 17, "blonde": 146, "blue": 4, "blue and white": 53, "boy": 121, "brick": 5, "bricks": 93, "brown": 153, "bus": 9, "cage": 40, "calico": 90, "camera": 72, "can't tell": 197, "canopy": 31, "car": 158, "cat": 177, "chair": 100, "chopsticks": 28, "church": 183, "clear": 113, "clock": 138, "clock tower": 77, "cloudy": 184, "cross": 75, "crossing": 152, "crown": 150, "cup": 142, "curtain": 124, "curtains": 54, "desert": 151, "dirt": 85, "dog": 117, "don't know": 187, "donut": 14, "door": 63, "double": 112, "doughnut": 126, "down": 182, "exit": 22, "fashion": 18, "fence": 181, "forest": 174, "french": 125, "full": 196, "giraffe": 52, "giraffes": 132, "girl": 29, "gray": 137, "gray and black": 0, "green": 59, "ground": 92, "hair": 66, "happy": 144, "hat": 60, "hawaii": 190, "human": 67, "ice cream": 193, "in car": 11, "jeep": 159, "king": 62, "lady": 162, "lanyard": 179, "large": 163, "laying down": 69, "leather": 23, "lg": 43, "little girl": 96, "low": 70, "lying down": 105, "man": 91, "many": 6, "monitor": 80, "name tag": 57, "natural": 160, "necklace": 134, "neon": 116, "net": 169, "no": 127, "not sure": 25, "not there": 32, "nothing": 108, "on road": 20, "on street": 155, "orange": 170, "out": 24, "outside": 178, "park": 49, "person": 65, "photographer": 37, "picnic table": 45, "pink": 48, "plain": 107, "plastic": 16, "plate": 110, "platform": 198, "protection": 87, "purple": 19, "queen": 33, "rack": 73, "red": 97, "red and blue": 86, "red and yellow": 99, "resting": 71, "right": 50, "roof": 95, "screen": 1, "security": 8, "shade": 157, "shadow": 166, "shadows": 135, "shelter": 195, "shrimp": 175, "sidewalk": 38, "skateboard": 83, "skateboarding": 118, "skier": 154, "skiing": 94, "sky": 106, "sleeping": 188, "small": 133, "smile": 161, "smiling": 171, "snow": 39, "snowboard": 42, "snowboarder": 180, "snowboarding": 168, "soccer": 189, "soccer ball": 35, "solid": 64, "stand": 88, "station": 27, "street": 115, "stripes": 123, "style": 55, "sun": 167, "suv": 89, "tabby": 131, "table": 21, "talking": 46, "talking on phone": 41, "tan": 139, "tent": 68, "they aren't": 82, "tired": 51, "tower": 147, "train": 58, "trees": 30, "tv": 140, "unknown": 76, "walking": 47, "wall": 186, "watching": 79, "wedding": 2, "white": 173, "white and black": 3, "white and blue": 185, "window": 15, "windows": 156, "wine": 81, "wine tasting": 101, "woman": 191, "women": 98, "woods": 119, "yellow": 145, "yes": 7, "zoo": 74 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.48.3", "type_vocab_size": 2, "vocab_size": 30522 }