{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "cage", "1": "not sure", "2": "blue and white", "3": "no", "4": "yellow", "5": "shadow", "6": "orange", "7": "stand", "8": "name tag", "9": "7:45", "10": "don't know", "11": "plastic", "12": "white and blue", "13": "soccer ball", "14": "queen", "15": "wedding", "16": "woods", "17": "black", "18": "8", "19": "in car", "20": "watching", "21": "walking", "22": "double", "23": "smile", "24": "red and blue", "25": "birthday", "26": "clear", "27": "9:35", "28": "crossing", "29": "blonde", "30": "3", "31": "human", "32": "man", "33": "protection", "34": "suv", "35": "hair", "36": "5", "37": "camera", "38": "smiling", "39": "cross", "40": "red and yellow", "41": "beige", "42": "fence", "43": "red", "44": "2013", "45": "lady", "46": "yes", "47": "4", "48": "leather", "49": "2010", "50": "natural", "51": "tv", "52": "blue", "53": "wine tasting", "54": "7", "55": "7:35", "56": "picnic table", "57": "church", "58": "bike rack", "59": "clock tower", "60": "person", "61": "large", "62": "street", "63": "calico", "64": "lanyard", "65": "pink", "66": "tired", "67": "necklace", "68": "giraffes", "69": "full", "70": "roof", "71": "skiing", "72": "tan", "73": "window", "74": "boy", "75": "forest", "76": "fashion", "77": "nothing", "78": "sidewalk", "79": "brick", "80": "canopy", "81": "air", "82": "0", "83": "little girl", "84": "door", "85": "bicycles", "86": "platform", "87": "chopsticks", "88": "table", "89": "green", "90": "dirt", "91": "hat", "92": "women", "93": "white", "94": "wall", "95": "2", "96": "ground", "97": "laying down", "98": "net", "99": "curtain", "100": "bricks", "101": "rack", "102": "sun", "103": "small", "104": "tabby", "105": "tent", "106": "on road", "107": "lg", "108": "photographer", "109": "jeep", "110": "shrimp", "111": "doughnut", "112": "purple", "113": "security", "114": "crown", "115": "desert", "116": "giraffe", "117": "skier", "118": "chair", "119": "solid", "120": "wine", "121": "shade", "122": "dog", "123": "snowboarding", "124": "plate", "125": "french", "126": "6", "127": "white and black", "128": "ball", "129": "many", "130": "lying down", "131": "out", "132": "resting", "133": "they aren't", "134": "skateboard", "135": "soccer", "136": "brown", "137": "gray", "138": "right", "139": "hawaii", "140": "cup", "141": "backpack", "142": "beagle", "143": "snowboard", "144": "woman", "145": "neon", "146": "stripes", "147": "black and white", "148": "curtains", "149": "big ben", "150": "plain", "151": "zoo", "152": "windows", "153": "train", "154": "shelter", "155": "style", "156": "2000", "157": "screen", "158": "talking on phone", "159": "exit", "160": "outside", "161": "at table", "162": "1", "163": "park", "164": "sleeping", "165": "on street", "166": "snow", "167": "low", "168": "bikes", "169": "shadows", "170": "unknown", "171": "ice cream", "172": "snowboarder", "173": "10", "174": "bedroom", "175": "arrow", "176": "talking", "177": "down", "178": "station", "179": "not there", "180": "bicycle", "181": "happy", "182": "cloudy", "183": "girl", "184": "tower", "185": "bus", "186": "can't tell", "187": "8:35", "188": "car", "189": "sky", "190": "king", "191": "donut", "192": "clock", "193": "gray and black", "194": "monitor", "195": "africa", "196": "trees", "197": "cat", "198": "skateboarding" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 82, "1": 162, "10": 173, "2": 95, "2000": 156, "2010": 49, "2013": 44, "3": 30, "4": 47, "5": 36, "6": 126, "7": 54, "7:35": 55, "7:45": 9, "8": 18, "8:35": 187, "9:35": 27, "africa": 195, "air": 81, "arrow": 175, "at table": 161, "backpack": 141, "ball": 128, "beagle": 142, "bedroom": 174, "beige": 41, "bicycle": 180, "bicycles": 85, "big ben": 149, "bike rack": 58, "bikes": 168, "birthday": 25, "black": 17, "black and white": 147, "blonde": 29, "blue": 52, "blue and white": 2, "boy": 74, "brick": 79, "bricks": 100, "brown": 136, "bus": 185, "cage": 0, "calico": 63, "camera": 37, "can't tell": 186, "canopy": 80, "car": 188, "cat": 197, "chair": 118, "chopsticks": 87, "church": 57, "clear": 26, "clock": 192, "clock tower": 59, "cloudy": 182, "cross": 39, "crossing": 28, "crown": 114, "cup": 140, "curtain": 99, "curtains": 148, "desert": 115, "dirt": 90, "dog": 122, "don't know": 10, "donut": 191, "door": 84, "double": 22, "doughnut": 111, "down": 177, "exit": 159, "fashion": 76, "fence": 42, "forest": 75, "french": 125, "full": 69, "giraffe": 116, "giraffes": 68, "girl": 183, "gray": 137, "gray and black": 193, "green": 89, "ground": 96, "hair": 35, "happy": 181, "hat": 91, "hawaii": 139, "human": 31, "ice cream": 171, "in car": 19, "jeep": 109, "king": 190, "lady": 45, "lanyard": 64, "large": 61, "laying down": 97, "leather": 48, "lg": 107, "little girl": 83, "low": 167, "lying down": 130, "man": 32, "many": 129, "monitor": 194, "name tag": 8, "natural": 50, "necklace": 67, "neon": 145, "net": 98, "no": 3, "not sure": 1, "not there": 179, "nothing": 77, "on road": 106, "on street": 165, "orange": 6, "out": 131, "outside": 160, "park": 163, "person": 60, "photographer": 108, "picnic table": 56, "pink": 65, "plain": 150, "plastic": 11, "plate": 124, "platform": 86, "protection": 33, "purple": 112, "queen": 14, "rack": 101, "red": 43, "red and blue": 24, "red and yellow": 40, "resting": 132, "right": 138, "roof": 70, "screen": 157, "security": 113, "shade": 121, "shadow": 5, "shadows": 169, "shelter": 154, "shrimp": 110, "sidewalk": 78, "skateboard": 134, "skateboarding": 198, "skier": 117, "skiing": 71, "sky": 189, "sleeping": 164, "small": 103, "smile": 23, "smiling": 38, "snow": 166, "snowboard": 143, "snowboarder": 172, "snowboarding": 123, "soccer": 135, "soccer ball": 13, "solid": 119, "stand": 7, "station": 178, "street": 62, "stripes": 146, "style": 155, "sun": 102, "suv": 34, "tabby": 104, "table": 88, "talking": 176, "talking on phone": 158, "tan": 72, "tent": 105, "they aren't": 133, "tired": 66, "tower": 184, "train": 153, "trees": 196, "tv": 51, "unknown": 170, "walking": 21, "wall": 94, "watching": 20, "wedding": 15, "white": 93, "white and black": 127, "white and blue": 12, "window": 73, "windows": 152, "wine": 120, "wine tasting": 53, "woman": 144, "women": 92, "woods": 16, "yellow": 4, "yes": 46, "zoo": 151 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.45.0.dev0", "type_vocab_size": 2, "vocab_size": 30522 }