{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "photographer", "1": "low", "2": "exit", "3": "curtain", "4": "walking", "5": "sky", "6": "jeep", "7": "wall", "8": "snowboarding", "9": "bicycles", "10": "big ben", "11": "small", "12": "2", "13": "lg", "14": "dog", "15": "7:35", "16": "birthday", "17": "picnic table", "18": "windows", "19": "resting", "20": "king", "21": "snowboarder", "22": "full", "23": "platform", "24": "shelter", "25": "person", "26": "purple", "27": "10", "28": "white and blue", "29": "on road", "30": "large", "31": "lying down", "32": "clock", "33": "blue and white", "34": "plate", "35": "don't know", "36": "skateboarding", "37": "yes", "38": "blue", "39": "human", "40": "girl", "41": "black and white", "42": "zoo", "43": "smiling", "44": "tower", "45": "many", "46": "arrow", "47": "4", "48": "bicycle", "49": "out", "50": "door", "51": "2000", "52": "clock tower", "53": "car", "54": "monitor", "55": "cat", "56": "boy", "57": "tired", "58": "forest", "59": "tent", "60": "leather", "61": "backpack", "62": "red", "63": "white", "64": "green", "65": "cloudy", "66": "plastic", "67": "blonde", "68": "brick", "69": "french", "70": "ground", "71": "wine tasting", "72": "talking on phone", "73": "happy", "74": "lady", "75": "ice cream", "76": "station", "77": "soccer ball", "78": "shadow", "79": "camera", "80": "neon", "81": "watching", "82": "necklace", "83": "street", "84": "bike rack", "85": "stripes", "86": "5", "87": "style", "88": "shade", "89": "desert", "90": "red and yellow", "91": "wedding", "92": "brown", "93": "bedroom", "94": "2013", "95": "7:45", "96": "giraffes", "97": "double", "98": "outside", "99": "unknown", "100": "suv", "101": "calico", "102": "ball", "103": "laying down", "104": "train", "105": "table", "106": "tabby", "107": "net", "108": "gray", "109": "park", "110": "beige", "111": "stand", "112": "at table", "113": "man", "114": "canopy", "115": "plain", "116": "black", "117": "orange", "118": "women", "119": "beagle", "120": "yellow", "121": "tan", "122": "in car", "123": "6", "124": "skateboard", "125": "hat", "126": "8:35", "127": "roof", "128": "sidewalk", "129": "bikes", "130": "cup", "131": "chair", "132": "donut", "133": "right", "134": "air", "135": "nothing", "136": "giraffe", "137": "natural", "138": "fashion", "139": "little girl", "140": "3", "141": "cross", "142": "wine", "143": "gray and black", "144": "on street", "145": "africa", "146": "can't tell", "147": "no", "148": "bricks", "149": "name tag", "150": "hawaii", "151": "smile", "152": "skier", "153": "trees", "154": "woman", "155": "snowboard", "156": "church", "157": "crossing", "158": "cage", "159": "tv", "160": "1", "161": "pink", "162": "doughnut", "163": "woods", "164": "9:35", "165": "solid", "166": "curtains", "167": "lanyard", "168": "red and blue", "169": "snow", "170": "dirt", "171": "protection", "172": "fence", "173": "2010", "174": "skiing", "175": "security", "176": "talking", "177": "white and black", "178": "they aren't", "179": "down", "180": "screen", "181": "shrimp", "182": "window", "183": "rack", "184": "soccer", "185": "bus", "186": "clear", "187": "sun", "188": "shadows", "189": "hair", "190": "chopsticks", "191": "8", "192": "not there", "193": "queen", "194": "sleeping", "195": "7", "196": "not sure", "197": "crown", "198": "0" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 198, "1": 160, "10": 27, "2": 12, "2000": 51, "2010": 173, "2013": 94, "3": 140, "4": 47, "5": 86, "6": 123, "7": 195, "7:35": 15, "7:45": 95, "8": 191, "8:35": 126, "9:35": 164, "africa": 145, "air": 134, "arrow": 46, "at table": 112, "backpack": 61, "ball": 102, "beagle": 119, "bedroom": 93, "beige": 110, "bicycle": 48, "bicycles": 9, "big ben": 10, "bike rack": 84, "bikes": 129, "birthday": 16, "black": 116, "black and white": 41, "blonde": 67, "blue": 38, "blue and white": 33, "boy": 56, "brick": 68, "bricks": 148, "brown": 92, "bus": 185, "cage": 158, "calico": 101, "camera": 79, "can't tell": 146, "canopy": 114, "car": 53, "cat": 55, "chair": 131, "chopsticks": 190, "church": 156, "clear": 186, "clock": 32, "clock tower": 52, "cloudy": 65, "cross": 141, "crossing": 157, "crown": 197, "cup": 130, "curtain": 3, "curtains": 166, "desert": 89, "dirt": 170, "dog": 14, "don't know": 35, "donut": 132, "door": 50, "double": 97, "doughnut": 162, "down": 179, "exit": 2, "fashion": 138, "fence": 172, "forest": 58, "french": 69, "full": 22, "giraffe": 136, "giraffes": 96, "girl": 40, "gray": 108, "gray and black": 143, "green": 64, "ground": 70, "hair": 189, "happy": 73, "hat": 125, "hawaii": 150, "human": 39, "ice cream": 75, "in car": 122, "jeep": 6, "king": 20, "lady": 74, "lanyard": 167, "large": 30, "laying down": 103, "leather": 60, "lg": 13, "little girl": 139, "low": 1, "lying down": 31, "man": 113, "many": 45, "monitor": 54, "name tag": 149, "natural": 137, "necklace": 82, "neon": 80, "net": 107, "no": 147, "not sure": 196, "not there": 192, "nothing": 135, "on road": 29, "on street": 144, "orange": 117, "out": 49, "outside": 98, "park": 109, "person": 25, "photographer": 0, "picnic table": 17, "pink": 161, "plain": 115, "plastic": 66, "plate": 34, "platform": 23, "protection": 171, "purple": 26, "queen": 193, "rack": 183, "red": 62, "red and blue": 168, "red and yellow": 90, "resting": 19, "right": 133, "roof": 127, "screen": 180, "security": 175, "shade": 88, "shadow": 78, "shadows": 188, "shelter": 24, "shrimp": 181, "sidewalk": 128, "skateboard": 124, "skateboarding": 36, "skier": 152, "skiing": 174, "sky": 5, "sleeping": 194, "small": 11, "smile": 151, "smiling": 43, "snow": 169, "snowboard": 155, "snowboarder": 21, "snowboarding": 8, "soccer": 184, "soccer ball": 77, "solid": 165, "stand": 111, "station": 76, "street": 83, "stripes": 85, "style": 87, "sun": 187, "suv": 100, "tabby": 106, "table": 105, "talking": 176, "talking on phone": 72, "tan": 121, "tent": 59, "they aren't": 178, "tired": 57, "tower": 44, "train": 104, "trees": 153, "tv": 159, "unknown": 99, "walking": 4, "wall": 7, "watching": 81, "wedding": 91, "white": 63, "white and black": 177, "white and blue": 28, "window": 182, "windows": 18, "wine": 142, "wine tasting": 71, "woman": 154, "women": 118, "woods": 163, "yellow": 120, "yes": 37, "zoo": 42 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.34.1", "type_vocab_size": 2, "vocab_size": 30522 }