{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "king", "1": "4", "2": "canopy", "3": "sun", "4": "calico", "5": "brown", "6": "style", "7": "cloudy", "8": "blue", "9": "birthday", "10": "bicycles", "11": "can't tell", "12": "forest", "13": "doughnut", "14": "neon", "15": "picnic table", "16": "watching", "17": "wall", "18": "fence", "19": "chair", "20": "down", "21": "stripes", "22": "orange", "23": "hat", "24": "cat", "25": "skateboarding", "26": "exit", "27": "shade", "28": "sleeping", "29": "backpack", "30": "brick", "31": "net", "32": "platform", "33": "laying down", "34": "boy", "35": "hair", "36": "camera", "37": "smile", "38": "name tag", "39": "in car", "40": "bedroom", "41": "snowboarder", "42": "not there", "43": "sky", "44": "5", "45": "chopsticks", "46": "trees", "47": "table", "48": "6", "49": "natural", "50": "door", "51": "africa", "52": "smiling", "53": "10", "54": "rack", "55": "7", "56": "fashion", "57": "snowboarding", "58": "skier", "59": "shelter", "60": "photographer", "61": "2", "62": "happy", "63": "resting", "64": "on street", "65": "7:35", "66": "yellow", "67": "air", "68": "beagle", "69": "plate", "70": "protection", "71": "not sure", "72": "human", "73": "9:35", "74": "bikes", "75": "cage", "76": "hawaii", "77": "out", "78": "on road", "79": "2013", "80": "2010", "81": "skiing", "82": "station", "83": "ice cream", "84": "window", "85": "tent", "86": "clear", "87": "red", "88": "stand", "89": "wine tasting", "90": "soccer", "91": "lying down", "92": "queen", "93": "crown", "94": "shrimp", "95": "woods", "96": "black and white", "97": "tired", "98": "ground", "99": "3", "100": "suv", "101": "clock tower", "102": "clock", "103": "gray and black", "104": "bike rack", "105": "green", "106": "necklace", "107": "plastic", "108": "girl", "109": "street", "110": "full", "111": "desert", "112": "security", "113": "no", "114": "yes", "115": "lg", "116": "curtains", "117": "person", "118": "snowboard", "119": "bricks", "120": "tabby", "121": "car", "122": "ball", "123": "outside", "124": "pink", "125": "donut", "126": "8:35", "127": "tower", "128": "screen", "129": "gray", "130": "lanyard", "131": "solid", "132": "blue and white", "133": "women", "134": "low", "135": "cross", "136": "talking on phone", "137": "bus", "138": "red and blue", "139": "wedding", "140": "red and yellow", "141": "purple", "142": "leather", "143": "white and black", "144": "right", "145": "beige", "146": "unknown", "147": "wine", "148": "black", "149": "little girl", "150": "shadow", "151": "train", "152": "0", "153": "roof", "154": "small", "155": "large", "156": "snow", "157": "bicycle", "158": "park", "159": "1", "160": "man", "161": "cup", "162": "dirt", "163": "arrow", "164": "big ben", "165": "monitor", "166": "sidewalk", "167": "talking", "168": "at table", "169": "skateboard", "170": "crossing", "171": "dog", "172": "tan", "173": "tv", "174": "giraffe", "175": "they aren't", "176": "woman", "177": "church", "178": "7:45", "179": "8", "180": "french", "181": "2000", "182": "soccer ball", "183": "giraffes", "184": "double", "185": "jeep", "186": "white", "187": "shadows", "188": "zoo", "189": "nothing", "190": "don't know", "191": "walking", "192": "blonde", "193": "plain", "194": "white and blue", "195": "curtain", "196": "lady", "197": "many", "198": "windows" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 152, "1": 159, "10": 53, "2": 61, "2000": 181, "2010": 80, "2013": 79, "3": 99, "4": 1, "5": 44, "6": 48, "7": 55, "7:35": 65, "7:45": 178, "8": 179, "8:35": 126, "9:35": 73, "africa": 51, "air": 67, "arrow": 163, "at table": 168, "backpack": 29, "ball": 122, "beagle": 68, "bedroom": 40, "beige": 145, "bicycle": 157, "bicycles": 10, "big ben": 164, "bike rack": 104, "bikes": 74, "birthday": 9, "black": 148, "black and white": 96, "blonde": 192, "blue": 8, "blue and white": 132, "boy": 34, "brick": 30, "bricks": 119, "brown": 5, "bus": 137, "cage": 75, "calico": 4, "camera": 36, "can't tell": 11, "canopy": 2, "car": 121, "cat": 24, "chair": 19, "chopsticks": 45, "church": 177, "clear": 86, "clock": 102, "clock tower": 101, "cloudy": 7, "cross": 135, "crossing": 170, "crown": 93, "cup": 161, "curtain": 195, "curtains": 116, "desert": 111, "dirt": 162, "dog": 171, "don't know": 190, "donut": 125, "door": 50, "double": 184, "doughnut": 13, "down": 20, "exit": 26, "fashion": 56, "fence": 18, "forest": 12, "french": 180, "full": 110, "giraffe": 174, "giraffes": 183, "girl": 108, "gray": 129, "gray and black": 103, "green": 105, "ground": 98, "hair": 35, "happy": 62, "hat": 23, "hawaii": 76, "human": 72, "ice cream": 83, "in car": 39, "jeep": 185, "king": 0, "lady": 196, "lanyard": 130, "large": 155, "laying down": 33, "leather": 142, "lg": 115, "little girl": 149, "low": 134, "lying down": 91, "man": 160, "many": 197, "monitor": 165, "name tag": 38, "natural": 49, "necklace": 106, "neon": 14, "net": 31, "no": 113, "not sure": 71, "not there": 42, "nothing": 189, "on road": 78, "on street": 64, "orange": 22, "out": 77, "outside": 123, "park": 158, "person": 117, "photographer": 60, "picnic table": 15, "pink": 124, "plain": 193, "plastic": 107, "plate": 69, "platform": 32, "protection": 70, "purple": 141, "queen": 92, "rack": 54, "red": 87, "red and blue": 138, "red and yellow": 140, "resting": 63, "right": 144, "roof": 153, "screen": 128, "security": 112, "shade": 27, "shadow": 150, "shadows": 187, "shelter": 59, "shrimp": 94, "sidewalk": 166, "skateboard": 169, "skateboarding": 25, "skier": 58, "skiing": 81, "sky": 43, "sleeping": 28, "small": 154, "smile": 37, "smiling": 52, "snow": 156, "snowboard": 118, "snowboarder": 41, "snowboarding": 57, "soccer": 90, "soccer ball": 182, "solid": 131, "stand": 88, "station": 82, "street": 109, "stripes": 21, "style": 6, "sun": 3, "suv": 100, "tabby": 120, "table": 47, "talking": 167, "talking on phone": 136, "tan": 172, "tent": 85, "they aren't": 175, "tired": 97, "tower": 127, "train": 151, "trees": 46, "tv": 173, "unknown": 146, "walking": 191, "wall": 17, "watching": 16, "wedding": 139, "white": 186, "white and black": 143, "white and blue": 194, "window": 84, "windows": 198, "wine": 147, "wine tasting": 89, "woman": 176, "women": 133, "woods": 95, "yellow": 66, "yes": 114, "zoo": 188 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.37.0", "type_vocab_size": 2, "vocab_size": 30522 }