{ "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "walking", "1": "king", "2": "blonde", "3": "air", "4": "8", "5": "bedroom", "6": "ice cream", "7": "car", "8": "picnic table", "9": "white and blue", "10": "cup", "11": "crossing", "12": "human", "13": "0", "14": "resting", "15": "no", "16": "little girl", "17": "they aren't", "18": "woman", "19": "fashion", "20": "double", "21": "outside", "22": "3", "23": "train", "24": "sun", "25": "at table", "26": "hat", "27": "full", "28": "1", "29": "crown", "30": "woods", "31": "wine tasting", "32": "boy", "33": "dog", "34": "chopsticks", "35": "right", "36": "park", "37": "blue and white", "38": "forest", "39": "beige", "40": "cat", "41": "pink", "42": "skateboard", "43": "wall", "44": "plastic", "45": "not sure", "46": "2", "47": "red and blue", "48": "7", "49": "red and yellow", "50": "suv", "51": "yellow", "52": "gray", "53": "shadows", "54": "down", "55": "hawaii", "56": "girl", "57": "red", "58": "backpack", "59": "yes", "60": "shrimp", "61": "chair", "62": "watching", "63": "tired", "64": "jeep", "65": "10", "66": "wine", "67": "sky", "68": "5", "69": "leather", "70": "4", "71": "queen", "72": "bus", "73": "green", "74": "white and black", "75": "white", "76": "wedding", "77": "style", "78": "table", "79": "out", "80": "black", "81": "shadow", "82": "doughnut", "83": "donut", "84": "many", "85": "birthday", "86": "6", "87": "brown" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 13, "1": 28, "10": 65, "2": 46, "3": 22, "4": 70, "5": 68, "6": 86, "7": 48, "8": 4, "air": 3, "at table": 25, "backpack": 58, "bedroom": 5, "beige": 39, "birthday": 85, "black": 80, "blonde": 2, "blue and white": 37, "boy": 32, "brown": 87, "bus": 72, "car": 7, "cat": 40, "chair": 61, "chopsticks": 34, "crossing": 11, "crown": 29, "cup": 10, "dog": 33, "donut": 83, "double": 20, "doughnut": 82, "down": 54, "fashion": 19, "forest": 38, "full": 27, "girl": 56, "gray": 52, "green": 73, "hat": 26, "hawaii": 55, "human": 12, "ice cream": 6, "jeep": 64, "king": 1, "leather": 69, "little girl": 16, "many": 84, "no": 15, "not sure": 45, "out": 79, "outside": 21, "park": 36, "picnic table": 8, "pink": 41, "plastic": 44, "queen": 71, "red": 57, "red and blue": 47, "red and yellow": 49, "resting": 14, "right": 35, "shadow": 81, "shadows": 53, "shrimp": 60, "skateboard": 42, "sky": 67, "style": 77, "sun": 24, "suv": 50, "table": 78, "they aren't": 17, "tired": 63, "train": 23, "walking": 0, "wall": 43, "watching": 62, "wedding": 76, "white": 75, "white and black": 74, "white and blue": 9, "wine": 66, "wine tasting": 31, "woman": 18, "woods": 30, "yellow": 51, "yes": 59 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.52.4", "type_vocab_size": 2, "vocab_size": 30522 }