{ "architectures": [ "SiglipForImageClassification" ], "id2label": { "0": "\u0905", "1": "\u0906", "2": "\u0907", "3": "\u0908", "4": "\u0909", "5": "\u090a", "6": "\u090f", "7": "\u0910", "8": "\u0913", "9": "\u0914", "10": "\u0915", "11": "\u0915\u094d\u0937", "12": "\u0916", "13": "\u0917", "14": "\u0918", "15": "\u091a", "16": "\u091b", "17": "\u091c", "18": "\u091c\u094d\u091e", "19": "\u091d", "20": "\u091f", "21": "\u0920", "22": "\u0921", "23": "\u0922", "24": "\u0923", "25": "\u0924", "26": "\u0925", "27": "\u0926", "28": "\u0927", "29": "\u0928", "30": "\u092a", "31": "\u092b", "32": "\u092c", "33": "\u092d", "34": "\u092e", "35": "\u092f", "36": "\u0930", "37": "\u0932", "38": "\u0933", "39": "\u0935", "40": "\u0936", "41": "\u0938", "42": "\u0939" }, "initializer_factor": 1.0, "label2id": { "\u0905": 0, "\u0906": 1, "\u0907": 2, "\u0908": 3, "\u0909": 4, "\u090a": 5, "\u090f": 6, "\u0910": 7, "\u0913": 8, "\u0914": 9, "\u0915": 10, "\u0915\u094d\u0937": 11, "\u0916": 12, "\u0917": 13, "\u0918": 14, "\u091a": 15, "\u091b": 16, "\u091c": 17, "\u091c\u094d\u091e": 18, "\u091d": 19, "\u091f": 20, "\u0920": 21, "\u0921": 22, "\u0922": 23, "\u0923": 24, "\u0924": 25, "\u0925": 26, "\u0926": 27, "\u0927": 28, "\u0928": 29, "\u092a": 30, "\u092b": 31, "\u092c": 32, "\u092d": 33, "\u092e": 34, "\u092f": 35, "\u0930": 36, "\u0932": 37, "\u0933": 38, "\u0935": 39, "\u0936": 40, "\u0938": 41, "\u0939": 42 }, "model_type": "siglip", "problem_type": "single_label_classification", "text_config": { "attention_dropout": 0.0, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 768, "intermediate_size": 3072, "layer_norm_eps": 1e-06, "max_position_embeddings": 64, "model_type": "siglip_text_model", "num_attention_heads": 12, "num_hidden_layers": 12, "projection_size": 768, "torch_dtype": "float32", "vocab_size": 256000 }, "torch_dtype": "float32", "transformers_version": "4.50.0", "vision_config": { "attention_dropout": 0.0, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 768, "image_size": 224, "intermediate_size": 3072, "layer_norm_eps": 1e-06, "model_type": "siglip_vision_model", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "patch_size": 16, "torch_dtype": "float32" } }