Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

README.md +35 -6
added_tokens.json +7 -0
config.json +10 -0
merges.txt +0 -0
model.safetensors +3 -0
special_tokens_map.json +36 -0
tokenizer.json +0 -0
tokenizer_config.json +203 -0
vocab.json +0 -0

README.md CHANGED Viewed

@@ -1,8 +1,37 @@
 ---
-license: cc-by-4.0
-language:
-- en
-base_model:
-- HuggingFaceTB/SmolLM2-135M
 pipeline_tag: visual-question-answering
----

 ---
+license: cc-by-nc-4.0
+tags:
+  - visual-question-answering
+  - multimodal
+  - pytorch
+  - cross-attention
+  - vision-transformer
 pipeline_tag: visual-question-answering
+---
+# Visual Question Answering (VQA) Model
+This is a multimodal Visual Question Answering system built for my Bachelor's final project. It combines a Vision Transformer (ViT) image encoder and a SmolLM2 language model using a cross-attention mechanism.
+## Model Architecture
+- **Vision Encoder:** Pretrained ViT
+- **Language Model:** SmolLM2-135M
+- **Fusion:** Cross-attention layer aligning vision and language
+- **Dataset:** Trained on the VQA v2 and LLaVA datasets
+## How to Use
+```python
+from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
+from PIL import Image
+processor = AutoProcessor.from_pretrained("yourusername/vqa-multimodal")
+model = AutoModelForVisualQuestionAnswering.from_pretrained("yourusername/vqa-multimodal")
+image = Image.open("example.jpg")
+question = "What is the person doing?"
+inputs = processor(images=image, text=question, return_tensors="pt")
+outputs = model(**inputs)
+answer = processor.tokenizer.decode(outputs.logits.argmax(-1))
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "<answer>": 49155,
+  "<end_sequence>": 49153,
+  "<pad>": 49156,
+  "<question>": 49154,
+  "<start_sequence>": 49152
+}

config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "hyperparams": {
+    "max_length": 512,
+    "num_heads": 8,
+    "text_dim": 576,
+    "train_image_encoder": false,
+    "use_preprocessed": true,
+    "vision_dim": 768
+  }
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:93daaa9b48347cbff272e7331ccc2297a3e561e1c4ece6194a1d5157f4340c05
+size 890778224

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "additional_special_tokens": [
+    "<start_sequence>",
+    "<end_sequence>",
+    "<question>",
+    "<answer>"
+  ],
+  "bos_token": {
+    "content": "<start_sequence>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<end_sequence>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,203 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<start_sequence>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<end_sequence>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<question>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<answer>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49156": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<start_sequence>",
+    "<end_sequence>",
+    "<question>",
+    "<answer>"
+  ],
+  "bos_token": "<start_sequence>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<end_sequence>",
+  "extra_special_tokens": {},
+  "max_length": 128,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_to_multiple_of": null,
+  "pad_token": "<pad>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "stride": 0,
+  "tokenizer_class": "GPT2Tokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff