Update README.md
README.md (CHANGED)
@@ -6,17 +6,18 @@ tags:
 - base_model:adapter:HuggingFaceTB/SmolVLM2-500M-Video-Instruct
 - lora
 - transformers
+- finance
 model-index:
 - name: Susant-Achary/SmolVLM2-500M-Video-Instruct-VQA2
   results:
   - task:
       type: visual-question-answering
     dataset:
-
-
+      type: jinaai/table-vqa
+      name: jinaai/table-vqa
     metrics:
-
-
+    - type: training_loss
+      value: 0.7473664236068726
 datasets:
 - jinaai/table-vqa
 language:
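The metadata now points the model-index at the jinaai/table-vqa dataset and records the final training loss. As a minimal, non-authoritative sketch (not part of the README itself), this is how that dataset could be pulled for a quick look with the `datasets` library; split and field names depend on the dataset, not on this card:

```python
from datasets import load_dataset

# Load the dataset referenced in the updated card metadata.
ds = load_dataset("jinaai/table-vqa")

# Inspect the available splits and a single example;
# the exact column names are defined by the dataset, not by this card.
print(ds)
first_split = list(ds.keys())[0]
print(ds[first_split][0])
```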
@@ -79,12 +80,12 @@ import requests
 
 # Define the base model and the fine-tuned adapter repository
 base_model_id = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
-adapter_model_id = "Susant-Achary/SmolVLM2-500M-Video-Instruct-vqav2"
+adapter_model_id = "Susant-Achary/SmolVLM2-500M-Video-Instruct-vqav2"
 
 # Load the processor from the base model
 processor = AutoProcessor.from_pretrained(base_model_id)
 
-# Load the base model with quantization
+# Load the base model with quantization
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
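The hunk above is cut off inside the `BitsAndBytesConfig(...)` call, so the remaining arguments are not visible in this diff. Purely for orientation, a typical complete 4-bit setup is sketched below; the `nf4` quant type and bfloat16 compute dtype are assumptions, not necessarily what the README uses:

```python
import torch
from transformers import BitsAndBytesConfig

# Illustrative 4-bit quantization config; only the first two arguments appear in the diff.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # shown in the diff
    bnb_4bit_use_double_quant=True,         # shown in the diff
    bnb_4bit_quant_type="nf4",              # assumption: common QLoRA-style default
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumption: float16 is also common
)
```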
@@ -95,7 +96,6 @@ bnb_config = BitsAndBytesConfig(
 model = Idefics3ForConditionalGeneration.from_pretrained(
     base_model_id,
     quantization_config=bnb_config,
-    _attn_implementation="flash_attention_2",
     device_map="auto"
 )
 
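This hunk removes the `flash_attention_2` attention implementation from the loading call, likely so the snippet runs in environments without the flash-attn package. If that package is available, it could be re-enabled conditionally; the sketch below is an assumption (reusing `base_model_id` and `bnb_config` from the snippet above), not part of the README:

```python
from transformers import Idefics3ForConditionalGeneration

# Choose the attention implementation based on whether flash-attn is installed.
try:
    import flash_attn  # noqa: F401
    attn_impl = "flash_attention_2"
except ImportError:
    attn_impl = "eager"  # assumed fallback; "sdpa" is another common choice

model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    _attn_implementation=attn_impl,  # kwarg name mirrors the README's original line
    device_map="auto",
)
```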
@@ -104,8 +104,8 @@ model = PeftModel.from_pretrained(model, adapter_model_id)
 
 # Prepare an example image and question
 # You can replace this with your own image and question
-url = "
-image = Image.open(
+url = "/content/VQA-20-standard-test-set-results-comparison-of-state-of-the-art-methods.png"
+image = Image.open(url)
 question = "What is in the image?"
 
 # Prepare the input for the model
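The diff ends at "# Prepare the input for the model" without showing the rest of the README snippet. The sketch below is a non-authoritative guess at how the remaining steps typically look for SmolVLM2/Idefics3-style processors, reusing `processor`, `model`, `image`, and `question` from above; the chat-template message format and generation settings are assumptions, not the README's own code:

```python
# Build a chat-style prompt with one image placeholder plus the question text.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

# Preprocess text and image together and move the tensors to the model's device.
inputs = processor(text=prompt, images=[image], return_tensors="pt").to(model.device)

# Generate an answer and decode it back to text.
generated_ids = model.generate(**inputs, max_new_tokens=64)
answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(answer)
```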