Update README.md
README.md (CHANGED)
@@ -6,17 +6,18 @@ tags:
 - base_model:adapter:HuggingFaceTB/SmolVLM2-500M-Video-Instruct
 - lora
 - transformers
+- finance
 model-index:
 - name: Susant-Achary/SmolVLM2-500M-Video-Instruct-VQA2
   results:
   - task:
       type: visual-question-answering
     dataset:
-
-
+      type: jinaai/table-vqa
+      name: jinaai/table-vqa
     metrics:
-
-
+    - type: training_loss
+      value: 0.7473664236068726
 datasets:
 - jinaai/table-vqa
 language:
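The metadata now points the model-index at the jinaai/table-vqa dataset and records the final training loss. As a minimal, non-authoritative sketch (not part of the README itself), this is how that dataset could be pulled for a quick look with the `datasets` library; split and field names depend on the dataset, not on this card:

```python
from datasets import load_dataset

# Load the dataset referenced in the updated card metadata.
ds = load_dataset("jinaai/table-vqa")

# Inspect the available splits and a single example;
# the exact column names are defined by the dataset, not by this card.
print(ds)
first_split = list(ds.keys())[0]
print(ds[first_split][0])
```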
@@ -79,12 +80,12 @@ import requests
 
 # Define the base model and the fine-tuned adapter repository
 base_model_id = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
-adapter_model_id = "Susant-Achary/SmolVLM2-500M-Video-Instruct-vqav2"
+adapter_model_id = "Susant-Achary/SmolVLM2-500M-Video-Instruct-vqav2"
 
 # Load the processor from the base model
 processor = AutoProcessor.from_pretrained(base_model_id)
 
-# Load the base model with quantization
+# Load the base model with quantization
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
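The hunk above is cut off inside the `BitsAndBytesConfig(...)` call, so the remaining arguments are not visible in this diff. Purely for orientation, a typical complete 4-bit setup is sketched below; the `nf4` quant type and bfloat16 compute dtype are assumptions, not necessarily what the README uses:

```python
import torch
from transformers import BitsAndBytesConfig

# Illustrative 4-bit quantization config; only the first two arguments appear in the diff.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # shown in the diff
    bnb_4bit_use_double_quant=True,         # shown in the diff
    bnb_4bit_quant_type="nf4",              # assumption: common QLoRA-style default
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumption: float16 is also common
)
```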
@@ -95,7 +96,6 @@ bnb_config = BitsAndBytesConfig(
 model = Idefics3ForConditionalGeneration.from_pretrained(
     base_model_id,
     quantization_config=bnb_config,
-    _attn_implementation="flash_attention_2",
     device_map="auto"
 )
 
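This hunk removes the `flash_attention_2` attention implementation from the loading call, likely so the snippet runs in environments without the flash-attn package. If that package is available, it could be re-enabled conditionally; the sketch below is an assumption (reusing `base_model_id` and `bnb_config` from the snippet above), not part of the README:

```python
from transformers import Idefics3ForConditionalGeneration

# Choose the attention implementation based on whether flash-attn is installed.
try:
    import flash_attn  # noqa: F401
    attn_impl = "flash_attention_2"
except ImportError:
    attn_impl = "eager"  # assumed fallback; "sdpa" is another common choice

model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    _attn_implementation=attn_impl,  # kwarg name mirrors the README's original line
    device_map="auto",
)
```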
@@ -104,8 +104,8 @@ model = PeftModel.from_pretrained(model, adapter_model_id)
 
 # Prepare an example image and question
 # You can replace this with your own image and question
-url = "
-image = Image.open(
+url = "/content/VQA-20-standard-test-set-results-comparison-of-state-of-the-art-methods.png"
+image = Image.open(url)
 question = "What is in the image?"
 
 # Prepare the input for the model
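The diff ends at "# Prepare the input for the model" without showing the rest of the README snippet. The sketch below is a non-authoritative guess at how the remaining steps typically look for SmolVLM2/Idefics3-style processors, reusing `processor`, `model`, `image`, and `question` from above; the chat-template message format and generation settings are assumptions, not the README's own code:

```python
# Build a chat-style prompt with one image placeholder plus the question text.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

# Preprocess text and image together and move the tensors to the model's device.
inputs = processor(text=prompt, images=[image], return_tensors="pt").to(model.device)

# Generate an answer and decode it back to text.
generated_ids = model.generate(**inputs, max_new_tokens=64)
answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(answer)
```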