import gradio as gr
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import torch

# Load the BLIP VQA processor and model once at startup and switch to inference mode.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
model.eval()


def resize_image(image):
    # Downscale large uploads in place (thumbnail preserves aspect ratio) so inference stays fast.
    if image is not None:
        max_size = 512
        image.thumbnail((max_size, max_size))
    return image


def answer_question(resized_image, question):
    if resized_image is None or question.strip() == "":
        return "Please upload an image and ask a question."

    # Encode the image/question pair and generate a short free-form answer.
    inputs = processor(resized_image, question, return_tensors="pt")
    with torch.no_grad():
        output = model.generate(**inputs)
    return processor.decode(output[0], skip_special_tokens=True)


with gr.Blocks(title="BLIP VQA App (Salesforce/blip-vqa-base)") as demo:
    gr.Markdown("## 📷 Visual Question Answering with BLIP VQA\nUpload an image and ask a question about it.")

    image_input = gr.Image(type="pil", label="Upload Image")
    resized_image = gr.State()

    question_input = gr.Textbox(label="Question", placeholder="What is in the image?")
    ask_button = gr.Button("Ask")
    answer_output = gr.Textbox(label="Answer")

    # Resize each upload as soon as it arrives and cache the result in per-session state.
    image_input.change(fn=resize_image, inputs=image_input, outputs=resized_image)

    # Run VQA on the cached resized image when the user clicks "Ask".
    ask_button.click(fn=answer_question, inputs=[resized_image, question_input], outputs=answer_output)

demo.launch()