import gradio as gr
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import torch

# Load processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
model.eval()

# Resize function
def resize_image(image):
    if image is not None:
        max_size = 512
        image.thumbnail((max_size, max_size))
    return image

# Answer question function
def answer_question(resized_image, question):
    if resized_image is None or question.strip() == "":
        return "Please upload an image and ask a question."
    inputs = processor(resized_image, question, return_tensors="pt")
    with torch.no_grad():
        output = model.generate(**inputs)
    return processor.decode(output[0], skip_special_tokens=True)

# Gradio UI
with gr.Blocks(title="BLIP VQA App (Salesforce/blip-vqa-base)") as demo:
    gr.Markdown("## 📷 Visual Question Answering with BLIP VQA\nUpload an image and ask a question about it.")

    image_input = gr.Image(type="pil", label="Upload Image")
    resized_image = gr.State()
    question_input = gr.Textbox(label="Question", placeholder="What is in the image?")
    ask_button = gr.Button("Ask")
    answer_output = gr.Textbox(label="Answer")

    # Resize image on upload
    image_input.change(fn=resize_image, inputs=image_input, outputs=resized_image)

    # Ask button triggers VQA
    ask_button.click(fn=answer_question, inputs=[resized_image, question_input], outputs=answer_output)

demo.launch()
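
# Optional sanity check (a minimal sketch, not part of the app itself):
# running answer_question directly confirms the model loads and answers
# correctly before you exercise the UI. "cat.jpg" is a placeholder path,
# not a file shipped with this example; run these lines in a Python shell
# or before demo.launch(), since launch() blocks the script.
#
#   img = resize_image(Image.open("cat.jpg").convert("RGB"))
#   print(answer_question(img, "What animal is in the picture?"))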