from transformers import BlipProcessor, BlipForQuestionAnswering, pipeline
import torch
from PIL import Image
import gradio as gr
import re

# Set model ID
model_id = "mshsahmed/blip-vqa-finetuned-kvasir-v58849"

# Load BLIP model and processor
processor = BlipProcessor.from_pretrained(model_id)
model = BlipForQuestionAnswering.from_pretrained(model_id)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load translation pipelines
en_to_ar = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ar")
ar_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-ar-en")

# Arabic detection function
def is_arabic(text):
    # Detect Arabic by checking for characters in the Arabic Unicode block
    return bool(re.search(r'[\u0600-\u06FF]', text))

# Prediction function with translation support
def predict(image, question):
    try:
        original_question = question

        # Translate Arabic questions to English before passing them to BLIP
        if is_arabic(question):
            question = ar_to_en(question)[0]['translation_text']

        # Run visual question answering on the image/question pair
        inputs = processor(image, question, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(**inputs)
        answer_en = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Translate the answer back to Arabic if the question was Arabic
        if is_arabic(original_question):
            answer = en_to_ar(answer_en)[0]['translation_text']
        else:
            answer = answer_en

        return answer
    except Exception as e:
        return f"❌ Error: {str(e)}"

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 BLIP VQA Model - Supports English & Arabic Questions")
    image_input = gr.Image(type="pil", label="Upload Image")
    question_input = gr.Textbox(label="Ask a Question (Arabic or English)")
    answer_output = gr.Textbox(label="Answer")
    submit_btn = gr.Button("Get Answer")
    submit_btn.click(fn=predict, inputs=[image_input, question_input], outputs=answer_output)

demo.launch()