import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Load the BLIP large image-captioning model and its processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# Move the model to GPU if available for faster inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Generate a detailed caption for a single PIL image
def generate_detailed_caption(image):
    inputs = processor(image, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only; no gradients needed
        out = model.generate(
            **inputs,
            max_length=75,           # slightly shorter captions for speed
            num_beams=5,             # fewer beams for faster inference
            repetition_penalty=1.8,  # discourage repeated phrases
            length_penalty=1.0,
            no_repeat_ngram_size=2,
        )
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

# Gradio interface with webcam capture
iface = gr.Interface(
    fn=generate_detailed_caption,
    # sources=["webcam"] restricts input to webcam capture (Gradio 4+)
    inputs=[gr.Image(type="pil", sources=["webcam"], label="Capture an image from webcam")],
    outputs=gr.Textbox(label="Detailed Image Description"),
    title="📷 Fast Image Capture & Description App",
    description="Capture an image using your webcam and let AI quickly generate a detailed description!",
    live=True,
)

if __name__ == "__main__":
    iface.launch()
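
# --- Optional offline check (a minimal sketch, not part of the original app) ---
# Bypasses the Gradio UI and calls the caption function directly. The file name
# "sample.jpg" is an illustrative assumption; adjust the path before uncommenting.
#
#   test_image = Image.open("sample.jpg").convert("RGB")
#   print(generate_detailed_caption(test_image))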