import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
import gradio as gr
from PIL import Image

# Load Kosmos-2 model and processor
MODEL_NAME = "microsoft/kosmos-2-patch14-224"
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForVision2Seq.from_pretrained(MODEL_NAME)

# Ensure model is on GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


def analyze_image(image, prompt):
    """Process an image with a text prompt using Kosmos-2."""
    try:
        image = Image.fromarray(image)  # Convert the numpy array from Gradio to a PIL Image
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

        # Generate output, allowing up to 100 new tokens
        output = model.generate(**inputs, max_new_tokens=100)
        result_text = processor.batch_decode(output, skip_special_tokens=True)[0]
        return result_text
    except Exception as e:
        return f"Error: {str(e)}"


# Gradio Interface
iface = gr.Interface(
    fn=analyze_image,
    inputs=[gr.Image(type="numpy"), gr.Textbox(label="Prompt")],
    outputs=gr.Textbox(label="Generated Response"),
    title="Kosmos-2 Image Reasoning",
    description="Upload an image and provide a text prompt. Kosmos-2 will generate insights based on the image and text input.",
)

# Launch the Gradio app
iface.launch()
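

# Optional post-processing (a sketch, not part of the app above): Kosmos-2's decoded
# text keeps grounding markup such as <phrase> and location tokens. The Kosmos-2 model
# card shows `processor.post_process_generation`, which strips that markup and extracts
# the grounded entities with bounding boxes. The helper name `analyze_image_grounded`
# is illustrative; to use it, define it before the gr.Interface call and pass
# fn=analyze_image_grounded instead of fn=analyze_image.
def analyze_image_grounded(image, prompt):
    """Like analyze_image, but returns cleaned text plus grounded entities."""
    image = Image.fromarray(image)
    # Prompts on the model card typically start with "<grounding>", e.g. "<grounding> An image of"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=100)
    generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
    # post_process_generation returns (cleaned_caption, entities); each entity is a
    # (phrase, character_span, normalized_bounding_boxes) tuple
    processed_text, entities = processor.post_process_generation(generated_text)
    return f"{processed_text}\n\nEntities: {entities}"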