Spaces:
Sleeping
Sleeping
import torch | |
from transformers import AutoProcessor, AutoModelForVision2Seq | |
import gradio as gr | |
from PIL import Image | |
# Kosmos-2 checkpoint shared by the processor and the model
MODEL_NAME = "microsoft/kosmos-2-patch14-224"

# Select the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the processor and the vision-to-sequence model from the checkpoint
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForVision2Seq.from_pretrained(MODEL_NAME)

# Move the model onto the selected device
model.to(device)
def analyze_image(image, prompt):
    """Generate a Kosmos-2 text response for an image and a text prompt.

    Args:
        image: The uploaded image as a NumPy array (the interface declares
            its image input with ``type="numpy"``), or ``None`` when no
            image was supplied.
        prompt: Text prompt handed to the processor together with the image.

    Returns:
        The first decoded sequence produced by the model, or a string
        starting with ``"Error: "`` when no image was supplied or when
        processing/generation raised an exception.
    """
    # Without an uploaded image there is nothing to convert; report that
    # directly instead of surfacing an opaque Image.fromarray failure.
    if image is None:
        return "Error: no image provided."

    try:
        pil_image = Image.fromarray(image)  # Convert the array to a PIL image
        inputs = processor(
            images=pil_image, text=prompt, return_tensors="pt"
        ).to(device)
        # Generate output, allowing up to 100 new tokens
        output = model.generate(**inputs, max_new_tokens=100)
        return processor.batch_decode(output, skip_special_tokens=True)[0]
    except Exception as e:
        # UI boundary: show the failure in the output textbox rather than
        # letting the request crash.
        return f"Error: {e}"
# Gradio components: an image upload delivered as a NumPy array, a prompt
# textbox, and a textbox for the generated response
image_input = gr.Image(type="numpy")
prompt_input = gr.Textbox(label="Prompt")
response_output = gr.Textbox(label="Generated Response")

# Wire the components to analyze_image
iface = gr.Interface(
    fn=analyze_image,
    inputs=[image_input, prompt_input],
    outputs=response_output,
    title="Kosmos-2 Image Reasoning",
    description="Upload an image and provide a text prompt. Kosmos-2 will generate insights based on the image and text input.",
)

# Start the Gradio app
iface.launch()