Spaces:
Running
Running
import streamlit as st | |
from PIL import Image | |
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor | |
# Page heading rendered at the top of the app.
# NOTE(review): "Coffe" looks like a typo for "Coffee"; the string is kept
# as-is here because it is user-facing runtime text -- confirm before changing.
st.title(body="Coffe machine captioning app")
@st.cache_resource(show_spinner='Loading model and tokenizer...')
def load_model():
    """Load the fine-tuned PaliGemma captioning model and its processor.

    Streamlit re-executes the whole script on every user interaction, so the
    loader is wrapped in ``st.cache_resource``: the weights are loaded on the
    first run only and the same objects are handed back on later reruns
    instead of being re-created for each uploaded image.

    The spinner text is supplied through ``show_spinner`` so it is displayed
    only while the resources are actually being created (cache miss).

    Returns:
        tuple: ``(model, processor)`` as returned by the respective
        ``from_pretrained`` calls for the same model id.
    """
    model_id = "Fer14/paligemma_coffe_machine_caption"
    model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
    processor = PaliGemmaProcessor.from_pretrained(model_id)
    # Static elements emitted inside a cached function are replayed by
    # Streamlit on cache hits, so this notice still appears on every run.
    st.success('Model loaded!')
    return model, processor
# Obtain the model/processor pair consumed by the captioning step below.
model, processor = load_model()

# Usage instructions are shown in the sidebar.
SIDEBAR_INSTRUCTIONS = (
    "\n"
    "1. Upload an image using the file uploader.\n"
    "2. Wait for the app to process and generate the caption.\n"
    "3. The caption will be displayed in the text area.\n"
    "4. Enjoy your caption!\n"
)
sidebar = st.sidebar
sidebar.title("Instructions")
sidebar.write(SIDEBAR_INSTRUCTIONS)

# File extensions offered by the upload widget.
ACCEPTED_IMAGE_TYPES = ["jpg", "jpeg", "png"]
uploaded_image = st.file_uploader(
    "Choose an image...",
    type=ACCEPTED_IMAGE_TYPES,
)
# Instruction text sent to the model together with the uploaded image.
# The wording (including spelling) is passed to the model verbatim, so it is
# kept exactly as written; only the way the string is assembled differs.
_PROMPT_LINES = [
    "Generate a caption for the following coffee maker image. The caption has to be of the following structure:",
    '"A <color> <type>, <accessories>, <shape> shaped, with <screen> and <number> <b_color> butons"',
    "",  # blank line separating the template from the field descriptions
    "in which:",
    "- color: red, black, blue...",
    "- type: coffee machine, coffee maker, espresso coffee machine...",
    "- accessories: a list of accessories like the ones described above",
    "- shape: cubed, round...",
    "- screen: screen, no screen.",
    "- number: amount of buttons to add",
    "- b_color: color of the buttons",
]
prompt = "\n".join(_PROMPT_LINES)
# Caption the uploaded image (runs only once the user has provided a file).
if uploaded_image is not None:
    # Display the uploaded image
    image = Image.open(uploaded_image).convert("RGB")
    st.image(image, caption='Uploaded Image.', use_column_width=True)

    inputs = processor(
        text=prompt,
        images=image,
        return_tensors="pt",
        padding="longest",
    )
    # Length of the token sequence handed to the model. The generated sequence
    # starts with these same tokens, so this is where the new text begins.
    input_token_count = inputs["input_ids"].shape[-1]

    with st.spinner('Generating caption...'):
        # Budget only the newly generated tokens. `max_length` would also
        # count the input tokens, making the room left for the caption depend
        # on how many tokens the processor produced for the image and prompt.
        output = model.generate(**inputs, max_new_tokens=256)

    # Drop the echoed input at token level instead of slicing the decoded
    # string by len(prompt): decoding is not guaranteed to reproduce the
    # prompt character-for-character, and any separator the processor adds
    # after the prompt would otherwise leak into the caption.
    generated_tokens = output[0][input_token_count:]
    out = processor.decode(generated_tokens, skip_special_tokens=True).strip()

    # Display the extracted text
    st.text_area("Coffe machine caption", out, height=100)