Update app.py
app.py
CHANGED
@@ -1,86 +1,97 @@
-import requests
-
-user_prompt = st.text_input("Enter your prompt", value="Describe this image in a paragraph", placeholder="e.g., What is shown in the image?")
-
-response = requests.get(image_url)
-img = Image.open(BytesIO(response.content))
-
-    new_width = int(new_height * aspect_ratio)
-    img_resized = img.resize((new_width, new_height))
-else:
-    img_resized = img  # No resizing needed if the image is smaller than the max height
-
-    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-    messages=messages,
-    max_tokens=500
-)
-
-model_response = completion.choices[0].message
-
-# Display the result in a clean and simple format
-# Display Content
-st.markdown(f"**Description**: {model_response.get('content', 'No description available')}")
+import os
import streamlit as st
+import onnxruntime as ort
+from transformers import AutoTokenizer, AutoProcessor
from PIL import Image
from io import BytesIO

+# Download ONNX models if they do not already exist
+if not os.path.exists("vision_encoder_q4f16.onnx"):
+    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/vision_encoder_q4f16.onnx')
+if not os.path.exists("decoder_model_merged_q4f16.onnx"):
+    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
+if not os.path.exists("embed_tokens_q4f16.onnx"):
+    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')

+# Load tokenizer and processor
+tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
+processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")

+# Load ONNX sessions
+vision_encoder_session = ort.InferenceSession("vision_encoder_q4f16.onnx")
+decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
+embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")

+# Streamlit App Configuration
+st.set_page_config(page_title="Vision-Based ONNX AI App", page_icon="🤖", layout="wide")
+st.title("🖼️ Vision-Based ONNX AI Demo App")
+st.markdown("<p style='text-align: center; font-size: 18px; color: #555;'>Upload an image and get a description</p>", unsafe_allow_html=True)

+# User Input: Image Upload
+uploaded_image = st.file_uploader("Upload an Image", type=["png", "jpg", "jpeg"])
+user_prompt = st.text_input("Enter your prompt", value="Describe this image in detail", placeholder="e.g., What is shown in the image?")

+# Display uploaded image
+def display_uploaded_image(uploaded_image):
+    try:
+        img = Image.open(uploaded_image)
+        st.image(img, caption="Uploaded Image", use_container_width=True)
+        return img
    except Exception as e:
+        st.error(f"❌ Unable to display image. Error: {e}")
+        return None

+# Process the uploaded image
+if st.button("Get Description"):
+    if uploaded_image and user_prompt:
        try:
+            # Display the uploaded image
+            img = display_uploaded_image(uploaded_image)
+            if img is None:
+                st.error("❌ Image processing failed.")
+                st.stop()
+
+            # Preprocess the image
+            img_buffer = BytesIO()
+            img.save(img_buffer, format="PNG")
+            img_bytes = img_buffer.getvalue()
+            processed_image = processor(images=img, return_tensors="np")
+
+            # Generate embeddings using the vision encoder
+            vision_embeddings = vision_encoder_session.run(
+                None, {"pixel_values": processed_image["pixel_values"]}
+            )[0]
+
+            # Tokenize the user prompt
+            inputs = tokenizer(user_prompt, return_tensors="np")
+            input_ids = inputs["input_ids"]
+
+            # Generate embedded tokens
+            embedded_tokens = embed_tokens_session.run(
+                None, {"input_ids": input_ids}
+            )[0]
+
+            # Generate a response using the decoder
+            decoder_outputs = decoder_session.run(
+                None, {
+                    "vision_embeddings": vision_embeddings,
+                    "embedded_tokens": embedded_tokens
                }
+            )[0]

+            # Decode the output
+            description = tokenizer.decode(decoder_outputs, skip_special_tokens=True)

+            # Display the description
            st.subheader("📝 Model Response")
+            st.markdown(f"**Description**: {description}")

        except Exception as e:
            st.error(f"❌ An error occurred: {e}")
    else:
+        st.warning("⚠️ Please upload an image and enter a prompt.")

+# UI Enhancements
st.markdown("""
<style>
.stButton>button {
@@ -102,6 +113,10 @@ st.markdown("""
    border-radius: 10px;
}

+.stFileUploader>div>div {
+    border-radius: 10px;
+}
+
/* Center the image */
.stImage {
    display: block;
@@ -109,4 +124,4 @@ st.markdown("""
    margin-right: auto;
}
</style>
-""", unsafe_allow_html=True)
+""", unsafe_allow_html=True)
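Note on the new inference path: the feed names passed to the ONNX sessions ("pixel_values", "input_ids", "vision_embeddings", "embedded_tokens") have to match the input names baked into the exported graphs, and those names differ between exports. A minimal sketch for checking them with onnxruntime's standard session API (the file names are the ones downloaded above; everything else is illustrative only):

import onnxruntime as ort

# Print the input/output signatures of each exported graph so the
# feed dictionaries used in app.py can be compared against them.
for path in ("vision_encoder_q4f16.onnx", "embed_tokens_q4f16.onnx", "decoder_model_merged_q4f16.onnx"):
    session = ort.InferenceSession(path)
    print(path)
    for inp in session.get_inputs():
        print("  input :", inp.name, inp.shape, inp.type)
    for out in session.get_outputs():
        print("  output:", out.name, out.shape, out.type)

If the printed decoder inputs use different names than the ones in the run() call, the feed dictionary in app.py would need to be renamed to match.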