Update app.py
app.py
CHANGED
@@ -1,86 +1,97 @@
-import requests
-
-user_prompt = st.text_input("Enter your prompt", value="Describe this image in a paragraph", placeholder="e.g., What is shown in the image?")
-
-response = requests.get(image_url)
-img = Image.open(BytesIO(response.content))
-
-    new_width = int(new_height * aspect_ratio)
-    img_resized = img.resize((new_width, new_height))
-else:
-    img_resized = img  # No resizing needed if the image is smaller than the max height
-
-    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-    messages=messages,
-    max_tokens=500
-)
-
-model_response = completion.choices[0].message
-
-# Display the result in a clean and simple format
-# Display Content
-st.markdown(f"**Description**: {model_response.get('content', 'No description available')}")
+import os
import streamlit as st
+import onnxruntime as ort
+from transformers import AutoTokenizer, AutoProcessor
from PIL import Image
from io import BytesIO

+# Download ONNX models if they do not already exist
+if not os.path.exists("vision_encoder_q4f16.onnx"):
+    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/vision_encoder_q4f16.onnx')
+if not os.path.exists("decoder_model_merged_q4f16.onnx"):
+    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
+if not os.path.exists("embed_tokens_q4f16.onnx"):
+    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')

+# Load tokenizer and processor
+tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
+processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")

+# Load ONNX sessions
+vision_encoder_session = ort.InferenceSession("vision_encoder_q4f16.onnx")
+decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
+embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")

+# Streamlit App Configuration
+st.set_page_config(page_title="Vision-Based ONNX AI App", page_icon="🤖", layout="wide")
+st.title("🖼️ Vision-Based ONNX AI Demo App")
+st.markdown("<p style='text-align: center; font-size: 18px; color: #555;'>Upload an image and get a description</p>", unsafe_allow_html=True)

+# User Input: Image Upload
+uploaded_image = st.file_uploader("Upload an Image", type=["png", "jpg", "jpeg"])
+user_prompt = st.text_input("Enter your prompt", value="Describe this image in detail", placeholder="e.g., What is shown in the image?")

+# Display uploaded image
+def display_uploaded_image(uploaded_image):
+    try:
+        img = Image.open(uploaded_image)
+        st.image(img, caption="Uploaded Image", use_container_width=True)
+        return img
    except Exception as e:
+        st.error(f"❌ Unable to display image. Error: {e}")
+        return None

+# Process the uploaded image
+if st.button("Get Description"):
+    if uploaded_image and user_prompt:
        try:
+            # Display the uploaded image
+            img = display_uploaded_image(uploaded_image)
+            if img is None:
+                st.error("❌ Image processing failed.")
+                st.stop()
+
+            # Preprocess the image
+            img_buffer = BytesIO()
+            img.save(img_buffer, format="PNG")
+            img_bytes = img_buffer.getvalue()
+            processed_image = processor(images=img, return_tensors="np")
+
+            # Generate embeddings using the vision encoder
+            vision_embeddings = vision_encoder_session.run(
+                None, {"pixel_values": processed_image["pixel_values"]}
+            )[0]
+
+            # Tokenize the user prompt
+            inputs = tokenizer(user_prompt, return_tensors="np")
+            input_ids = inputs["input_ids"]
+
+            # Generate embedded tokens
+            embedded_tokens = embed_tokens_session.run(
+                None, {"input_ids": input_ids}
+            )[0]
+
+            # Generate a response using the decoder
+            decoder_outputs = decoder_session.run(
+                None, {
+                    "vision_embeddings": vision_embeddings,
+                    "embedded_tokens": embedded_tokens
                }
+            )[0]

+            # Decode the output
+            description = tokenizer.decode(decoder_outputs, skip_special_tokens=True)

+            # Display the description
            st.subheader("📝 Model Response")
+            st.markdown(f"**Description**: {description}")

        except Exception as e:
            st.error(f"❌ An error occurred: {e}")
    else:
+        st.warning("⚠️ Please upload an image and enter a prompt.")

+# UI Enhancements
st.markdown("""
<style>
.stButton>button {
@@ -102,6 +113,10 @@ st.markdown("""
    border-radius: 10px;
}

+.stFileUploader>div>div {
+    border-radius: 10px;
+}
+
/* Center the image */
.stImage {
    display: block;
@@ -109,4 +124,4 @@ st.markdown("""
    margin-right: auto;
}
</style>
-""", unsafe_allow_html=True)
+""", unsafe_allow_html=True)
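Note on the new inference path: the feed names passed to the ONNX sessions ("pixel_values", "input_ids", "vision_embeddings", "embedded_tokens") have to match the input names baked into the exported graphs, and those names differ between exports. A minimal sketch for checking them with onnxruntime's standard session API (the file names are the ones downloaded above; everything else is illustrative only):

import onnxruntime as ort

# Print the input/output signatures of each exported graph so the
# feed dictionaries used in app.py can be compared against them.
for path in ("vision_encoder_q4f16.onnx", "embed_tokens_q4f16.onnx", "decoder_model_merged_q4f16.onnx"):
    session = ort.InferenceSession(path)
    print(path)
    for inp in session.get_inputs():
        print("  input :", inp.name, inp.shape, inp.type)
    for out in session.get_outputs():
        print("  output:", out.name, out.shape, out.type)

If the printed decoder inputs use different names than the ones in the run() call, the feed dictionary in app.py would need to be renamed to match.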