Narayana02 committed
Commit af3702c · verified · 1 Parent(s): 24778dc

Update app.py

Files changed (1)
  1. app.py +78 -63
app.py CHANGED
@@ -1,86 +1,97 @@
  import streamlit as st
- from huggingface_hub import InferenceClient
- from config import HUGGINGFACE_API_KEY # Import your API key from a separate config file
  from PIL import Image
- import requests
  from io import BytesIO

- # Streamlit App Configuration
- st.set_page_config(page_title="Llama-3.2 Demo App", page_icon="🤖", layout="wide")
- st.title("🖼️ Llama-3.2-90B-Vision-Instruct Demo App")
- st.markdown("<p style='text-align: center; font-size: 18px; color: #555;'>Enter an image URL and get a description</p>", unsafe_allow_html=True)
-
- # User Inputs with placeholder
- image_url = st.text_input("Enter Image URL", value="", placeholder="Paste image URL here...", max_chars=400)
- user_prompt = st.text_input("Enter your prompt", value="Describe this image in a paragraph", placeholder="e.g., What is shown in the image?")

- # Function to display the image from URL with height limit based on its actual size
- def show_image_from_url(image_url, max_height=200):
-     try:
-         response = requests.get(image_url)
-         img = Image.open(BytesIO(response.content))

-         # Get the original image size
-         img_width, img_height = img.size

-         # Calculate the new height and width based on the max height while maintaining the aspect ratio
-         if img_height > max_height:
-             aspect_ratio = img_width / img_height
-             new_height = max_height
-             new_width = int(new_height * aspect_ratio)
-             img_resized = img.resize((new_width, new_height))
-         else:
-             img_resized = img # No resizing needed if the image is smaller than the max height

-         # Center the image and display it
-         st.image(img_resized, caption=f"Source: {image_url}", use_container_width=True)

      except Exception as e:
-         st.error(f"❌ Unable to load image. Error: {e}")

- # Process user input
- if st.button("Get Description", key="get_description"):
-     if image_url and user_prompt:
          try:
-             # Show the image with dynamic resizing based on the image size
-             show_image_from_url(image_url, max_height=600)
-
-             # Initialize the InferenceClient
-             client = InferenceClient(api_key=HUGGINGFACE_API_KEY)
-
-             # Define messages for the model
-             messages = [
-                 {
-                     "role": "user",
-                     "content": [
-                         {"type": "text", "text": user_prompt},
-                         {"type": "image_url", "image_url": {"url": image_url}}
-                     ]
                  }
-             ]

-             # Call the model
-             completion = client.chat.completions.create(
-                 model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-                 messages=messages,
-                 max_tokens=500
-             )

-             # Extract JSON response
-             model_response = completion.choices[0].message
-
-             # Display the result in a clean and simple format
              st.subheader("📝 Model Response")
-
-             # Display Content
-             st.markdown(f"**Description**: {model_response.get('content', 'No description available')}")

          except Exception as e:
              st.error(f"❌ An error occurred: {e}")
      else:
-         st.warning("⚠️ Please enter an image URL and a prompt.")

- # Clean UI Enhancements
  st.markdown("""
  <style>
      .stButton>button {
@@ -102,6 +113,10 @@ st.markdown("""
          border-radius: 10px;
      }

      /* Center the image */
      .stImage {
          display: block;
@@ -109,4 +124,4 @@ st.markdown("""
          margin-right: auto;
      }
  </style>
- """, unsafe_allow_html=True)
 
+ import os
  import streamlit as st
+ import onnxruntime as ort
+ from transformers import AutoTokenizer, AutoProcessor
  from PIL import Image
  from io import BytesIO

+ # Download ONNX models if they do not already exist
+ if not os.path.exists("vision_encoder_q4f16.onnx"):
+     os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/vision_encoder_q4f16.onnx')
+ if not os.path.exists("decoder_model_merged_q4f16.onnx"):
+     os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
+ if not os.path.exists("embed_tokens_q4f16.onnx"):
+     os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')

+ # Load tokenizer and processor
+ tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
+ processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")

+ # Load ONNX sessions
+ vision_encoder_session = ort.InferenceSession("vision_encoder_q4f16.onnx")
+ decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
+ embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")

+ # Streamlit App Configuration
+ st.set_page_config(page_title="Vision-Based ONNX AI App", page_icon="🤖", layout="wide")
+ st.title("🖼️ Vision-Based ONNX AI Demo App")
+ st.markdown("<p style='text-align: center; font-size: 18px; color: #555;'>Upload an image and get a description</p>", unsafe_allow_html=True)

+ # User Input: Image Upload
+ uploaded_image = st.file_uploader("Upload an Image", type=["png", "jpg", "jpeg"])
+ user_prompt = st.text_input("Enter your prompt", value="Describe this image in detail", placeholder="e.g., What is shown in the image?")

+ # Display uploaded image
+ def display_uploaded_image(uploaded_image):
+     try:
+         img = Image.open(uploaded_image)
+         st.image(img, caption="Uploaded Image", use_container_width=True)
+         return img
      except Exception as e:
+         st.error(f"❌ Unable to display image. Error: {e}")
+         return None

+ # Process the uploaded image
+ if st.button("Get Description"):
+     if uploaded_image and user_prompt:
          try:
+             # Display the uploaded image
+             img = display_uploaded_image(uploaded_image)
+             if img is None:
+                 st.error("❌ Image processing failed.")
+                 st.stop()
+
+             # Preprocess the image
+             img_buffer = BytesIO()
+             img.save(img_buffer, format="PNG")
+             img_bytes = img_buffer.getvalue()
+             processed_image = processor(images=img, return_tensors="np")
+
+             # Generate embeddings using the vision encoder
+             vision_embeddings = vision_encoder_session.run(
+                 None, {"pixel_values": processed_image["pixel_values"]}
+             )[0]
+
+             # Tokenize the user prompt
+             inputs = tokenizer(user_prompt, return_tensors="np")
+             input_ids = inputs["input_ids"]
+
+             # Generate embedded tokens
+             embedded_tokens = embed_tokens_session.run(
+                 None, {"input_ids": input_ids}
+             )[0]
+
+             # Generate a response using the decoder
+             decoder_outputs = decoder_session.run(
+                 None, {
+                     "vision_embeddings": vision_embeddings,
+                     "embedded_tokens": embedded_tokens
                  }
+             )[0]

+             # Decode the output
+             description = tokenizer.decode(decoder_outputs, skip_special_tokens=True)

+             # Display the description
              st.subheader("📝 Model Response")
+             st.markdown(f"**Description**: {description}")

          except Exception as e:
              st.error(f"❌ An error occurred: {e}")
      else:
+         st.warning("⚠️ Please upload an image and enter a prompt.")

+ # UI Enhancements
  st.markdown("""
  <style>
      .stButton>button {

          border-radius: 10px;
      }

+     .stFileUploader>div>div {
+         border-radius: 10px;
+     }
+
      /* Center the image */
      .stImage {
          display: block;

          margin-right: auto;
      }
  </style>
+ """, unsafe_allow_html=True)
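The new app.py feeds named tensors to each onnxruntime session: "pixel_values" for the vision encoder, "input_ids" for the token embedder, and "vision_embeddings" / "embedded_tokens" for the merged decoder. session.run only accepts names that the exported graphs actually declare, so it is worth confirming them before launching the app. A minimal inspection sketch (the .onnx filenames are taken from the commit above; the script itself is generic onnxruntime usage and not part of this repo):

import onnxruntime as ort

# Inspection helper (assumes the three files downloaded by app.py are present):
# print the input and output names each exported graph declares so they can be
# checked against the keys used in the session.run() calls.
for path in ("vision_encoder_q4f16.onnx",
             "decoder_model_merged_q4f16.onnx",
             "embed_tokens_q4f16.onnx"):
    session = ort.InferenceSession(path)
    print(path)
    print("  inputs: ", [(i.name, i.shape) for i in session.get_inputs()])
    print("  outputs:", [(o.name, o.shape) for o in session.get_outputs()])

If the declared decoder inputs differ from the two keys used above, the decoder_session.run call in app.py would need to be adapted to match.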