awacke1 committed
Commit f7c75cd
1 Parent(s): 65224df

Update app.py

Files changed (1)
app.py +158 -45
app.py CHANGED
Old side of the diff (removals marked "-"):

@@ -5,54 +5,109 @@ import os
 import base64
 import cv2
 from moviepy.editor import VideoFileClip
-
-# documentation
-# 1. Cookbook: https://cookbook.openai.com/examples/gpt4o/introduction_to_gpt4o
-# 2. Configure your Project and Orgs to limit/allow Models: https://platform.openai.com/settings/organization/general
-# 3. Watch your Billing! https://platform.openai.com/settings/organization/billing/overview
-
 
 # Set API key and organization ID from environment variables
 openai.api_key = os.getenv('OPENAI_API_KEY')
 openai.organization = os.getenv('OPENAI_ORG_ID')
-client = OpenAI(api_key= os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
 
 # Define the model to be used
-#MODEL = "gpt-4o"
 MODEL = "gpt-4o-2024-05-13"
 
-def process_text():
-    text_input = st.text_input("Enter your text:")
     if text_input:
         completion = client.chat.completions.create(
             model=MODEL,
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant. Help me with my math homework!"},
-                {"role": "user", "content": f"Hello! Could you solve {text_input}?"}
-            ]
         )
-        st.write("Assistant: " + completion.choices[0].message.content)
 
 def process_image(image_input):
     if image_input:
         base64_image = base64.b64encode(image_input.read()).decode("utf-8")
         response = client.chat.completions.create(
             model=MODEL,
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant that responds in Markdown."},
-                {"role": "user", "content": [
-                    {"type": "text", "text": "Help me understand what is in this picture and list ten facts as markdown outline with appropriate emojis that describes what you see."},
-                    {"type": "image_url", "image_url": {
-                        "url": f"data:image/png;base64,{base64_image}"}
-                    }
-                ]}
-            ],
             temperature=0.0,
         )
-        st.markdown(response.choices[0].message.content)
 
 def process_audio(audio_input):
     if audio_input:
         transcription = client.audio.transcriptions.create(
             model="whisper-1",
             file=audio_input,
@@ -65,10 +120,15 @@ def process_audio(audio_input):
             ],
             temperature=0,
         )
-        st.markdown(response.choices[0].message.content)
 
 def process_audio_for_video(video_input):
     if video_input:
         transcription = client.audio.transcriptions.create(
             model="whisper-1",
             file=video_input,
@@ -81,8 +141,12 @@ def process_audio_for_video(video_input):
             ],
             temperature=0,
         )
-        st.markdown(response.choices[0].message.content)
-        return response.choices[0].message.content
 
 def save_video(video_file):
     # Save the uploaded video file
@@ -126,7 +190,7 @@ def process_video(video_path, seconds_per_frame=2):
 def process_audio_and_video(video_input):
     if video_input is not None:
         # Save the uploaded video file
-        video_path = save_video(video_input )
 
         # Process the saved video
         base64Frames, audio_path = process_video(video_path, seconds_per_frame=1)
@@ -135,29 +199,31 @@ def process_audio_and_video(video_input):
         transcript = process_audio_for_video(video_input)
 
         # Generate a summary with visual and audio
         response = client.chat.completions.create(
             model=MODEL,
-            messages=[
-                {"role": "system", "content": """You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"""},
-                {"role": "user", "content": [
-                    "These are the frames from the video.",
-                    *map(lambda x: {"type": "image_url",
-                                    "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
-                    {"type": "text", "text": f"The audio transcription is: {transcript}"}
-                ]},
-            ],
             temperature=0,
         )
-
-        st.markdown(response.choices[0].message.content)
-
 
 def main():
-    st.markdown("### OpenAI GPT-4o Model")
-    st.markdown("#### The Omni Model with Text, Audio, Image, and Video")
     option = st.selectbox("Select an option", ("Text", "Image", "Audio", "Video"))
     if option == "Text":
-        process_text()
     elif option == "Image":
         image_input = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
         process_image(image_input)
@@ -167,6 +233,53 @@ def main():
     elif option == "Video":
         video_input = st.file_uploader("Upload a video file", type=["mp4"])
         process_audio_and_video(video_input)
 
 if __name__ == "__main__":
-    main()
 
New side of the diff (additions marked "+"):

@@ -5,54 +5,109 @@ import os
 import base64
 import cv2
 from moviepy.editor import VideoFileClip
+import pytz
+from datetime import datetime
 
 # Set API key and organization ID from environment variables
 openai.api_key = os.getenv('OPENAI_API_KEY')
 openai.organization = os.getenv('OPENAI_ORG_ID')
+client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
 
 # Define the model to be used
 MODEL = "gpt-4o-2024-05-13"
 
+def generate_filename(prompt, file_type):
+    central = pytz.timezone('US/Central')
+    safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
+    replaced_prompt = prompt.replace(" ", "_").replace("\n", "_")
+    safe_prompt = "".join(x for x in replaced_prompt if x.isalnum() or x == "_")[:90]
+    return f"{safe_date_time}_{safe_prompt}.{file_type}"
+
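generate_filename stamps a US/Central time prefix and keeps only alphanumerics and underscores from the prompt (capped at 90 characters). A quick hypothetical check of the behavior as committed:

```python
# Hypothetical example (assumes a US/Central clock reading of May 21, 2:30 PM):
# "What is 2+2" -> "What_is_2+2" -> "+" fails isalnum(), so it is dropped.
print(generate_filename("What is 2+2", "md"))   # -> "0521_1430_What_is_22.md"
```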
+def create_file(filename, prompt, response, should_save=True):
+    if not should_save:
+        return
+    base_filename, ext = os.path.splitext(filename)
+    if ext in ['.txt', '.htm', '.md']:
+        with open(f"{base_filename}.md", 'w', encoding='utf-8') as file:
+            file.write(response)
+
+def process_text(text_input):
+    if text_input:
+        st.session_state.messages.append({"role": "user", "content": text_input})
+
+        with st.chat_message("user"):
+            st.markdown(text_input)
+
+        with st.chat_message("assistant"):
+            completion = client.chat.completions.create(
+                model=MODEL,
+                messages=[
+                    {"role": m["role"], "content": m["content"]}
+                    for m in st.session_state.messages
+                ],
+                stream=False
+            )
+            return_text = completion.choices[0].message.content
+            st.write("Assistant: " + return_text)
+            filename = generate_filename(text_input, "md")
+            create_file(filename, text_input, return_text, should_save=True)
+            st.session_state.messages.append({"role": "assistant", "content": return_text})
+
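Note that process_text (and every handler below) appends to st.session_state.messages, but no hunk in this diff initializes that key, so a fresh session would raise on first use. A minimal guard, assumed to live near the top of main() (not part of this commit):

```python
import streamlit as st

# Assumed initialization, not in the diff: create the history once per session.
if "messages" not in st.session_state:
    st.session_state.messages = []
```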
+def process_text2(MODEL='gpt-4o-2024-05-13', text_input='What is 2+2 and what is an imaginary number'):
     if text_input:
+        st.session_state.messages.append({"role": "user", "content": text_input})
         completion = client.chat.completions.create(
             model=MODEL,
+            messages=st.session_state.messages
         )
+        return_text = completion.choices[0].message.content
+        st.write("Assistant: " + return_text)
+        filename = generate_filename(text_input, "md")
+        create_file(filename, text_input, return_text, should_save=True)
+        return return_text
+
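process_text2 is the reply-returning variant used by the chatbot entries in main(); its MODEL parameter shadows the module constant, so a caller could pin a different snapshot per call. A hypothetical invocation:

```python
# Hypothetical call; both arguments fall back to the defaults shown above.
reply = process_text2(MODEL="gpt-4o-2024-05-13", text_input="What is 2+2?")
```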
+def save_image(image_input, filename):
+    # Save the uploaded image file
+    with open(filename, "wb") as f:
+        f.write(image_input.getvalue())
+    return filename
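save_image reads the upload with getvalue() rather than read(), which matters because process_image below consumes the stream with read() before saving. A small illustration, with io.BytesIO standing in for the Streamlit upload (an assumption about its file-like behavior):

```python
import io

buf = io.BytesIO(b"fake image bytes")           # stand-in for the uploaded file
_ = buf.read()                                  # stream exhausted, as in process_image
assert buf.read() == b""                        # a second read() returns nothing
assert buf.getvalue() == b"fake image bytes"    # getvalue() still sees the whole buffer
```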
 
 def process_image(image_input):
     if image_input:
+        st.markdown('Processing image: ' + image_input.name)
         base64_image = base64.b64encode(image_input.read()).decode("utf-8")
+        st.session_state.messages.append({"role": "user", "content": [
+            {"type": "text", "text": "Help me understand what is in this picture and list ten facts as markdown outline with appropriate emojis that describes what you see."},
+            {"type": "image_url", "image_url": {
+                "url": f"data:image/png;base64,{base64_image}"}
+            }
+        ]})
         response = client.chat.completions.create(
             model=MODEL,
+            messages=st.session_state.messages,
             temperature=0.0,
         )
+        image_response = response.choices[0].message.content
+        st.markdown(image_response)
+
+        filename_md = generate_filename(image_input.name + '- ' + image_response, "md")
+        filename_png = filename_md.replace('.md', '.' + image_input.name.split('.')[-1])
+
+        create_file(filename_md, image_response, '', True)
+
+        with open(filename_md, "w", encoding="utf-8") as f:
+            f.write(image_response)
+
+        filename_img = image_input.name
+        save_image(image_input, filename_img)
+
+        st.session_state.messages.append({"role": "assistant", "content": image_response})
+
+        return image_response
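As committed, the save step writes filename_md twice — create_file is called with an empty response, then the same file is reopened and written directly — and filename_png is computed but never used. A tighter equivalent of the same step (a sketch, not the committed code):

```python
# Sketch only: one write of the Markdown reply, plus the saved upload.
filename_md = generate_filename(image_input.name + '- ' + image_response, "md")
create_file(filename_md, image_input.name, image_response, should_save=True)
save_image(image_input, image_input.name)
```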
 
 def process_audio(audio_input):
     if audio_input:
+        st.session_state.messages.append({"role": "user", "content": audio_input})
         transcription = client.audio.transcriptions.create(
             model="whisper-1",
             file=audio_input,
 
@@ -65,10 +120,15 @@ def process_audio(audio_input):
             ],
             temperature=0,
         )
+        audio_response = response.choices[0].message.content
+        st.markdown(audio_response)
+        filename = generate_filename(transcription.text, "md")
+        create_file(filename, transcription.text, audio_response, should_save=True)
+        st.session_state.messages.append({"role": "assistant", "content": audio_response})
 
 def process_audio_for_video(video_input):
     if video_input:
+        st.session_state.messages.append({"role": "user", "content": video_input})
         transcription = client.audio.transcriptions.create(
             model="whisper-1",
             file=video_input,
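Both audio handlers append the raw upload object to st.session_state.messages as the user turn. If that history is later replayed to chat.completions.create — as the chatbot entry in main() does — a file object is not a valid message content. Storing the transcript text instead would keep the history API-safe (a sketch, not the committed behavior):

```python
# Sketch: keep the replayable history text-only.
transcription = client.audio.transcriptions.create(model="whisper-1", file=audio_input)
st.session_state.messages.append(
    {"role": "user", "content": f"Audio transcript: {transcription.text}"}
)
```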
 
@@ -81,8 +141,12 @@ def process_audio_for_video(video_input):
             ],
             temperature=0,
         )
+        video_response = response.choices[0].message.content
+        st.markdown(video_response)
+        filename = generate_filename(transcription, "md")
+        create_file(filename, transcription, video_response, should_save=True)
+        st.session_state.messages.append({"role": "assistant", "content": video_response})
+        return video_response
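Unlike process_audio above, this hunk passes the whole Transcription object — not transcription.text — to generate_filename and create_file; generate_filename immediately calls .replace on its argument, so this likely fails at runtime. The matching fix (a sketch, not part of the commit):

```python
# Sketch: mirror process_audio by passing the transcript text.
filename = generate_filename(transcription.text, "md")
create_file(filename, transcription.text, video_response, should_save=True)
```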
 
 def save_video(video_file):
     # Save the uploaded video file
 
@@ -126,7 +190,7 @@ def process_video(video_path, seconds_per_frame=2):
 def process_audio_and_video(video_input):
     if video_input is not None:
         # Save the uploaded video file
+        video_path = save_video(video_input)
 
         # Process the saved video
         base64Frames, audio_path = process_video(video_path, seconds_per_frame=1)
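process_video itself is unchanged by this commit, so its body does not appear in the diff. For orientation, a typical cv2/moviepy implementation of the signature used here — an assumed sketch in the style of the GPT-4o cookbook, not the file's actual code:

```python
import base64
import cv2
from moviepy.editor import VideoFileClip

def process_video(video_path, seconds_per_frame=2):
    # Assumed sketch: sample one frame every `seconds_per_frame` seconds,
    # JPEG-encode each frame as base64, and extract the audio track.
    base64_frames = []
    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    frames_to_skip = max(1, int(fps * seconds_per_frame))
    for idx in range(0, total_frames, frames_to_skip):
        video.set(cv2.CAP_PROP_POS_FRAMES, idx)
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
    video.release()

    audio_path = video_path.rsplit(".", 1)[0] + ".mp3"
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_path, bitrate="32k")
    clip.close()
    return base64_frames, audio_path
```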
 
@@ -135,29 +199,31 @@ def process_audio_and_video(video_input):
         transcript = process_audio_for_video(video_input)
 
         # Generate a summary with visual and audio
+        st.session_state.messages.append({"role": "user", "content": [
+            "These are the frames from the video.",
+            *map(lambda x: {"type": "image_url",
+                            "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
+            {"type": "text", "text": f"The audio transcription is: {transcript}"}
+        ]})
         response = client.chat.completions.create(
             model=MODEL,
+            messages=st.session_state.messages,
             temperature=0,
         )
+        video_response = response.choices[0].message.content
+        st.markdown(video_response)
+
+        filename = generate_filename(transcript, "md")
+        create_file(filename, transcript, video_response, should_save=True)
+        st.session_state.messages.append({"role": "assistant", "content": video_response})
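Because every frame is appended to st.session_state.messages, each later chat turn re-sends all of the base64 frames to the API. Sending the frames in a one-off request and persisting only text would avoid that (a sketch, not the committed behavior):

```python
# Sketch: one-off request carrying the frames; history keeps only text.
frame_message = {"role": "user", "content": [
    "These are the frames from the video.",
    *map(lambda x: {"type": "image_url",
                    "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
    {"type": "text", "text": f"The audio transcription is: {transcript}"}
]}
response = client.chat.completions.create(
    model=MODEL,
    messages=st.session_state.messages + [frame_message],
    temperature=0,
)
st.session_state.messages.append(
    {"role": "user", "content": f"(Uploaded a video; transcript: {transcript})"}
)
```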
 
 def main():
+    st.markdown("##### GPT-4o Omni Model: Text, Audio, Image, & Video")
     option = st.selectbox("Select an option", ("Text", "Image", "Audio", "Video"))
     if option == "Text":
+        text_input = st.text_input("Enter your text:")
+        if text_input:
+            process_text(text_input)
     elif option == "Image":
         image_input = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
         process_image(image_input)
 
@@ -167,6 +233,53 @@ def main():
     elif option == "Video":
         video_input = st.file_uploader("Upload a video file", type=["mp4"])
         process_audio_and_video(video_input)
+
+    # File Gallery
+    all_files = glob.glob("*.md")
+    all_files = [file for file in all_files if len(os.path.splitext(file)[0]) >= 10]  # exclude files with short names
+    all_files.sort(key=lambda x: (os.path.splitext(x)[1], x), reverse=True)  # sort by filename length which puts similar prompts together - consider making date and time of file optional.
+
+    st.sidebar.title("File Gallery")
+    for file in all_files:
+        with st.sidebar.expander(file):
+            with open(file, "r", encoding="utf-8") as f:
+                file_content = f.read()
+                st.code(file_content, language="markdown")
+
+    # ChatBot Entry
+    if prompt := st.chat_input("GPT-4o Multimodal ChatBot - What can I help you with?"):
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        with st.chat_message("user"):
+            st.markdown(prompt)
+        with st.chat_message("assistant"):
+            completion = client.chat.completions.create(
+                model=MODEL,
+                messages=st.session_state.messages,
+                stream=True
+            )
+            response = process_text2(text_input=prompt)
+            st.session_state.messages.append({"role": "assistant", "content": response})
+
+    # Transcript to arxiv and client chat completion
+    filename = save_and_play_audio(audio_recorder)
+    if filename is not None:
+        transcript = transcribe_canary(filename)
+
+        # Search ArXiV and get the Summary and Reference Papers Listing
+        result = search_arxiv(transcript)
+
+        # Start chatbot with transcript:
+        st.session_state.messages.append({"role": "user", "content": transcript})
+        with st.chat_message("user"):
+            st.markdown(transcript)
+        with st.chat_message("assistant"):
+            completion = client.chat.completions.create(
+                model=MODEL,
+                messages=st.session_state.messages,
+                stream=True
+            )
+            response = process_text2(text_input=prompt)
+            st.session_state.messages.append({"role": "assistant", "content": response})
 
 if __name__ == "__main__":
+    main()
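Two review notes on the chatbot entries above: the streamed completion is created but never consumed — process_text2 then issues a second, non-streaming call — and the transcript branch passes prompt rather than transcript to process_text2. Also, save_and_play_audio, audio_recorder, transcribe_canary, and search_arxiv are not defined anywhere in this diff, so they presumably come from elsewhere in the file. A pattern that would actually consume the stream, assuming a Streamlit version with st.write_stream (a sketch, not the committed code):

```python
with st.chat_message("assistant"):
    stream = client.chat.completions.create(
        model=MODEL,
        messages=st.session_state.messages,
        stream=True,
    )
    # Render chunks as they arrive; st.write_stream returns the full text.
    response = st.write_stream(
        chunk.choices[0].delta.content or "" for chunk in stream
    )
st.session_state.messages.append({"role": "assistant", "content": response})
```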