Spaces:

awacke1
/

GPT-4o-omni-text-audio-image-video

Running

App Files Files Community

awacke1 committed on May 14, 2024

Commit

b866d44

verified ·

1 Parent(s): d907b5f

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -46

app.py CHANGED Viewed

@@ -67,15 +67,21 @@ def process_audio(audio_input):
         )
         st.markdown(response.choices[0].message.content)
 def process_video(video_path, seconds_per_frame=2):
     base64Frames = []
     base_video_path, _ = os.path.splitext(video_path)
     video = cv2.VideoCapture(video_path)
     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = video.get(cv2.CAP_PROP_FPS)
     frames_to_skip = int(fps * seconds_per_frame)
-    curr_frame=0
     # Loop through the video and extract frames at specified sampling rate
     while curr_frame < total_frames - 1:
@@ -86,6 +92,7 @@ def process_video(video_path, seconds_per_frame=2):
         _, buffer = cv2.imencode(".jpg", frame)
         base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
         curr_frame += frames_to_skip
     video.release()
     # Extract audio from video
@@ -97,54 +104,34 @@ def process_video(video_path, seconds_per_frame=2):
     print(f"Extracted {len(base64Frames)} frames")
     print(f"Extracted audio to {audio_path}")
     return base64Frames, audio_path
-# Extract 1 frame per second. You can adjust the `seconds_per_frame` parameter to change the sampling rate
-base64Frames, audio_path = process_video(VIDEO_PATH, seconds_per_frame=1)
-## Generate a summary with visual and audio
-def process_video(video_input):
-    base64Frames, audio_path = process_video(video_input, seconds_per_frame=1)
-    response = client.chat.completions.create(
-        model=MODEL,
-        messages=[
-        {"role": "system", "content":"""You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"""},
-        {"role": "user", "content": [
-            "These are the frames from the video.",
-            *map(lambda x: {"type": "image_url",
-                            "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
-            {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
             ],
-        }
-        ],
-        temperature=0,
-    )
-    st.markdown(response.choices[0].message.content)
-def process_video_frames(video_path, seconds_per_frame=2):
-    base64Frames = []
-    base_video_path, _ = os.path.splitext(video_path.name)
-    video = cv2.VideoCapture(video_path.name)
-    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = video.get(cv2.CAP_PROP_FPS)
-    frames_to_skip = int(fps * seconds_per_frame)
-    curr_frame = 0
-    while curr_frame < total_frames - 1:
-        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
-        success, frame = video.read()
-        if not success:
-            break
-        _, buffer = cv2.imencode(".jpg", frame)
-        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
-        curr_frame += frames_to_skip
-    video.release()
-    audio_path = f"{base_video_path}.mp3"
-    clip = VideoFileClip(video_path.name)
-    clip.audio.write_audiofile(audio_path, bitrate="32k")
-    clip.audio.close()
-    clip.close()
-    return base64Frames, audio_path
 def main():
     st.markdown("### OpenAI GPT-4o Model")

         )
         st.markdown(response.choices[0].message.content)
+def save_video(video_file):
+    # Save the uploaded video file
+    with open(video_file.name, "wb") as f:
+        f.write(video_file.getbuffer())
+    return video_file.name
 def process_video(video_path, seconds_per_frame=2):
     base64Frames = []
     base_video_path, _ = os.path.splitext(video_path)
     video = cv2.VideoCapture(video_path)
     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = video.get(cv2.CAP_PROP_FPS)
     frames_to_skip = int(fps * seconds_per_frame)
+    curr_frame = 0
     # Loop through the video and extract frames at specified sampling rate
     while curr_frame < total_frames - 1:
         _, buffer = cv2.imencode(".jpg", frame)
         base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
         curr_frame += frames_to_skip
     video.release()
     # Extract audio from video
     print(f"Extracted {len(base64Frames)} frames")
     print(f"Extracted audio to {audio_path}")
     return base64Frames, audio_path
+def ProcessVideo(video_input)
+    if video_input is not None:
+        # Save the uploaded video file
+        video_path = save_video(video_file)
+        # Process the saved video
+        base64Frames, audio_path = process_video(video_path, seconds_per_frame=1)
+        # Generate a summary with visual and audio
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+                {"role": "system", "content": """You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"""},
+                {"role": "user", "content": [
+                    "These are the frames from the video.",
+                    *map(lambda x: {"type": "image_url",
+                                    "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
+                    {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
+                ]},
             ],
+            temperature=0,
+        )
+        st.markdown(response.choices[0].message.content)
 def main():
     st.markdown("### OpenAI GPT-4o Model")