|
import gradio as gr |
|
from gradio_client import Client |
|
import os |
|
import json |
|
import re |
|
from moviepy.editor import * |
|
import cv2 |
|
|
|
# Hugging Face access token, read from the HF_TKN environment variable
# (None when the variable is not set).
hf_token = os.getenv("HF_TKN")
|
|
|
def extract_firstframe(video_in):
    """Save the first frame of a video as ``first_frame.jpg``.

    Args:
        video_in: Path of the video file, handed to ``cv2.VideoCapture``.

    Returns:
        The string ``"first_frame.jpg"``, i.e. the path the frame was
        written to (relative to the current working directory).

    Raises:
        gr.Error: If no frame can be decoded from ``video_in`` or the
            image file cannot be written.
    """
    output_path = "first_frame.jpg"

    vidcap = cv2.VideoCapture(video_in)
    try:
        # Only the first frame is needed, so one read replaces the former
        # counter-driven loop (which also decoded a second, unused frame).
        success, image = vidcap.read()
    finally:
        # Always free the capture handle, even if decoding raised.
        vidcap.release()

    if not success:
        # Previously the function returned the output path anyway, pointing
        # at a missing (or stale) file.
        raise gr.Error("Could not read the first frame of the input video.")

    if not cv2.imwrite(output_path, image):
        raise gr.Error("Could not save the first frame of the input video.")

    print("Done extracted first frame!")
    return output_path
|
|
|
def extract_audio(video_in):
    """Write the audio track of a video file to ``audio.wav``.

    Args:
        video_in: Path of the video file, opened with ``VideoFileClip``.

    Returns:
        The string ``"audio.wav"``, i.e. the path of the written file
        (relative to the current working directory).

    Raises:
        gr.Error: If the clip exposes no audio track.
    """
    output_audio = "audio.wav"

    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # moviepy leaves ``audio`` unset for silent videos; fail with a
            # readable message instead of an AttributeError.
            raise gr.Error("The video does not contain an audio track.")
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        # Release the file readers held by the clip.
        video_clip.close()

    print("Audio extraction complete.")
    return output_audio
|
|
|
def get_caption_from_kosmos(image_in):
    """Caption an image with the Kosmos-2 Space and clean up the text.

    The Space is called with ``image_in`` and the ``"Detailed"`` option.
    The second element of its result is opened as a JSON file; the first
    item of every entry in that file is joined with spaces to rebuild the
    sentence. The leading ``Describe this image in detail:`` prompt is
    stripped, and the text is cut after its last period so the caption does
    not end mid-sentence.

    Args:
        image_in: Image input forwarded to the Space (a file path when
            called with the output of ``extract_firstframe``).

    Returns:
        The cleaned caption string. When the expected prompt prefix is
        absent the whole rebuilt sentence is used; when the text contains
        no period it is returned untruncated.
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
    kosmos2_result = kosmos2_client.predict(
        image_in,
        "Detailed",
        fn_index=4,
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(kosmos2_result[1], "r", encoding="utf-8") as f:
        data = json.load(f)

    full_sentence = " ".join(sublist[0] for sublist in data)

    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        print("Unable to locate valid description.")
        # Previously ``description`` stayed unbound here and the next line
        # raised UnboundLocalError; fall back to the unstripped sentence.
        description = full_sentence

    last_period_index = description.rfind(".")
    if last_period_index == -1:
        # No period at all: keep the text rather than slicing it to "".
        truncated_caption = description
    else:
        truncated_caption = description[: last_period_index + 1]

    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
    return truncated_caption
|
|
|
def get_caption(image_in):
    """Request a one-sentence description of an image from the moondream1 Space.

    Args:
        image_in: Image input forwarded to the Space's ``/predict`` endpoint.

    Returns:
        Whatever the endpoint returns (printed before being returned);
        ``infer`` uses it as the text prompt for the audio models.
    """
    moondream = Client("https://fffiloni-moondream1.hf.space/", hf_token=hf_token)
    question = "Describe precisely the image in one sentence."
    caption = moondream.predict(image_in, question, api_name="/predict")
    print(caption)
    return caption
|
|
|
def get_magnet(prompt):
    """Generate audio for a text prompt through the MAGNeT Space.

    Args:
        prompt: Text description; it is formatted into a string and printed
            before being sent.

    Returns:
        The second element of the ``/predict_full`` result
        (presumably the generated audio file; confirm against the Space's API).
    """
    text_prompt = f"{prompt}"
    print(text_prompt)

    magnet = Client("https://fffiloni-magnet.hf.space/")

    # Positional inputs of the endpoint, in the order it expects them.
    # The meaning of the numeric settings is defined by the Space's API.
    endpoint_inputs = (
        "facebook/audio-magnet-medium",
        "",
        text_prompt,
        3,
        0.9,
        10,
        1,
        20,
        10,
        10,
        10,
        "prod-stride1 (new!)",
    )
    outputs = magnet.predict(*endpoint_inputs, api_name="/predict_full")
    print(outputs)
    return outputs[1]
|
|
|
def get_audioldm(prompt):
    """Generate audio for a text prompt through the AudioLDM-2 Space.

    The Space's result is passed to ``extract_audio``, which opens it as a
    video clip and writes its audio track to a WAV file.

    Args:
        prompt: Text description sent to the Space, together with the fixed
            text ``"Low quality. Music."`` and four numeric settings.

    Returns:
        The path returned by ``extract_audio``.
    """
    audioldm = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
    generated = audioldm.predict(
        prompt,
        "Low quality. Music.",
        10,
        3.5,
        45,
        3,
        fn_index=1,
    )
    print(generated)
    return extract_audio(generated)
|
|
|
def get_audiogen(prompt):
    """Generate audio for a text prompt through the AudioGen Space.

    Args:
        prompt: Text description sent to the Space's ``/infer`` endpoint,
            followed by the fixed value ``10``.

    Returns:
        The endpoint's result, unchanged.
    """
    audiogen = Client("https://fffiloni-audiogen.hf.space/")
    return audiogen.predict(prompt, 10, api_name="/infer")
|
|
|
def get_tango(prompt):
    """Generate audio for a text prompt through the Tango Space.

    Args:
        prompt: Text description sent to the Space's ``/predict`` endpoint,
            followed by the fixed values ``100`` and ``4``.

    Returns:
        The endpoint's result, unchanged (printed before being returned).

    Raises:
        gr.Error: If the client for the Space cannot be created.
    """
    try:
        client = Client("https://declare-lab-tango.hf.space/")
    except Exception as err:
        # ``except Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate; ``from err`` keeps the root cause in the
        # traceback.
        raise gr.Error(
            "Tango space API is not ready, please try again in few minutes "
        ) from err

    result = client.predict(
        prompt,
        100,
        4,
        api_name="/predict",
    )
    print(result)
    return result
|
|
|
def blend_vsfx(video_in, audio_result):
    """Replace a video's soundtrack with the generated audio.

    The longer of the two clips is trimmed to the duration of the shorter
    one, then the result is encoded with H.264 video and AAC audio.

    Args:
        video_in: Path of the source video.
        audio_result: Path of the audio file to lay over the video.

    Returns:
        The string ``"final_video_with_sound.mp4"``, i.e. the path of the
        written file (relative to the current working directory).
    """
    output_path = "final_video_with_sound.mp4"

    audio_clip = AudioFileClip(audio_result)
    try:
        print(f"AUD: {audio_clip.duration}")
        video_clip = VideoFileClip(video_in)
        try:
            print(f"VID: {video_clip.duration}")

            # Keep handles on the opened clips so they can be closed later;
            # the trimmed versions are separate names.
            trimmed_audio = audio_clip
            trimmed_video = video_clip
            if video_clip.duration < audio_clip.duration:
                trimmed_audio = audio_clip.subclip(0.0, video_clip.duration)
            elif video_clip.duration > audio_clip.duration:
                trimmed_video = video_clip.subclip(0.0, audio_clip.duration)

            final_clip = trimmed_video.set_audio(trimmed_audio)
            final_clip.write_videofile(
                output_path,
                codec="libx264",
                audio_codec="aac",
            )
        finally:
            # Release the video reader once the output has been written
            # (or on failure).
            video_clip.close()
    finally:
        audio_clip.close()

    return output_path
|
|
|
def infer(video_in, chosen_model):
    """Generate a sound effect for a video and mux it back in.

    Steps: grab the first frame, caption it, feed the caption to the
    selected audio model, then blend the generated audio with the video.

    Args:
        video_in: Path of the uploaded video.
        chosen_model: One of ``"MAGNet"``, ``"AudioLDM-2"``, ``"AudioGen"``
            or ``"Tango"``.

    Returns:
        A tuple ``(audio_result, final_res)``: the audio model's output and
        the path returned by ``blend_vsfx``.

    Raises:
        gr.Error: If no video was provided or ``chosen_model`` is not one
            of the supported names.
    """
    if not video_in:
        raise gr.Error("Please upload a video first.")

    # Model name -> generator function; replaces the if/elif chain, which
    # left ``audio_result`` unbound for an unrecognised name.
    audio_generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "AudioGen": get_audiogen,
        "Tango": get_tango,
    }
    generate_audio = audio_generators.get(chosen_model)
    if generate_audio is None:
        # Validate before doing any frame extraction or remote call.
        raise gr.Error(f"Unknown model: {chosen_model!r}")

    image_in = extract_firstframe(video_in)
    caption = get_caption(image_in)
    audio_result = generate_audio(caption)

    final_res = blend_vsfx(video_in, audio_result)
    return audio_result, final_res
|
# Page styling: centre the main column and cap its width.
css = """
#col-container{
    margin: 0 auto;
    max-width: 800px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Title and short description.
        gr.HTML("""
        <h2 style="text-align: center;">
            Video to SoundFX
        </h2>
        <p style="text-align: center;">
            Get sound effects from a video shot while comparing audio models from image caption.
        </p>
        """)

        with gr.Row():
            # Left column: inputs.
            with gr.Column():
                video_in = gr.Video(sources=["upload"], label="Video input")
                with gr.Row():
                    model_choices = ["MAGNet", "AudioLDM-2", "AudioGen", "Tango"]
                    chosen_model = gr.Dropdown(
                        label="Choose a model",
                        choices=model_choices,
                        value="Tango",
                    )
                    submit_btn = gr.Button("Submit")

            # Right column: outputs.
            with gr.Column():
                audio_o = gr.Audio(label="Audio output")
                video_o = gr.Video(label="Video with soundFX")

    # Run the pipeline on click; at most two calls are processed at once.
    submit_btn.click(
        fn=infer,
        inputs=[video_in, chosen_model],
        outputs=[audio_o, video_o],
        concurrency_limit=2,
    )

# Enable the request queue (up to 10 waiting requests), then start the app.
demo.queue(max_size=10)
demo.launch(debug=True, show_error=True)