import os
try:
import torchaudio
except ImportError:
os.system("cd ./F5-TTS; pip install -e .")
import spaces
import logging
from datetime import datetime
from pathlib import Path
import gradio as gr
import torch
import torchaudio
import tempfile
import requests
import shutil
import numpy as np
from huggingface_hub import hf_hub_download
model_path = "./F5-TTS/ckpts/v2c/"
if not os.path.exists(model_path):
os.makedirs(model_path)
file_path = hf_hub_download(repo_id="lshzhm/DeepAudio-V1", filename="v2c_s44.pt", local_dir=model_path)
print(f"Model saved at: {file_path}")
log = logging.getLogger()
#@spaces.GPU(duration=120)
def video_to_audio_and_speech(video: gr.Video, prompt: str, v2a_num_steps: int, text: str, audio_prompt: gr.Audio, text_prompt: str, v2s_num_steps: int):
video_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
audio_p_path = tempfile.NamedTemporaryFile(delete=False, suffix='.wav').name
output_dir = os.path.dirname(video_path)
video_save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".mp4"
print("paths", video, video_path, output_dir, video_save_path)
print("paths", audio_prompt, audio_p_path, audio_prompt[1].shape, audio_prompt[1].max(), audio_prompt[1].min(), type(audio_prompt[1]))
if video.startswith("http"):
data = requests.get(video, timeout=60).content
with open(video_path, "wb") as fw:
fw.write(data)
else:
shutil.copy(video, video_path)
if isinstance(audio_prompt, tuple):
sr, data = audio_prompt
torchaudio.save(audio_p_path, torch.from_numpy(data.reshape(1,-1)/32768.0).to(torch.float32), sr)
elif audio_prompt.startswith("http"):
data = requests.get(audio_prompt, timeout=60).content
with open(audio_p_path, "wb") as fw:
fw.write(data)
else:
shutil.copy(audio_prompt, audio_p_path)
if prompt == "":
command = "cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --calc_energy 1 --num_steps %d" % (output_dir, video_path, v2a_num_steps)
else:
command = "cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --prompt %s --calc_energy 1 --num_steps %d" % (output_dir, video_path, prompt, v2a_num_steps)
print("v2a command", command)
os.system(command)
video_gen = video_save_path[:-4]+".mp4.gen.mp4"
command = "python ./F5-TTS/src/f5_tts/infer/infer_cli_test.py --output_dir %s --start 0 --end 1 --ckpt_file ./F5-TTS/ckpts/v2c/v2c_s44.pt --v2a_path %s --wav_p %s --txt_p \"%s\" --video %s --v2a_wav %s --txt \"%s\" --nfe_step %d" % (output_dir, output_dir, audio_p_path, text_prompt, video_save_path, video_save_path[:-4]+".flac", text, v2s_num_steps)
print("v2s command", command, video_gen)
os.system(command)
return video_save_path, video_gen
video_to_audio_and_speech_tab = gr.Interface(
fn=video_to_audio_and_speech,
description="""
Project page: https://acappemin.github.io/DeepAudio-V1.github.io
Code: https://github.com/acappemin/DeepAudio-V1
""",
inputs=[
gr.Video(label="Input Video"),
gr.Text(label='Video-to-Audio Text Prompt'),
gr.Number(label='Video-to-Audio Num Steps', value=25, precision=0, minimum=1),
gr.Text(label='Video-to-Speech Transcription'),
gr.Audio(label='Video-to-Speech Speech Prompt'),
gr.Text(label='Video-to-Speech Speech Prompt Transcription'),
gr.Number(label='Video-to-Speech Num Steps', value=32, precision=0, minimum=1),
],
outputs=[
gr.Video(label="Video-to-Audio Output"),
gr.Video(label="Video-to-Speech Output"),
],
cache_examples=False,
title='Video-to-Audio-and-Speech',
examples=[
[
'./tests/0235.mp4',
'',
25,
"Who finally decided to show up for work Yay",
'./tests/Gobber-00-0778.wav',
"I've still got a few knocking around in here",
32,
],
[
'./tests/0778.mp4',
'',
25,
"I've still got a few knocking around in here",
'./tests/Gobber-00-0235.wav',
"Who finally decided to show up for work Yay",
32,
],
])
if __name__ == "__main__":
gr.TabbedInterface([video_to_audio_and_speech_tab], ['Video-to-Audio-and-Speech']).queue(max_size=1).launch()