Spaces:
Running
Running
gradio
Browse files- F5-TTS/src/f5_tts/infer/infer_cli_test.py +84 -40
- MMAudio/demo.py +2 -4
- app.py +45 -12
F5-TTS/src/f5_tts/infer/infer_cli_test.py
CHANGED
@@ -188,6 +188,26 @@ parser.add_argument(
|
|
188 |
type=str,
|
189 |
default="",
|
190 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
args = parser.parse_args()
|
193 |
|
@@ -404,17 +424,21 @@ def normalize_wav(waveform, waveform_ref):
|
|
404 |
|
405 |
|
406 |
if __name__ == "__main__":
|
407 |
-
|
408 |
-
|
409 |
v2a_path = args.v2a_path
|
|
|
|
|
|
|
410 |
|
411 |
-
|
412 |
-
|
413 |
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
|
|
|
|
418 |
|
419 |
print("datas2", len(datas2))
|
420 |
if True:
|
@@ -423,17 +447,34 @@ if __name__ == "__main__":
|
|
423 |
video_p, txt_p, wav_p = data_p
|
424 |
|
425 |
v2a_audio = v2a_path + video.replace("/", "__").strip(".") + ".flac"
|
426 |
-
v2a_audio_p = v2a_path + video_p.replace("/", "__").strip(".") + ".flac"
|
427 |
|
428 |
-
print(video, wav, v2a_audio, video_p, wav_p
|
429 |
|
430 |
-
if not os.path.exists(video) or not os.path.exists(
|
431 |
continue
|
432 |
-
if not os.path.exists(
|
433 |
continue
|
434 |
|
435 |
-
energy = torch.from_numpy(np.load(v2a_audio+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
|
436 |
-
energy_p = torch.from_numpy(np.load(v2a_audio_p+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
#print("energy shape", energy_p.shape, energy.shape)
|
438 |
#energy = torch.cat([energy_p, energy], dim=1)
|
439 |
|
@@ -450,37 +491,40 @@ if __name__ == "__main__":
|
|
450 |
wav_gen = torch.zeros(1, 24000)
|
451 |
sr_gen = 24000
|
452 |
|
453 |
-
waveform, sr = torchaudio.load(wav)
|
454 |
-
if sr != 24000:
|
455 |
-
|
456 |
-
waveform_p, sr = torchaudio.load(wav_p)
|
457 |
-
if sr != 24000:
|
458 |
-
|
459 |
#print(wav_gen.shape, wav_gen.max(), waveform.max(), waveform_p.max())
|
460 |
|
461 |
-
if not os.path.exists(output_dir):
|
462 |
-
|
463 |
-
if not os.path.exists(output_dir+"/ref/"):
|
464 |
-
|
465 |
-
if not os.path.exists(output_dir+"/gen/"):
|
466 |
-
|
467 |
-
if not os.path.exists(output_dir+"/tgt/"):
|
468 |
-
|
469 |
|
470 |
-
torchaudio.save(output_dir+"/ref/"+str(i+args.start).zfill(8)+".wav", waveform_p[0:1,:], 24000)
|
471 |
-
torchaudio.save(output_dir+"/gen/"+str(i+args.start).zfill(8)+".wav", normalize_wav(wav_gen[0:1,:], waveform_p[0:1,:]), 24000)
|
472 |
-
torchaudio.save(output_dir+"/tgt/"+str(i+args.start).zfill(8)+".wav", waveform[0:1,:], 24000)
|
|
|
473 |
|
474 |
-
if not os.path.exists(output_dir+"/videos/"):
|
475 |
-
|
476 |
|
477 |
video_clip = VideoFileClip(video)
|
478 |
-
audio_clip = AudioFileClip(wav)
|
479 |
-
audio_gen_clip = AudioFileClip(output_dir+"/gen/" + str(i+args.start).zfill(8) + ".wav")
|
480 |
-
|
481 |
-
|
482 |
-
|
|
|
483 |
video_clip_gen = video_clip.set_audio(audio_gen_clip)
|
484 |
-
video_clip_gt.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gt.mp4", codec="libx264", audio_codec="aac")
|
485 |
-
video_clip_gen.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gen.mp4", codec="libx264", audio_codec="aac")
|
|
|
486 |
|
|
|
188 |
type=str,
|
189 |
default="",
|
190 |
)
|
191 |
+
parser.add_argument(
|
192 |
+
"--wav_p",
|
193 |
+
type=str,
|
194 |
+
default="",
|
195 |
+
)
|
196 |
+
parser.add_argument(
|
197 |
+
"--txt_p",
|
198 |
+
type=str,
|
199 |
+
default="",
|
200 |
+
)
|
201 |
+
parser.add_argument(
|
202 |
+
"--video",
|
203 |
+
type=str,
|
204 |
+
default="",
|
205 |
+
)
|
206 |
+
parser.add_argument(
|
207 |
+
"--txt",
|
208 |
+
type=str,
|
209 |
+
default="",
|
210 |
+
)
|
211 |
|
212 |
args = parser.parse_args()
|
213 |
|
|
|
424 |
|
425 |
|
426 |
if __name__ == "__main__":
|
427 |
+
|
|
|
428 |
v2a_path = args.v2a_path
|
429 |
+
|
430 |
+
if args.wav_p == "":
|
431 |
+
scp = args.infer_list
|
432 |
|
433 |
+
with open(scp, "r") as fr:
|
434 |
+
lines = fr.readlines()
|
435 |
|
436 |
+
datas2 = []
|
437 |
+
for line in lines:
|
438 |
+
wav_p, video_p, txt_p, wav, video, txt = line.strip().split("\t")
|
439 |
+
datas2.append([[video, txt, wav], [video_p, txt_p, wav_p]])
|
440 |
+
else:
|
441 |
+
datas2 = [[[args.video, args.txt, None], [None, args.txt_p, args.wav_p]]]
|
442 |
|
443 |
print("datas2", len(datas2))
|
444 |
if True:
|
|
|
447 |
video_p, txt_p, wav_p = data_p
|
448 |
|
449 |
v2a_audio = v2a_path + video.replace("/", "__").strip(".") + ".flac"
|
450 |
+
#v2a_audio_p = v2a_path + video_p.replace("/", "__").strip(".") + ".flac"
|
451 |
|
452 |
+
print(video, wav, v2a_audio, video_p, wav_p)
|
453 |
|
454 |
+
if not os.path.exists(video) or not os.path.exists(v2a_audio):
|
455 |
continue
|
456 |
+
if not os.path.exists(wav_p):
|
457 |
continue
|
458 |
|
459 |
+
#energy = torch.from_numpy(np.load(v2a_audio+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
|
460 |
+
#energy_p = torch.from_numpy(np.load(v2a_audio_p+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
|
461 |
+
|
462 |
+
waveform_v2a, sr_v2a = torchaudio.load(v2a_audio)
|
463 |
+
duration_v2a = waveform_v2a.shape[-1] / sr_v2a
|
464 |
+
energy = []
|
465 |
+
for i in range(int(duration_v2a/(256/24000))):
|
466 |
+
energy.append(waveform_v2a[0,int(i*sr_v2a*(256/24000)):int((i+1)*sr_v2a*(256/24000))].abs().mean())
|
467 |
+
energy = np.array(energy)
|
468 |
+
energy = energy / max(energy)
|
469 |
+
|
470 |
+
waveform_p, sr_p = torchaudio.load(wav_p)
|
471 |
+
duration_p = waveform_p.shape[-1] / sr_p
|
472 |
+
energy_p = []
|
473 |
+
for i in range(int(duration_p/(256/24000))):
|
474 |
+
energy_p.append(waveform_p[0,int(i*sr_p*(256/24000)):int((i+1)*sr_p*(256/24000))].abs().mean())
|
475 |
+
energy_p = np.array(energy_p)
|
476 |
+
energy_p = energy_p / max(energy_p)
|
477 |
+
|
478 |
#print("energy shape", energy_p.shape, energy.shape)
|
479 |
#energy = torch.cat([energy_p, energy], dim=1)
|
480 |
|
|
|
491 |
wav_gen = torch.zeros(1, 24000)
|
492 |
sr_gen = 24000
|
493 |
|
494 |
+
#waveform, sr = torchaudio.load(wav)
|
495 |
+
#if sr != 24000:
|
496 |
+
# waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=24000)
|
497 |
+
#waveform_p, sr = torchaudio.load(wav_p)
|
498 |
+
#if sr != 24000:
|
499 |
+
# waveform_p = torchaudio.functional.resample(waveform_p, orig_freq=sr, new_freq=24000)
|
500 |
#print(wav_gen.shape, wav_gen.max(), waveform.max(), waveform_p.max())
|
501 |
|
502 |
+
#if not os.path.exists(output_dir):
|
503 |
+
# os.makedirs(output_dir)
|
504 |
+
#if not os.path.exists(output_dir+"/ref/"):
|
505 |
+
# os.makedirs(output_dir+"/ref/")
|
506 |
+
#if not os.path.exists(output_dir+"/gen/"):
|
507 |
+
# os.makedirs(output_dir+"/gen/")
|
508 |
+
#if not os.path.exists(output_dir+"/tgt/"):
|
509 |
+
# os.makedirs(output_dir+"/tgt/")
|
510 |
|
511 |
+
#torchaudio.save(output_dir+"/ref/"+str(i+args.start).zfill(8)+".wav", waveform_p[0:1,:], 24000)
|
512 |
+
#torchaudio.save(output_dir+"/gen/"+str(i+args.start).zfill(8)+".wav", normalize_wav(wav_gen[0:1,:], waveform_p[0:1,:]), 24000)
|
513 |
+
#torchaudio.save(output_dir+"/tgt/"+str(i+args.start).zfill(8)+".wav", waveform[0:1,:], 24000)
|
514 |
+
torchaudio.save(video+".gen.wav", normalize_wav(wav_gen[0:1,:], waveform_p[0:1,:]), 24000)
|
515 |
|
516 |
+
#if not os.path.exists(output_dir+"/videos/"):
|
517 |
+
# os.makedirs(output_dir+"/videos/")
|
518 |
|
519 |
video_clip = VideoFileClip(video)
|
520 |
+
#audio_clip = AudioFileClip(wav)
|
521 |
+
#audio_gen_clip = AudioFileClip(output_dir+"/gen/" + str(i+args.start).zfill(8) + ".wav")
|
522 |
+
audio_gen_clip = AudioFileClip(video+".gen.wav")
|
523 |
+
#print("video audio durations", video_clip.duration, audio_clip.duration, audio_gen_clip.duration)
|
524 |
+
#os.system("cp " + video + " " + output_dir+"/videos/" + str(i+args.start).zfill(8) + ".mp4")
|
525 |
+
#video_clip_gt = video_clip.set_audio(audio_clip)
|
526 |
video_clip_gen = video_clip.set_audio(audio_gen_clip)
|
527 |
+
#video_clip_gt.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gt.mp4", codec="libx264", audio_codec="aac")
|
528 |
+
#video_clip_gen.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gen.mp4", codec="libx264", audio_codec="aac")
|
529 |
+
video_clip_gen.write_videofile(video+".gen.mp4", codec="libx264", audio_codec="aac")
|
530 |
|
MMAudio/demo.py
CHANGED
@@ -178,8 +178,7 @@ def main():
|
|
178 |
audio = audios.float().cpu()[0]
|
179 |
if video_path is not None:
|
180 |
####save_path = output_dir / f'{video_path.stem}.flac'
|
181 |
-
|
182 |
-
save_path = str(output_dir) + "/__" + os.path.basename(video_path).strip(".") + ".flac"
|
183 |
else:
|
184 |
safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
|
185 |
save_path = output_dir / f'{safe_filename}.flac'
|
@@ -210,8 +209,7 @@ def main():
|
|
210 |
log.info(f'Audio saved to {save_path}')
|
211 |
if video_path is not None and not skip_video_composite:
|
212 |
####video_save_path = output_dir / f'{video_path.stem}.mp4'
|
213 |
-
|
214 |
-
video_save_path = str(output_dir) + "/__" + os.path.basename(video_path).strip(".") + ".mp4"
|
215 |
make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
|
216 |
log.info(f'Video saved to {video_save_path}')
|
217 |
|
|
|
178 |
audio = audios.float().cpu()[0]
|
179 |
if video_path is not None:
|
180 |
####save_path = output_dir / f'{video_path.stem}.flac'
|
181 |
+
save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".flac"
|
|
|
182 |
else:
|
183 |
safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
|
184 |
save_path = output_dir / f'{safe_filename}.flac'
|
|
|
209 |
log.info(f'Audio saved to {save_path}')
|
210 |
if video_path is not None and not skip_video_composite:
|
211 |
####video_save_path = output_dir / f'{video_path.stem}.mp4'
|
212 |
+
video_save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".mp4"
|
|
|
213 |
make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
|
214 |
log.info(f'Video saved to {video_save_path}')
|
215 |
|
app.py
CHANGED
@@ -18,19 +18,23 @@ import tempfile
|
|
18 |
|
19 |
import requests
|
20 |
import shutil
|
|
|
21 |
|
22 |
log = logging.getLogger()
|
23 |
|
24 |
|
25 |
#@spaces.GPU(duration=120)
|
26 |
-
def
|
27 |
|
28 |
video_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
|
29 |
|
|
|
|
|
30 |
output_dir = os.path.dirname(video_path)
|
31 |
-
video_save_path =
|
32 |
|
33 |
print("paths", video, video_path, output_dir, video_save_path)
|
|
|
34 |
|
35 |
if video.startswith("http"):
|
36 |
data = requests.get(video, timeout=60).content
|
@@ -39,39 +43,68 @@ def video_to_audio(video: gr.Video, prompt: str):
|
|
39 |
else:
|
40 |
shutil.copy(video, video_path)
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
if prompt == "":
|
43 |
-
|
44 |
else:
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
-
return video_save_path
|
48 |
|
49 |
|
50 |
-
|
51 |
-
fn=
|
52 |
description="""
|
53 |
Project page: <a href="https://acappemin.github.io/DeepAudio-V1.github.io">https://acappemin.github.io/DeepAudio-V1.github.io</a><br>
|
54 |
Code: <a href="https://github.com/acappemin/DeepAudio-V1">https://github.com/acappemin/DeepAudio-V1</a><br>
|
55 |
""",
|
56 |
inputs=[
|
57 |
-
gr.Video(),
|
58 |
-
gr.Text(label='Prompt'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
],
|
60 |
-
outputs='playable_video',
|
61 |
cache_examples=False,
|
62 |
-
title='Video-to-Audio',
|
63 |
examples=[
|
64 |
[
|
65 |
'./tests/0235.mp4',
|
66 |
'',
|
|
|
|
|
|
|
67 |
],
|
68 |
[
|
69 |
'./tests/0778.mp4',
|
70 |
'',
|
|
|
|
|
|
|
71 |
],
|
72 |
])
|
73 |
|
74 |
|
75 |
if __name__ == "__main__":
|
76 |
-
gr.TabbedInterface([
|
77 |
|
|
|
18 |
|
19 |
import requests
|
20 |
import shutil
|
21 |
+
import numpy as np
|
22 |
|
23 |
log = logging.getLogger()
|
24 |
|
25 |
|
26 |
#@spaces.GPU(duration=120)
|
27 |
+
def video_to_audio_and_speech(video: gr.Video, prompt: str, text: str, audio_prompt: gr.Audio, text_prompt: str):
|
28 |
|
29 |
video_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
|
30 |
|
31 |
+
audio_p_path = tempfile.NamedTemporaryFile(delete=False, suffix='.wav').name
|
32 |
+
|
33 |
output_dir = os.path.dirname(video_path)
|
34 |
+
video_save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".mp4"
|
35 |
|
36 |
print("paths", video, video_path, output_dir, video_save_path)
|
37 |
+
print("paths", audio_prompt, audio_p_path, audio_prompt[1].shape, audio_prompt[1].max(), audio_prompt[1].min(), type(audio_prompt[1]))
|
38 |
|
39 |
if video.startswith("http"):
|
40 |
data = requests.get(video, timeout=60).content
|
|
|
43 |
else:
|
44 |
shutil.copy(video, video_path)
|
45 |
|
46 |
+
if isinstance(audio_prompt, tuple):
|
47 |
+
sr, data = audio_prompt
|
48 |
+
torchaudio.save(audio_p_path, torch.from_numpy(data.reshape(1,-1)/32768.0), sr)
|
49 |
+
elif audio_prompt.startswith("http"):
|
50 |
+
data = requests.get(audio_prompt, timeout=60).content
|
51 |
+
with open(audio_p_path, "wb") as fw:
|
52 |
+
fw.write(data)
|
53 |
+
else:
|
54 |
+
shutil.copy(audio_prompt, audio_p_path)
|
55 |
+
|
56 |
if prompt == "":
|
57 |
+
command = "cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --calc_energy 1" % (output_dir, video_path)
|
58 |
else:
|
59 |
+
command = "cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --prompt %s --calc_energy 1" % (output_dir, video_path, prompt)
|
60 |
+
print("v2a command", command)
|
61 |
+
os.system(command)
|
62 |
+
|
63 |
+
command = "python ./F5-TTS/src/f5_tts/infer/infer_cli_test.py --output_dir %s --start 0 --end 1 --ckpt_file ./F5-TTS/ckpts/v2c/v2c_s44.pt --v2a_path %s --wav_p %s --txt_p \"%s\" --video %s --txt \"%s\"" % (output_dir, output_dir, audio_p_path, text_prompt, video_save_path, text)
|
64 |
+
print("v2s command", command)
|
65 |
+
os.system(command)
|
66 |
+
video_gen = output_dir + "/videos/gen/0001"
|
67 |
|
68 |
+
return video_save_path, video_gen
|
69 |
|
70 |
|
71 |
+
video_to_audio_and_speech_tab = gr.Interface(
|
72 |
+
fn=video_to_audio_and_speech,
|
73 |
description="""
|
74 |
Project page: <a href="https://acappemin.github.io/DeepAudio-V1.github.io">https://acappemin.github.io/DeepAudio-V1.github.io</a><br>
|
75 |
Code: <a href="https://github.com/acappemin/DeepAudio-V1">https://github.com/acappemin/DeepAudio-V1</a><br>
|
76 |
""",
|
77 |
inputs=[
|
78 |
+
gr.Video(label="Input Video"),
|
79 |
+
gr.Text(label='Video-to-Audio Text Prompt'),
|
80 |
+
gr.Text(label='Video-to-Speech Transcription'),
|
81 |
+
gr.Audio(label='Video-to-Speech Speech Prompt'),
|
82 |
+
gr.Text(label='Video-to-Speech Speech Prompt Transcription'),
|
83 |
+
],
|
84 |
+
outputs=[
|
85 |
+
gr.Video(label="Video-to-Audio Output"),
|
86 |
+
gr.Video(label="Video-to-Speech Output"),
|
87 |
],
|
|
|
88 |
cache_examples=False,
|
89 |
+
title='Video-to-Audio-and-Speech',
|
90 |
examples=[
|
91 |
[
|
92 |
'./tests/0235.mp4',
|
93 |
'',
|
94 |
+
"Who finally decided to show up for work Yay",
|
95 |
+
'./tests/Gobber-00-0778.wav',
|
96 |
+
"I've still got a few knocking around in here",
|
97 |
],
|
98 |
[
|
99 |
'./tests/0778.mp4',
|
100 |
'',
|
101 |
+
"I've still got a few knocking around in here",
|
102 |
+
'./tests/Gobber-00-0235.wav',
|
103 |
+
"Who finally decided to show up for work Yay",
|
104 |
],
|
105 |
])
|
106 |
|
107 |
|
108 |
if __name__ == "__main__":
|
109 |
+
gr.TabbedInterface([video_to_audio_and_speech_tab], ['Video-to-Audio-and-Speech']).launch(server_name='0.0.0.0', server_port=30459)
|
110 |
|