lshzhm committed on
Commit
9289e32
·
1 Parent(s): a6ed34a
F5-TTS/src/f5_tts/infer/infer_cli_test.py CHANGED
@@ -188,6 +188,26 @@ parser.add_argument(
188
  type=str,
189
  default="",
190
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  args = parser.parse_args()
193
 
@@ -404,17 +424,21 @@ def normalize_wav(waveform, waveform_ref):
404
 
405
 
406
  if __name__ == "__main__":
407
- scp = args.infer_list
408
-
409
  v2a_path = args.v2a_path
 
 
 
410
 
411
- with open(scp, "r") as fr:
412
- lines = fr.readlines()
413
 
414
- datas2 = []
415
- for line in lines:
416
- wav_p, video_p, txt_p, wav, video, txt = line.strip().split("\t")
417
- datas2.append([[video, txt, wav], [video_p, txt_p, wav_p]])
 
 
418
 
419
  print("datas2", len(datas2))
420
  if True:
@@ -423,17 +447,34 @@ if __name__ == "__main__":
423
  video_p, txt_p, wav_p = data_p
424
 
425
  v2a_audio = v2a_path + video.replace("/", "__").strip(".") + ".flac"
426
- v2a_audio_p = v2a_path + video_p.replace("/", "__").strip(".") + ".flac"
427
 
428
- print(video, wav, v2a_audio, video_p, wav_p, v2a_audio_p)
429
 
430
- if not os.path.exists(video) or not os.path.exists(wav) or not os.path.exists(v2a_audio):
431
  continue
432
- if not os.path.exists(video_p) or not os.path.exists(wav_p) or not os.path.exists(v2a_audio_p):
433
  continue
434
 
435
- energy = torch.from_numpy(np.load(v2a_audio+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
436
- energy_p = torch.from_numpy(np.load(v2a_audio_p+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
  #print("energy shape", energy_p.shape, energy.shape)
438
  #energy = torch.cat([energy_p, energy], dim=1)
439
 
@@ -450,37 +491,40 @@ if __name__ == "__main__":
450
  wav_gen = torch.zeros(1, 24000)
451
  sr_gen = 24000
452
 
453
- waveform, sr = torchaudio.load(wav)
454
- if sr != 24000:
455
- waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=24000)
456
- waveform_p, sr = torchaudio.load(wav_p)
457
- if sr != 24000:
458
- waveform_p = torchaudio.functional.resample(waveform_p, orig_freq=sr, new_freq=24000)
459
  #print(wav_gen.shape, wav_gen.max(), waveform.max(), waveform_p.max())
460
 
461
- if not os.path.exists(output_dir):
462
- os.makedirs(output_dir)
463
- if not os.path.exists(output_dir+"/ref/"):
464
- os.makedirs(output_dir+"/ref/")
465
- if not os.path.exists(output_dir+"/gen/"):
466
- os.makedirs(output_dir+"/gen/")
467
- if not os.path.exists(output_dir+"/tgt/"):
468
- os.makedirs(output_dir+"/tgt/")
469
 
470
- torchaudio.save(output_dir+"/ref/"+str(i+args.start).zfill(8)+".wav", waveform_p[0:1,:], 24000)
471
- torchaudio.save(output_dir+"/gen/"+str(i+args.start).zfill(8)+".wav", normalize_wav(wav_gen[0:1,:], waveform_p[0:1,:]), 24000)
472
- torchaudio.save(output_dir+"/tgt/"+str(i+args.start).zfill(8)+".wav", waveform[0:1,:], 24000)
 
473
 
474
- if not os.path.exists(output_dir+"/videos/"):
475
- os.makedirs(output_dir+"/videos/")
476
 
477
  video_clip = VideoFileClip(video)
478
- audio_clip = AudioFileClip(wav)
479
- audio_gen_clip = AudioFileClip(output_dir+"/gen/" + str(i+args.start).zfill(8) + ".wav")
480
- print("video audio durations", video_clip.duration, audio_clip.duration, audio_gen_clip.duration)
481
- os.system("cp " + video + " " + output_dir+"/videos/" + str(i+args.start).zfill(8) + ".mp4")
482
- video_clip_gt = video_clip.set_audio(audio_clip)
 
483
  video_clip_gen = video_clip.set_audio(audio_gen_clip)
484
- video_clip_gt.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gt.mp4", codec="libx264", audio_codec="aac")
485
- video_clip_gen.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gen.mp4", codec="libx264", audio_codec="aac")
 
486
 
 
188
  type=str,
189
  default="",
190
  )
191
+ parser.add_argument(
192
+ "--wav_p",
193
+ type=str,
194
+ default="",
195
+ )
196
+ parser.add_argument(
197
+ "--txt_p",
198
+ type=str,
199
+ default="",
200
+ )
201
+ parser.add_argument(
202
+ "--video",
203
+ type=str,
204
+ default="",
205
+ )
206
+ parser.add_argument(
207
+ "--txt",
208
+ type=str,
209
+ default="",
210
+ )
211
 
212
  args = parser.parse_args()
213
 
 
424
 
425
 
426
  if __name__ == "__main__":
427
+
 
428
  v2a_path = args.v2a_path
429
+
430
+ if args.wav_p == "":
431
+ scp = args.infer_list
432
 
433
+ with open(scp, "r") as fr:
434
+ lines = fr.readlines()
435
 
436
+ datas2 = []
437
+ for line in lines:
438
+ wav_p, video_p, txt_p, wav, video, txt = line.strip().split("\t")
439
+ datas2.append([[video, txt, wav], [video_p, txt_p, wav_p]])
440
+ else:
441
+ datas2 = [[[args.video, args.txt, None], [None, args.txt_p, args.wav_p]]]
442
 
443
  print("datas2", len(datas2))
444
  if True:
 
447
  video_p, txt_p, wav_p = data_p
448
 
449
  v2a_audio = v2a_path + video.replace("/", "__").strip(".") + ".flac"
450
+ #v2a_audio_p = v2a_path + video_p.replace("/", "__").strip(".") + ".flac"
451
 
452
+ print(video, wav, v2a_audio, video_p, wav_p)
453
 
454
+ if not os.path.exists(video) or not os.path.exists(v2a_audio):
455
  continue
456
+ if not os.path.exists(wav_p):
457
  continue
458
 
459
+ #energy = torch.from_numpy(np.load(v2a_audio+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
460
+ #energy_p = torch.from_numpy(np.load(v2a_audio_p+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
461
+
462
+ waveform_v2a, sr_v2a = torchaudio.load(v2a_audio)
463
+ duration_v2a = waveform_v2a.shape[-1] / sr_v2a
464
+ energy = []
465
+ for i in range(int(duration_v2a/(256/24000))):
466
+ energy.append(waveform_v2a[0,int(i*sr_v2a*(256/24000)):int((i+1)*sr_v2a*(256/24000))].abs().mean())
467
+ energy = np.array(energy)
468
+ energy = energy / max(energy)
469
+
470
+ waveform_p, sr_p = torchaudio.load(wav_p)
471
+ duration_p = waveform_p.shape[-1] / sr_p
472
+ energy_p = []
473
+ for i in range(int(duration_p/(256/24000))):
474
+ energy_p.append(waveform_p[0,int(i*sr_p*(256/24000)):int((i+1)*sr_p*(256/24000))].abs().mean())
475
+ energy_p = np.array(energy_p)
476
+ energy_p = energy_p / max(energy_p)
477
+
478
  #print("energy shape", energy_p.shape, energy.shape)
479
  #energy = torch.cat([energy_p, energy], dim=1)
480
 
 
491
  wav_gen = torch.zeros(1, 24000)
492
  sr_gen = 24000
493
 
494
+ #waveform, sr = torchaudio.load(wav)
495
+ #if sr != 24000:
496
+ # waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=24000)
497
+ #waveform_p, sr = torchaudio.load(wav_p)
498
+ #if sr != 24000:
499
+ # waveform_p = torchaudio.functional.resample(waveform_p, orig_freq=sr, new_freq=24000)
500
  #print(wav_gen.shape, wav_gen.max(), waveform.max(), waveform_p.max())
501
 
502
+ #if not os.path.exists(output_dir):
503
+ # os.makedirs(output_dir)
504
+ #if not os.path.exists(output_dir+"/ref/"):
505
+ # os.makedirs(output_dir+"/ref/")
506
+ #if not os.path.exists(output_dir+"/gen/"):
507
+ # os.makedirs(output_dir+"/gen/")
508
+ #if not os.path.exists(output_dir+"/tgt/"):
509
+ # os.makedirs(output_dir+"/tgt/")
510
 
511
+ #torchaudio.save(output_dir+"/ref/"+str(i+args.start).zfill(8)+".wav", waveform_p[0:1,:], 24000)
512
+ #torchaudio.save(output_dir+"/gen/"+str(i+args.start).zfill(8)+".wav", normalize_wav(wav_gen[0:1,:], waveform_p[0:1,:]), 24000)
513
+ #torchaudio.save(output_dir+"/tgt/"+str(i+args.start).zfill(8)+".wav", waveform[0:1,:], 24000)
514
+ torchaudio.save(video+".gen.wav", normalize_wav(wav_gen[0:1,:], waveform_p[0:1,:]), 24000)
515
 
516
+ #if not os.path.exists(output_dir+"/videos/"):
517
+ # os.makedirs(output_dir+"/videos/")
518
 
519
  video_clip = VideoFileClip(video)
520
+ #audio_clip = AudioFileClip(wav)
521
+ #audio_gen_clip = AudioFileClip(output_dir+"/gen/" + str(i+args.start).zfill(8) + ".wav")
522
+ audio_gen_clip = AudioFileClip(video+".gen.wav")
523
+ #print("video audio durations", video_clip.duration, audio_clip.duration, audio_gen_clip.duration)
524
+ #os.system("cp " + video + " " + output_dir+"/videos/" + str(i+args.start).zfill(8) + ".mp4")
525
+ #video_clip_gt = video_clip.set_audio(audio_clip)
526
  video_clip_gen = video_clip.set_audio(audio_gen_clip)
527
+ #video_clip_gt.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gt.mp4", codec="libx264", audio_codec="aac")
528
+ #video_clip_gen.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gen.mp4", codec="libx264", audio_codec="aac")
529
+ video_clip_gen.write_videofile(video+".gen.mp4", codec="libx264", audio_codec="aac")
530
 
MMAudio/demo.py CHANGED
@@ -178,8 +178,7 @@ def main():
178
  audio = audios.float().cpu()[0]
179
  if video_path is not None:
180
  ####save_path = output_dir / f'{video_path.stem}.flac'
181
- ####save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".flac"
182
- save_path = str(output_dir) + "/__" + os.path.basename(video_path).strip(".") + ".flac"
183
  else:
184
  safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
185
  save_path = output_dir / f'{safe_filename}.flac'
@@ -210,8 +209,7 @@ def main():
210
  log.info(f'Audio saved to {save_path}')
211
  if video_path is not None and not skip_video_composite:
212
  ####video_save_path = output_dir / f'{video_path.stem}.mp4'
213
- ####video_save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".mp4"
214
- video_save_path = str(output_dir) + "/__" + os.path.basename(video_path).strip(".") + ".mp4"
215
  make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
216
  log.info(f'Video saved to {video_save_path}')
217
 
 
178
  audio = audios.float().cpu()[0]
179
  if video_path is not None:
180
  ####save_path = output_dir / f'{video_path.stem}.flac'
181
+ save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".flac"
 
182
  else:
183
  safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
184
  save_path = output_dir / f'{safe_filename}.flac'
 
209
  log.info(f'Audio saved to {save_path}')
210
  if video_path is not None and not skip_video_composite:
211
  ####video_save_path = output_dir / f'{video_path.stem}.mp4'
212
+ video_save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".mp4"
 
213
  make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
214
  log.info(f'Video saved to {video_save_path}')
215
 
app.py CHANGED
@@ -18,19 +18,23 @@ import tempfile
18
 
19
  import requests
20
  import shutil
 
21
 
22
  log = logging.getLogger()
23
 
24
 
25
  #@spaces.GPU(duration=120)
26
- def video_to_audio(video: gr.Video, prompt: str):
27
 
28
  video_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
29
 
 
 
30
  output_dir = os.path.dirname(video_path)
31
- video_save_path = os.path.join(str(output_dir), "__" + os.path.basename(video_path).strip(".") + ".mp4")
32
 
33
  print("paths", video, video_path, output_dir, video_save_path)
 
34
 
35
  if video.startswith("http"):
36
  data = requests.get(video, timeout=60).content
@@ -39,39 +43,68 @@ def video_to_audio(video: gr.Video, prompt: str):
39
  else:
40
  shutil.copy(video, video_path)
41
 
 
 
 
 
 
 
 
 
 
 
42
  if prompt == "":
43
- os.system("cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --calc_energy 1" % (output_dir, video_path))
44
  else:
45
- os.system("cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --prompt %s --calc_energy 1" % (output_dir, video_path, prompt))
 
 
 
 
 
 
 
46
 
47
- return video_save_path
48
 
49
 
50
- video_to_audio_tab = gr.Interface(
51
- fn=video_to_audio,
52
  description="""
53
  Project page: <a href="https://acappemin.github.io/DeepAudio-V1.github.io">https://acappemin.github.io/DeepAudio-V1.github.io</a><br>
54
  Code: <a href="https://github.com/acappemin/DeepAudio-V1">https://github.com/acappemin/DeepAudio-V1</a><br>
55
  """,
56
  inputs=[
57
- gr.Video(),
58
- gr.Text(label='Prompt'),
 
 
 
 
 
 
 
59
  ],
60
- outputs='playable_video',
61
  cache_examples=False,
62
- title='Video-to-Audio',
63
  examples=[
64
  [
65
  './tests/0235.mp4',
66
  '',
 
 
 
67
  ],
68
  [
69
  './tests/0778.mp4',
70
  '',
 
 
 
71
  ],
72
  ])
73
 
74
 
75
  if __name__ == "__main__":
76
- gr.TabbedInterface([video_to_audio_tab], ['Video-to-Audio']).launch()
77
 
 
18
 
19
  import requests
20
  import shutil
21
+ import numpy as np
22
 
23
  log = logging.getLogger()
24
 
25
 
26
  #@spaces.GPU(duration=120)
27
+ def video_to_audio_and_speech(video: gr.Video, prompt: str, text: str, audio_prompt: gr.Audio, text_prompt: str):
28
 
29
  video_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
30
 
31
+ audio_p_path = tempfile.NamedTemporaryFile(delete=False, suffix='.wav').name
32
+
33
  output_dir = os.path.dirname(video_path)
34
+ video_save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".mp4"
35
 
36
  print("paths", video, video_path, output_dir, video_save_path)
37
+ print("paths", audio_prompt, audio_p_path, audio_prompt[1].shape, audio_prompt[1].max(), audio_prompt[1].min(), type(audio_prompt[1]))
38
 
39
  if video.startswith("http"):
40
  data = requests.get(video, timeout=60).content
 
43
  else:
44
  shutil.copy(video, video_path)
45
 
46
+ if isinstance(audio_prompt, tuple):
47
+ sr, data = audio_prompt
48
+ torchaudio.save(audio_p_path, torch.from_numpy(data.reshape(1,-1)/32768.0), sr)
49
+ elif audio_prompt.startswith("http"):
50
+ data = requests.get(audio_prompt, timeout=60).content
51
+ with open(audio_p_path, "wb") as fw:
52
+ fw.write(data)
53
+ else:
54
+ shutil.copy(audio_prompt, audio_p_path)
55
+
56
  if prompt == "":
57
+ command = "cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --calc_energy 1" % (output_dir, video_path)
58
  else:
59
+ command = "cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --prompt %s --calc_energy 1" % (output_dir, video_path, prompt)
60
+ print("v2a command", command)
61
+ os.system(command)
62
+
63
+ command = "python ./F5-TTS/src/f5_tts/infer/infer_cli_test.py --output_dir %s --start 0 --end 1 --ckpt_file ./F5-TTS/ckpts/v2c/v2c_s44.pt --v2a_path %s --wav_p %s --txt_p \"%s\" --video %s --txt \"%s\"" % (output_dir, output_dir, audio_p_path, text_prompt, video_save_path, text)
64
+ print("v2s command", command)
65
+ os.system(command)
66
+ video_gen = output_dir + "/videos/gen/0001"
67
 
68
+ return video_save_path, video_gen
69
 
70
 
71
+ video_to_audio_and_speech_tab = gr.Interface(
72
+ fn=video_to_audio_and_speech,
73
  description="""
74
  Project page: <a href="https://acappemin.github.io/DeepAudio-V1.github.io">https://acappemin.github.io/DeepAudio-V1.github.io</a><br>
75
  Code: <a href="https://github.com/acappemin/DeepAudio-V1">https://github.com/acappemin/DeepAudio-V1</a><br>
76
  """,
77
  inputs=[
78
+ gr.Video(label="Input Video"),
79
+ gr.Text(label='Video-to-Audio Text Prompt'),
80
+ gr.Text(label='Video-to-Speech Transcription'),
81
+ gr.Audio(label='Video-to-Speech Speech Prompt'),
82
+ gr.Text(label='Video-to-Speech Speech Prompt Transcription'),
83
+ ],
84
+ outputs=[
85
+ gr.Video(label="Video-to-Audio Output"),
86
+ gr.Video(label="Video-to-Speech Output"),
87
  ],
 
88
  cache_examples=False,
89
+ title='Video-to-Audio-and-Speech',
90
  examples=[
91
  [
92
  './tests/0235.mp4',
93
  '',
94
+ "Who finally decided to show up for work Yay",
95
+ './tests/Gobber-00-0778.wav',
96
+ "I've still got a few knocking around in here",
97
  ],
98
  [
99
  './tests/0778.mp4',
100
  '',
101
+ "I've still got a few knocking around in here",
102
+ './tests/Gobber-00-0235.wav',
103
+ "Who finally decided to show up for work Yay",
104
  ],
105
  ])
106
 
107
 
108
  if __name__ == "__main__":
109
+ gr.TabbedInterface([video_to_audio_and_speech_tab], ['Video-to-Audio-and-Speech']).launch(server_name='0.0.0.0', server_port=30459)
110