Spaces:

lshzhm
/

DeepAudio-V1

Running

App Files Files Community

lshzhm committed 9 days ago

Commit

9289e32

1 Parent(s): a6ed34a

gradio

Browse files

Files changed (3) hide show

F5-TTS/src/f5_tts/infer/infer_cli_test.py +84 -40
MMAudio/demo.py +2 -4
app.py +45 -12

F5-TTS/src/f5_tts/infer/infer_cli_test.py CHANGED Viewed

@@ -188,6 +188,26 @@ parser.add_argument(
     type=str,
     default="",
 )
 args = parser.parse_args()
@@ -404,17 +424,21 @@ def normalize_wav(waveform, waveform_ref):
 if __name__ == "__main__":
-    scp = args.infer_list
     v2a_path = args.v2a_path
-    with open(scp, "r") as fr:
-        lines = fr.readlines()
-    datas2 = []
-    for line in lines:
-        wav_p, video_p, txt_p, wav, video, txt = line.strip().split("\t")
-        datas2.append([[video, txt, wav], [video_p, txt_p, wav_p]])
     print("datas2", len(datas2))
     if True:
@@ -423,17 +447,34 @@ if __name__ == "__main__":
             video_p, txt_p, wav_p = data_p
             v2a_audio = v2a_path + video.replace("/", "__").strip(".") + ".flac"
-            v2a_audio_p = v2a_path + video_p.replace("/", "__").strip(".") + ".flac"
-            print(video, wav, v2a_audio, video_p, wav_p, v2a_audio_p)
-            if not os.path.exists(video) or not os.path.exists(wav) or not os.path.exists(v2a_audio):
                 continue
-            if not os.path.exists(video_p) or not os.path.exists(wav_p) or not os.path.exists(v2a_audio_p):
                 continue
-            energy = torch.from_numpy(np.load(v2a_audio+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
-            energy_p = torch.from_numpy(np.load(v2a_audio_p+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
             #print("energy shape", energy_p.shape, energy.shape)
             #energy = torch.cat([energy_p, energy], dim=1)
@@ -450,37 +491,40 @@ if __name__ == "__main__":
                 wav_gen = torch.zeros(1, 24000)
                 sr_gen = 24000
-            waveform, sr = torchaudio.load(wav)
-            if sr != 24000:
-                waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=24000)
-            waveform_p, sr = torchaudio.load(wav_p)
-            if sr != 24000:
-                waveform_p = torchaudio.functional.resample(waveform_p, orig_freq=sr, new_freq=24000)
             #print(wav_gen.shape, wav_gen.max(), waveform.max(), waveform_p.max())
-            if not os.path.exists(output_dir):
-                os.makedirs(output_dir)
-            if not os.path.exists(output_dir+"/ref/"):
-                os.makedirs(output_dir+"/ref/")
-            if not os.path.exists(output_dir+"/gen/"):
-                os.makedirs(output_dir+"/gen/")
-            if not os.path.exists(output_dir+"/tgt/"):
-                os.makedirs(output_dir+"/tgt/")
-            torchaudio.save(output_dir+"/ref/"+str(i+args.start).zfill(8)+".wav", waveform_p[0:1,:], 24000)
-            torchaudio.save(output_dir+"/gen/"+str(i+args.start).zfill(8)+".wav", normalize_wav(wav_gen[0:1,:], waveform_p[0:1,:]), 24000)
-            torchaudio.save(output_dir+"/tgt/"+str(i+args.start).zfill(8)+".wav", waveform[0:1,:], 24000)
-            if not os.path.exists(output_dir+"/videos/"):
-                os.makedirs(output_dir+"/videos/")
             video_clip = VideoFileClip(video)
-            audio_clip = AudioFileClip(wav)
-            audio_gen_clip = AudioFileClip(output_dir+"/gen/" + str(i+args.start).zfill(8) + ".wav")
-            print("video audio durations", video_clip.duration, audio_clip.duration, audio_gen_clip.duration)
-            os.system("cp " + video + " " + output_dir+"/videos/" + str(i+args.start).zfill(8) + ".mp4")
-            video_clip_gt = video_clip.set_audio(audio_clip)
             video_clip_gen = video_clip.set_audio(audio_gen_clip)
-            video_clip_gt.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gt.mp4", codec="libx264", audio_codec="aac")
-            video_clip_gen.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gen.mp4", codec="libx264", audio_codec="aac")

     type=str,
     default="",
 )
+parser.add_argument(
+    "--wav_p",
+    type=str,
+    default="",
+)
+parser.add_argument(
+    "--txt_p",
+    type=str,
+    default="",
+)
+parser.add_argument(
+    "--video",
+    type=str,
+    default="",
+)
+parser.add_argument(
+    "--txt",
+    type=str,
+    default="",
+)
 args = parser.parse_args()
 if __name__ == "__main__":
     v2a_path = args.v2a_path
+    if args.wav_p == "":
+        scp = args.infer_list
+        with open(scp, "r") as fr:
+            lines = fr.readlines()
+        datas2 = []
+        for line in lines:
+            wav_p, video_p, txt_p, wav, video, txt = line.strip().split("\t")
+            datas2.append([[video, txt, wav], [video_p, txt_p, wav_p]])
+    else:
+        datas2 = [[[args.video, args.txt, None], [None, args.txt_p, args.wav_p]]]
     print("datas2", len(datas2))
     if True:
             video_p, txt_p, wav_p = data_p
             v2a_audio = v2a_path + video.replace("/", "__").strip(".") + ".flac"
+            #v2a_audio_p = v2a_path + video_p.replace("/", "__").strip(".") + ".flac"
+            print(video, wav, v2a_audio, video_p, wav_p)
+            if not os.path.exists(video) or not os.path.exists(v2a_audio):
                 continue
+            if not os.path.exists(wav_p):
                 continue
+            #energy = torch.from_numpy(np.load(v2a_audio+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
+            #energy_p = torch.from_numpy(np.load(v2a_audio_p+".npz")["arr_0"]).unsqueeze(0).unsqueeze(2)
+            waveform_v2a, sr_v2a = torchaudio.load(v2a_audio)
+            duration_v2a = waveform_v2a.shape[-1] / sr_v2a
+            energy = []
+            for i in range(int(duration_v2a/(256/24000))):
+                energy.append(waveform_v2a[0,int(i*sr_v2a*(256/24000)):int((i+1)*sr_v2a*(256/24000))].abs().mean())
+            energy = np.array(energy)
+            energy = energy / max(energy)
+            waveform_p, sr_p = torchaudio.load(wav_p)
+            duration_p = waveform_p.shape[-1] / sr_p
+            energy_p = []
+            for i in range(int(duration_p/(256/24000))):
+                energy_p.append(waveform_p[0,int(i*sr_p*(256/24000)):int((i+1)*sr_p*(256/24000))].abs().mean())
+            energy_p = np.array(energy_p)
+            energy_p = energy_p / max(energy_p)
             #print("energy shape", energy_p.shape, energy.shape)
             #energy = torch.cat([energy_p, energy], dim=1)
                 wav_gen = torch.zeros(1, 24000)
                 sr_gen = 24000
+            #waveform, sr = torchaudio.load(wav)
+            #if sr != 24000:
+            #    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=24000)
+            #waveform_p, sr = torchaudio.load(wav_p)
+            #if sr != 24000:
+            #    waveform_p = torchaudio.functional.resample(waveform_p, orig_freq=sr, new_freq=24000)
             #print(wav_gen.shape, wav_gen.max(), waveform.max(), waveform_p.max())
+            #if not os.path.exists(output_dir):
+            #    os.makedirs(output_dir)
+            #if not os.path.exists(output_dir+"/ref/"):
+            #    os.makedirs(output_dir+"/ref/")
+            #if not os.path.exists(output_dir+"/gen/"):
+            #    os.makedirs(output_dir+"/gen/")
+            #if not os.path.exists(output_dir+"/tgt/"):
+            #    os.makedirs(output_dir+"/tgt/")
+            #torchaudio.save(output_dir+"/ref/"+str(i+args.start).zfill(8)+".wav", waveform_p[0:1,:], 24000)
+            #torchaudio.save(output_dir+"/gen/"+str(i+args.start).zfill(8)+".wav", normalize_wav(wav_gen[0:1,:], waveform_p[0:1,:]), 24000)
+            #torchaudio.save(output_dir+"/tgt/"+str(i+args.start).zfill(8)+".wav", waveform[0:1,:], 24000)
+            torchaudio.save(video+".gen.wav", normalize_wav(wav_gen[0:1,:], waveform_p[0:1,:]), 24000)
+            #if not os.path.exists(output_dir+"/videos/"):
+            #    os.makedirs(output_dir+"/videos/")
             video_clip = VideoFileClip(video)
+            #audio_clip = AudioFileClip(wav)
+            #audio_gen_clip = AudioFileClip(output_dir+"/gen/" + str(i+args.start).zfill(8) + ".wav")
+            audio_gen_clip = AudioFileClip(video+".gen.wav")
+            #print("video audio durations", video_clip.duration, audio_clip.duration, audio_gen_clip.duration)
+            #os.system("cp " + video + " " + output_dir+"/videos/" + str(i+args.start).zfill(8) + ".mp4")
+            #video_clip_gt = video_clip.set_audio(audio_clip)
             video_clip_gen = video_clip.set_audio(audio_gen_clip)
+            #video_clip_gt.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gt.mp4", codec="libx264", audio_codec="aac")
+            #video_clip_gen.write_videofile(output_dir+"/videos/" + str(i+args.start).zfill(8) + ".gen.mp4", codec="libx264", audio_codec="aac")
+            video_clip_gen.write_videofile(video+".gen.mp4", codec="libx264", audio_codec="aac")

MMAudio/demo.py CHANGED Viewed

@@ -178,8 +178,7 @@ def main():
         audio = audios.float().cpu()[0]
         if video_path is not None:
             ####save_path = output_dir / f'{video_path.stem}.flac'
-            ####save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".flac"
-            save_path = str(output_dir) + "/__" + os.path.basename(video_path).strip(".") + ".flac"
         else:
             safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
             save_path = output_dir / f'{safe_filename}.flac'
@@ -210,8 +209,7 @@ def main():
         log.info(f'Audio saved to {save_path}')
         if video_path is not None and not skip_video_composite:
             ####video_save_path = output_dir / f'{video_path.stem}.mp4'
-            ####video_save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".mp4"
-            video_save_path = str(output_dir) + "/__" + os.path.basename(video_path).strip(".") + ".mp4"
             make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
             log.info(f'Video saved to {video_save_path}')

         audio = audios.float().cpu()[0]
         if video_path is not None:
             ####save_path = output_dir / f'{video_path.stem}.flac'
+            save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".flac"
         else:
             safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
             save_path = output_dir / f'{safe_filename}.flac'
         log.info(f'Audio saved to {save_path}')
         if video_path is not None and not skip_video_composite:
             ####video_save_path = output_dir / f'{video_path.stem}.mp4'
+            video_save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".mp4"
             make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
             log.info(f'Video saved to {video_save_path}')

app.py CHANGED Viewed

@@ -18,19 +18,23 @@ import tempfile
 import requests
 import shutil
 log = logging.getLogger()
 #@spaces.GPU(duration=120)
-def video_to_audio(video: gr.Video, prompt: str):
     video_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
     output_dir = os.path.dirname(video_path)
-    video_save_path = os.path.join(str(output_dir), "__" + os.path.basename(video_path).strip(".") + ".mp4")
     print("paths", video, video_path, output_dir, video_save_path)
     if video.startswith("http"):
         data = requests.get(video, timeout=60).content
@@ -39,39 +43,68 @@ def video_to_audio(video: gr.Video, prompt: str):
     else:
         shutil.copy(video, video_path)
     if prompt == "":
-        os.system("cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --calc_energy 1" % (output_dir, video_path))
     else:
-        os.system("cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --prompt %s --calc_energy 1" % (output_dir, video_path, prompt))
-    return video_save_path
-video_to_audio_tab = gr.Interface(
-    fn=video_to_audio,
     description="""
     Project page: <a href="https://acappemin.github.io/DeepAudio-V1.github.io">https://acappemin.github.io/DeepAudio-V1.github.io</a><br>
     Code: <a href="https://github.com/acappemin/DeepAudio-V1">https://github.com/acappemin/DeepAudio-V1</a><br>
     """,
     inputs=[
-        gr.Video(),
-        gr.Text(label='Prompt'),
     ],
-    outputs='playable_video',
     cache_examples=False,
-    title='Video-to-Audio',
     examples=[
         [
             './tests/0235.mp4',
             '',
         ],
         [
             './tests/0778.mp4',
             '',
         ],
     ])
 if __name__ == "__main__":
-    gr.TabbedInterface([video_to_audio_tab], ['Video-to-Audio']).launch()

 import requests
 import shutil
+import numpy as np
 log = logging.getLogger()
 #@spaces.GPU(duration=120)
+def video_to_audio_and_speech(video: gr.Video, prompt: str, text: str, audio_prompt: gr.Audio, text_prompt: str):
     video_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
+    audio_p_path = tempfile.NamedTemporaryFile(delete=False, suffix='.wav').name
     output_dir = os.path.dirname(video_path)
+    video_save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".mp4"
     print("paths", video, video_path, output_dir, video_save_path)
+    print("paths", audio_prompt, audio_p_path, audio_prompt[1].shape, audio_prompt[1].max(), audio_prompt[1].min(), type(audio_prompt[1]))
     if video.startswith("http"):
         data = requests.get(video, timeout=60).content
     else:
         shutil.copy(video, video_path)
+    if isinstance(audio_prompt, tuple):
+        sr, data = audio_prompt
+        torchaudio.save(audio_p_path, torch.from_numpy(data.reshape(1,-1)/32768.0), sr)
+    elif audio_prompt.startswith("http"):
+        data = requests.get(audio_prompt, timeout=60).content
+        with open(audio_p_path, "wb") as fw:
+            fw.write(data)
+    else:
+        shutil.copy(audio_prompt, audio_p_path)
     if prompt == "":
+        command = "cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --calc_energy 1" % (output_dir, video_path)
     else:
+        command = "cd ./MMAudio; python ./demo.py --variant small_44k --output %s --video %s --prompt %s --calc_energy 1" % (output_dir, video_path, prompt)
+    print("v2a command", command)
+    os.system(command)
+    command = "python ./F5-TTS/src/f5_tts/infer/infer_cli_test.py --output_dir %s --start 0 --end 1 --ckpt_file ./F5-TTS/ckpts/v2c/v2c_s44.pt --v2a_path %s --wav_p %s --txt_p \"%s\" --video %s --txt \"%s\"" % (output_dir, output_dir, audio_p_path, text_prompt, video_save_path, text)
+    print("v2s command", command)
+    os.system(command)
+    video_gen = output_dir + "/videos/gen/0001"
+    return video_save_path, video_gen
+video_to_audio_and_speech_tab = gr.Interface(
+    fn=video_to_audio_and_speech,
     description="""
     Project page: <a href="https://acappemin.github.io/DeepAudio-V1.github.io">https://acappemin.github.io/DeepAudio-V1.github.io</a><br>
     Code: <a href="https://github.com/acappemin/DeepAudio-V1">https://github.com/acappemin/DeepAudio-V1</a><br>
     """,
     inputs=[
+        gr.Video(label="Input Video"),
+        gr.Text(label='Video-to-Audio Text Prompt'),
+        gr.Text(label='Video-to-Speech Transcription'),
+        gr.Audio(label='Video-to-Speech Speech Prompt'),
+        gr.Text(label='Video-to-Speech Speech Prompt Transcription'),
+    ],
+    outputs=[
+        gr.Video(label="Video-to-Audio Output"),
+        gr.Video(label="Video-to-Speech Output"),
     ],
     cache_examples=False,
+    title='Video-to-Audio-and-Speech',
     examples=[
         [
             './tests/0235.mp4',
             '',
+            "Who finally decided to show up for work Yay",
+            './tests/Gobber-00-0778.wav',
+            "I've still got a few knocking around in here",
         ],
         [
             './tests/0778.mp4',
             '',
+            "I've still got a few knocking around in here",
+            './tests/Gobber-00-0235.wav',
+            "Who finally decided to show up for work Yay",
         ],
     ])
 if __name__ == "__main__":
+    gr.TabbedInterface([video_to_audio_and_speech_tab], ['Video-to-Audio-and-Speech']).launch(server_name='0.0.0.0', server_port=30459)