E2-F5-TTS

Running on Zero

App Files Community

M4xjunior committed on Dec 1, 2024

Commit

5f1f288

verified ·

1 Parent(s): 375ecba

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -9

app.py CHANGED Viewed

@@ -66,7 +66,7 @@ def load_f5tts():
     ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
     F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
-    return load_model(DiT, F5TTS_model_cfg, ckpt_path)
@@ -120,7 +120,7 @@ def generate_response(messages, model, tokenizer):
 @gpu_decorator
 def infer(
-    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
 ):
     ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
@@ -148,6 +148,7 @@ def infer(
         ema_model,
         vocoder,
         cross_fade_duration=cross_fade_duration,
         speed=speed,
         show_info=show_info,
         progress=gr.Progress(),
@@ -170,13 +171,7 @@ def infer(
 with gr.Blocks() as app_credits:
-    gr.Markdown("""
-# Credits
-* [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
-* [RootingInLoad](https://github.com/RootingInLoad) for initial chunk generation and podcast app exploration
-* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation & voice chat
-""")
 with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
@@ -209,6 +204,14 @@ with gr.Blocks() as app_tts:
             step=0.01,
             info="Set the duration of the cross-fade between audio clips.",
         )
     audio_output = gr.Audio(label="Synthesized Audio")
     spectrogram_output = gr.Image(label="Spectrogram")
@@ -230,6 +233,7 @@ with gr.Blocks() as app_tts:
             remove_silence,
             cross_fade_duration_slider,
             speed_slider,
         )
         return audio_out, spectrogram_path, gr.update(value=ref_text_out)
@@ -242,6 +246,7 @@ with gr.Blocks() as app_tts:
             remove_silence,
             cross_fade_duration_slider,
             speed_slider,
         ],
         outputs=[audio_output, spectrogram_output, ref_text_input],
     )

     ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
     F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
+    return load_model(DiT, F5TTS_model_cfg, ckpt_path, use_ema=True)
 @gpu_decorator
 def infer(
+    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info
 ):
     ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
         ema_model,
         vocoder,
         cross_fade_duration=cross_fade_duration,
+        nfe_step=nfe,
         speed=speed,
         show_info=show_info,
         progress=gr.Progress(),
 with gr.Blocks() as app_credits:
+     gr.Markdown("F5-TTS")
 with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
             step=0.01,
             info="Set the duration of the cross-fade between audio clips.",
         )
+        nfe_slider = gr.Slider(
+                    label="NFE",
+                    minimum=16,
+                    maximum=64,
+                    value=32,
+                    step=1,
+                    info="Ajuste NFE Step.",
+                )
     audio_output = gr.Audio(label="Synthesized Audio")
     spectrogram_output = gr.Image(label="Spectrogram")
             remove_silence,
             cross_fade_duration_slider,
             speed_slider,
+            nfe_slider,
         )
         return audio_out, spectrogram_path, gr.update(value=ref_text_out)
             remove_silence,
             cross_fade_duration_slider,
             speed_slider,
+            nfe_slider,
         ],
         outputs=[audio_output, spectrogram_output, ref_text_input],
     )