Update app.py
Browse files
app.py
CHANGED
@@ -66,7 +66,7 @@ def load_f5tts():
|
|
66 |
ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
|
67 |
|
68 |
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
|
69 |
-
return load_model(DiT, F5TTS_model_cfg, ckpt_path)
|
70 |
|
71 |
|
72 |
|
@@ -120,7 +120,7 @@ def generate_response(messages, model, tokenizer):
|
|
120 |
|
121 |
@gpu_decorator
|
122 |
def infer(
|
123 |
-
ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
|
124 |
):
|
125 |
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
|
126 |
|
@@ -148,6 +148,7 @@ def infer(
|
|
148 |
ema_model,
|
149 |
vocoder,
|
150 |
cross_fade_duration=cross_fade_duration,
|
|
|
151 |
speed=speed,
|
152 |
show_info=show_info,
|
153 |
progress=gr.Progress(),
|
@@ -170,13 +171,7 @@ def infer(
|
|
170 |
|
171 |
|
172 |
with gr.Blocks() as app_credits:
|
173 |
-
gr.Markdown("""
|
174 |
-
# Credits
|
175 |
-
|
176 |
-
* [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
|
177 |
-
* [RootingInLoad](https://github.com/RootingInLoad) for initial chunk generation and podcast app exploration
|
178 |
-
* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation & voice chat
|
179 |
-
""")
|
180 |
with gr.Blocks() as app_tts:
|
181 |
gr.Markdown("# Batched TTS")
|
182 |
ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
|
@@ -209,6 +204,14 @@ with gr.Blocks() as app_tts:
|
|
209 |
step=0.01,
|
210 |
info="Set the duration of the cross-fade between audio clips.",
|
211 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
|
213 |
audio_output = gr.Audio(label="Synthesized Audio")
|
214 |
spectrogram_output = gr.Image(label="Spectrogram")
|
@@ -230,6 +233,7 @@ with gr.Blocks() as app_tts:
|
|
230 |
remove_silence,
|
231 |
cross_fade_duration_slider,
|
232 |
speed_slider,
|
|
|
233 |
)
|
234 |
return audio_out, spectrogram_path, gr.update(value=ref_text_out)
|
235 |
|
@@ -242,6 +246,7 @@ with gr.Blocks() as app_tts:
|
|
242 |
remove_silence,
|
243 |
cross_fade_duration_slider,
|
244 |
speed_slider,
|
|
|
245 |
],
|
246 |
outputs=[audio_output, spectrogram_output, ref_text_input],
|
247 |
)
|
|
|
66 |
ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
|
67 |
|
68 |
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
|
69 |
+
return load_model(DiT, F5TTS_model_cfg, ckpt_path, use_ema=True)
|
70 |
|
71 |
|
72 |
|
|
|
120 |
|
121 |
@gpu_decorator
|
122 |
def infer(
|
123 |
+
ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info
|
124 |
):
|
125 |
ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
|
126 |
|
|
|
148 |
ema_model,
|
149 |
vocoder,
|
150 |
cross_fade_duration=cross_fade_duration,
|
151 |
+
nfe_step=nfe,
|
152 |
speed=speed,
|
153 |
show_info=show_info,
|
154 |
progress=gr.Progress(),
|
|
|
171 |
|
172 |
|
173 |
with gr.Blocks() as app_credits:
|
174 |
+
gr.Markdown("F5-TTS")
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
with gr.Blocks() as app_tts:
|
176 |
gr.Markdown("# Batched TTS")
|
177 |
ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
|
|
|
204 |
step=0.01,
|
205 |
info="Set the duration of the cross-fade between audio clips.",
|
206 |
)
|
207 |
+
nfe_slider = gr.Slider(
|
208 |
+
label="NFE",
|
209 |
+
minimum=16,
|
210 |
+
maximum=64,
|
211 |
+
value=32,
|
212 |
+
step=1,
|
213 |
+
info="Ajuste NFE Step.",
|
214 |
+
)
|
215 |
|
216 |
audio_output = gr.Audio(label="Synthesized Audio")
|
217 |
spectrogram_output = gr.Image(label="Spectrogram")
|
|
|
233 |
remove_silence,
|
234 |
cross_fade_duration_slider,
|
235 |
speed_slider,
|
236 |
+
nfe_slider,
|
237 |
)
|
238 |
return audio_out, spectrogram_path, gr.update(value=ref_text_out)
|
239 |
|
|
|
246 |
remove_silence,
|
247 |
cross_fade_duration_slider,
|
248 |
speed_slider,
|
249 |
+
nfe_slider,
|
250 |
],
|
251 |
outputs=[audio_output, spectrogram_output, ref_text_input],
|
252 |
)
|